diff --git a/deps/highway/include/hwy/aligned_allocator.h b/deps/highway/include/hwy/aligned_allocator.h deleted file mode 100644 index d0671a57..00000000 --- a/deps/highway/include/hwy/aligned_allocator.h +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ -#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ - -// Memory allocator with support for alignment and offsets. - -#include -#include - -#include "hwy/base.h" - -namespace hwy { - -// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which -// requires a literal. This matches typical L1 cache line sizes, which prevents -// false sharing. -#define HWY_ALIGNMENT 64 - -// Pointers to functions equivalent to malloc/free with an opaque void* passed -// to them. -using AllocPtr = void* (*)(void* opaque, size_t bytes); -using FreePtr = void (*)(void* opaque, void* memory); - -// Returns null or a pointer to at least `payload_size` (which can be zero) -// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and -// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain -// memory or malloc() if it is null. -HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size, - AllocPtr alloc_ptr, void* opaque_ptr); - -// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it -// must have been returned from a previous call to `AllocateAlignedBytes`. -// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if -// `free_ptr` function is null, uses the default free(). -HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, - FreePtr free_ptr, void* opaque_ptr); - -// Class that deletes the aligned pointer passed to operator() calling the -// destructor before freeing the pointer. This is equivalent to the -// std::default_delete but for aligned objects. For a similar deleter equivalent -// to free() for aligned memory see AlignedFreer(). -class AlignedDeleter { - public: - AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {} - AlignedDeleter(FreePtr free_ptr, void* opaque_ptr) - : free_(free_ptr), opaque_ptr_(opaque_ptr) {} - - template - void operator()(T* aligned_pointer) const { - return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_, - TypedArrayDeleter); - } - - private: - template - static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) { - size_t elems = size_in_bytes / sizeof(T); - for (size_t i = 0; i < elems; i++) { - // Explicitly call the destructor on each element. - (static_cast(ptr) + i)->~T(); - } - } - - // Function prototype that calls the destructor for each element in a typed - // array. TypeArrayDeleter would match this prototype. 
- using ArrayDeleter = void (*)(void* t_ptr, size_t t_size); - - HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer, - FreePtr free_ptr, - void* opaque_ptr, - ArrayDeleter deleter); - - FreePtr free_; - void* opaque_ptr_; -}; - -// Unique pointer to T with custom aligned deleter. This can be a single -// element U or an array of element if T is a U[]. The custom aligned deleter -// will call the destructor on U or each element of a U[] in the array case. -template -using AlignedUniquePtr = std::unique_ptr; - -// Aligned memory equivalent of make_unique using the custom allocators -// alloc/free with the passed `opaque` pointer. This function calls the -// constructor with the passed Args... and calls the destructor of the object -// when the AlignedUniquePtr is destroyed. -template -AlignedUniquePtr MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free, - void* opaque, Args&&... args) { - T* ptr = static_cast(AllocateAlignedBytes(sizeof(T), alloc, opaque)); - return AlignedUniquePtr(new (ptr) T(std::forward(args)...), - AlignedDeleter(free, opaque)); -} - -// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free -// functions. -template -AlignedUniquePtr MakeUniqueAligned(Args&&... args) { - T* ptr = static_cast(AllocateAlignedBytes( - sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr)); - return AlignedUniquePtr(new (ptr) T(std::forward(args)...), - AlignedDeleter()); -} - -// Helpers for array allocators (avoids overflow) -namespace detail { - -// Returns x such that 1u << x == n (if n is a power of two). -static inline constexpr size_t ShiftCount(size_t n) { - return (n <= 1) ? 0 : 1 + ShiftCount(n / 2); -} - -template -T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { - constexpr size_t size = sizeof(T); - - constexpr bool is_pow2 = (size & (size - 1)) == 0; - constexpr size_t bits = ShiftCount(size); - static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); - - const size_t bytes = is_pow2 ? items << bits : items * size; - const size_t check = is_pow2 ? bytes >> bits : bytes / size; - if (check != items) { - return nullptr; // overflowed - } - return static_cast(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); -} - -} // namespace detail - -// Aligned memory equivalent of make_unique for array types using the -// custom allocators alloc/free. This function calls the constructor with the -// passed Args... on every created item. The destructor of each element will be -// called when the AlignedUniquePtr is destroyed. -template -AlignedUniquePtr MakeUniqueAlignedArrayWithAlloc( - size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) { - T* ptr = detail::AllocateAlignedItems(items, alloc, opaque); - if (ptr != nullptr) { - for (size_t i = 0; i < items; i++) { - new (ptr + i) T(std::forward(args)...); - } - } - return AlignedUniquePtr(ptr, AlignedDeleter(free, opaque)); -} - -template -AlignedUniquePtr MakeUniqueAlignedArray(size_t items, Args&&... args) { - return MakeUniqueAlignedArrayWithAlloc( - items, nullptr, nullptr, nullptr, std::forward(args)...); -} - -// Custom deleter for std::unique_ptr equivalent to using free() as a deleter -// but for aligned memory. -class AlignedFreer { - public: - // Pass address of this to ctor to skip deleting externally-owned memory. 
- static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {} - - AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {} - AlignedFreer(FreePtr free_ptr, void* opaque_ptr) - : free_(free_ptr), opaque_ptr_(opaque_ptr) {} - - template - void operator()(T* aligned_pointer) const { - // TODO(deymo): assert that we are using a POD type T. - FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_); - } - - private: - FreePtr free_; - void* opaque_ptr_; -}; - -// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD -// data use AlignedUniquePtr. -template -using AlignedFreeUniquePtr = std::unique_ptr; - -// Allocate an aligned and uninitialized array of POD values as a unique_ptr. -// Upon destruction of the unique_ptr the aligned array will be freed. -template -AlignedFreeUniquePtr AllocateAligned(const size_t items, AllocPtr alloc, - FreePtr free, void* opaque) { - return AlignedFreeUniquePtr( - detail::AllocateAlignedItems(items, alloc, opaque), - AlignedFreer(free, opaque)); -} - -// Same as previous AllocateAligned(), using default allocate/free functions. -template -AlignedFreeUniquePtr AllocateAligned(const size_t items) { - return AllocateAligned(items, nullptr, nullptr, nullptr); -} - -} // namespace hwy -#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ diff --git a/deps/highway/include/hwy/base.h b/deps/highway/include/hwy/base.h deleted file mode 100644 index 8f1e161b..00000000 --- a/deps/highway/include/hwy/base.h +++ /dev/null @@ -1,1323 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_BASE_H_ -#define HIGHWAY_HWY_BASE_H_ - -// For SIMD module implementations and their callers, target-independent. - -// IWYU pragma: begin_exports -#include -#include - -#include "hwy/detect_compiler_arch.h" -#include "hwy/highway_export.h" - -// "IWYU pragma: keep" does not work for these includes, so hide from the IDE. 
-#if !HWY_IDE - -#if !defined(HWY_NO_LIBCXX) -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS // before inttypes.h -#endif -#include -#endif - -#if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC -#include -#endif - -#endif // !HWY_IDE - -// IWYU pragma: end_exports - -#if HWY_COMPILER_MSVC -#include // memcpy -#endif - -//------------------------------------------------------------------------------ -// Compiler-specific definitions - -#define HWY_STR_IMPL(macro) #macro -#define HWY_STR(macro) HWY_STR_IMPL(macro) - -#if HWY_COMPILER_MSVC - -#include - -#define HWY_RESTRICT __restrict -#define HWY_INLINE __forceinline -#define HWY_NOINLINE __declspec(noinline) -#define HWY_FLATTEN -#define HWY_NORETURN __declspec(noreturn) -#define HWY_LIKELY(expr) (expr) -#define HWY_UNLIKELY(expr) (expr) -#define HWY_PRAGMA(tokens) __pragma(tokens) -#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens)) -#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc) -#define HWY_MAYBE_UNUSED -#define HWY_HAS_ASSUME_ALIGNED 0 -#if (_MSC_VER >= 1700) -#define HWY_MUST_USE_RESULT _Check_return_ -#else -#define HWY_MUST_USE_RESULT -#endif - -#else - -#define HWY_RESTRICT __restrict__ -// force inlining without optimization enabled creates very inefficient code -// that can cause compiler timeout -#ifdef __OPTIMIZE__ -#define HWY_INLINE inline __attribute__((always_inline)) -#else -#define HWY_INLINE inline -#endif -#define HWY_NOINLINE __attribute__((noinline)) -#define HWY_FLATTEN __attribute__((flatten)) -#define HWY_NORETURN __attribute__((noreturn)) -#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1) -#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -#define HWY_PRAGMA(tokens) _Pragma(#tokens) -#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens) -#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc) -// Encountered "attribute list cannot appear here" when using the C++17 -// [[maybe_unused]], so only use the old style attribute for now. -#define HWY_MAYBE_UNUSED __attribute__((unused)) -#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result)) - -#endif // !HWY_COMPILER_MSVC - -//------------------------------------------------------------------------------ -// Builtin/attributes (no more #include after this point due to namespace!) - -namespace hwy { - -// Enables error-checking of format strings. -#if HWY_HAS_ATTRIBUTE(__format__) -#define HWY_FORMAT(idx_fmt, idx_arg) \ - __attribute__((__format__(__printf__, idx_fmt, idx_arg))) -#else -#define HWY_FORMAT(idx_fmt, idx_arg) -#endif - -// Returns a void* pointer which the compiler then assumes is N-byte aligned. -// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32); -// -// The assignment semantics are required by GCC/Clang. ICC provides an in-place -// __assume_aligned, whereas MSVC's __assume appears unsuitable. -#if HWY_HAS_BUILTIN(__builtin_assume_aligned) -#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align)) -#else -#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */ -#endif - -// Clang and GCC require attributes on each function into which SIMD intrinsics -// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and -// automatic annotation via pragmas. -#if HWY_COMPILER_ICC -// As of ICC 2021.{1-9} the pragma is neither implemented nor required. 
-#define HWY_PUSH_ATTRIBUTES(targets_str) -#define HWY_POP_ATTRIBUTES -#elif HWY_COMPILER_CLANG -#define HWY_PUSH_ATTRIBUTES(targets_str) \ - HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \ - apply_to = function)) -#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop) -#elif HWY_COMPILER_GCC_ACTUAL -#define HWY_PUSH_ATTRIBUTES(targets_str) \ - HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str) -#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options) -#else -#define HWY_PUSH_ATTRIBUTES(targets_str) -#define HWY_POP_ATTRIBUTES -#endif - -//------------------------------------------------------------------------------ -// Macros - -#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED - -#define HWY_CONCAT_IMPL(a, b) a##b -#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b) - -#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b)) -#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b)) - -#if HWY_COMPILER_GCC_ACTUAL -// nielskm: GCC does not support '#pragma GCC unroll' without the factor. -#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor) -#define HWY_DEFAULT_UNROLL HWY_UNROLL(4) -#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX -#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor) -#define HWY_DEFAULT_UNROLL HWY_UNROLL() -#else -#define HWY_UNROLL(factor) -#define HWY_DEFAULT_UNROLL -#endif - -// Tell a compiler that the expression always evaluates to true. -// The expression should be free from any side effects. -// Some older compilers may have trouble with complex expressions, therefore -// it is advisable to split multiple conditions into separate assume statements, -// and manually check the generated code. -// OK but could fail: -// HWY_ASSUME(x == 2 && y == 3); -// Better: -// HWY_ASSUME(x == 2); -// HWY_ASSUME(y == 3); -#if HWY_HAS_CPP_ATTRIBUTE(assume) -#define HWY_ASSUME(expr) [[assume(expr)]] -#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC -#define HWY_ASSUME(expr) __assume(expr) -// __builtin_assume() was added in clang 3.6. -#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume) -#define HWY_ASSUME(expr) __builtin_assume(expr) -// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added -// later, so check for the compiler version directly. -#elif HWY_COMPILER_GCC_ACTUAL >= 405 -#define HWY_ASSUME(expr) \ - ((expr) ? static_cast(0) : __builtin_unreachable()) -#else -#define HWY_ASSUME(expr) static_cast(0) -#endif - -// Compile-time fence to prevent undesirable code reordering. On Clang x86, the -// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence -// does, without generating code. -#if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX) -#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel) -#else -// TODO(janwas): investigate alternatives. On Arm, the above generates barriers. -#define HWY_FENCE -#endif - -// 4 instances of a given literal value, useful as input to LoadDup128. -#define HWY_REP4(literal) literal, literal, literal, literal - -HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) - Abort(const char* file, int line, const char* format, ...); - -#define HWY_ABORT(format, ...) \ - ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__) - -// Always enabled. 
-#define HWY_ASSERT(condition) \ - do { \ - if (!(condition)) { \ - HWY_ABORT("Assert %s", #condition); \ - } \ - } while (0) - -#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER) -#define HWY_IS_MSAN 1 -#else -#define HWY_IS_MSAN 0 -#endif - -#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER) -#define HWY_IS_ASAN 1 -#else -#define HWY_IS_ASAN 0 -#endif - -#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER) -#define HWY_IS_TSAN 1 -#else -#define HWY_IS_TSAN 0 -#endif - -// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo. -// You can disable MSAN by adding this attribute to the function that fails. -#if HWY_IS_MSAN -#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory)) -#else -#define HWY_ATTR_NO_MSAN -#endif - -// For enabling HWY_DASSERT and shortening tests in slower debug builds -#if !defined(HWY_IS_DEBUG_BUILD) -// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent -// MSVC defines NDEBUG (if not, could instead check _DEBUG). -#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \ - HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__) -#define HWY_IS_DEBUG_BUILD 1 -#else -#define HWY_IS_DEBUG_BUILD 0 -#endif -#endif // HWY_IS_DEBUG_BUILD - -#if HWY_IS_DEBUG_BUILD -#define HWY_DASSERT(condition) HWY_ASSERT(condition) -#else -#define HWY_DASSERT(condition) \ - do { \ - } while (0) -#endif - -//------------------------------------------------------------------------------ -// CopyBytes / ZeroBytes - -#if HWY_COMPILER_MSVC -#pragma intrinsic(memcpy) -#pragma intrinsic(memset) -#endif - -// The source/destination must not overlap/alias. -template -HWY_API void CopyBytes(const From* from, To* to) { -#if HWY_COMPILER_MSVC - memcpy(to, from, kBytes); -#else - __builtin_memcpy(static_cast(to), static_cast(from), - kBytes); -#endif -} - -HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to, - size_t num_of_bytes_to_copy) { -#if HWY_COMPILER_MSVC - memcpy(to, from, num_of_bytes_to_copy); -#else - __builtin_memcpy(to, from, num_of_bytes_to_copy); -#endif -} - -// Same as CopyBytes, but for same-sized objects; avoids a size argument. -template -HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) { - static_assert(sizeof(From) == sizeof(To), ""); - CopyBytes(from, to); -} - -template -HWY_API void ZeroBytes(To* to) { -#if HWY_COMPILER_MSVC - memset(to, 0, kBytes); -#else - __builtin_memset(to, 0, kBytes); -#endif -} - -HWY_API void ZeroBytes(void* to, size_t num_bytes) { -#if HWY_COMPILER_MSVC - memset(to, 0, num_bytes); -#else - __builtin_memset(to, 0, num_bytes); -#endif -} - -//------------------------------------------------------------------------------ -// kMaxVectorSize (undocumented, pending removal) - -#if HWY_ARCH_X86 -static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512 -#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \ - __riscv_v_intrinsic >= 11000 -// Not actually an upper bound on the size. -static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096; -#else -static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16; -#endif - -//------------------------------------------------------------------------------ -// Alignment - -// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays -// should be allocated dynamically via aligned_allocator.h because Lanes() may -// exceed the stack size. 
-#if HWY_ARCH_X86 -#define HWY_ALIGN_MAX alignas(64) -#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \ - __riscv_v_intrinsic >= 11000 -#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned -#else -#define HWY_ALIGN_MAX alignas(16) -#endif - -//------------------------------------------------------------------------------ -// Lane types - -// float16_t load/store/conversion intrinsics are always supported on Armv8 and -// VFPv4 (except with MSVC). On Armv7 Clang requires __ARM_FP & 2; GCC requires -// -mfp16-format=ieee. -#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) || \ - (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \ - (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE)) -#define HWY_NEON_HAVE_FLOAT16C 1 -#else -#define HWY_NEON_HAVE_FLOAT16C 0 -#endif - -// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available: -// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip. -#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16) -#define HWY_SVE_HAVE_BFLOAT16 1 -#else -#define HWY_SVE_HAVE_BFLOAT16 0 -#endif - -// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name -// by concatenating base type and bits. - -// 1) ACLE's __fp16 -#if HWY_NEON_HAVE_FLOAT16C -using float16_t = __fp16; -// 2) C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets. -// Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with -// sufficiently new compiler supporting avx512fp16). Do not use on clang-cl, -// which is missing __extendhfsf2. -#elif ( \ - (HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \ - (HWY_ARCH_X86 && ((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) || \ - HWY_COMPILER_GCC_ACTUAL >= 1200))) -using float16_t = _Float16; -// 3) Otherwise emulate -#else -#define HWY_EMULATE_FLOAT16 -#pragma pack(push, 1) -struct float16_t { - uint16_t bits; -}; -#pragma pack(pop) -#endif // float16_t - -#if HWY_SVE_HAVE_BFLOAT16 -using bfloat16_t = __bf16; -#else -#pragma pack(push, 1) -struct bfloat16_t { - uint16_t bits; -}; -#pragma pack(pop) -#endif // bfloat16_t - -HWY_API float F32FromF16(float16_t f16) { -#ifdef HWY_EMULATE_FLOAT16 - uint16_t bits16; - CopySameSize(&f16, &bits16); - const uint32_t sign = static_cast(bits16 >> 15); - const uint32_t biased_exp = (bits16 >> 10) & 0x1F; - const uint32_t mantissa = bits16 & 0x3FF; - - // Subnormal or zero - if (biased_exp == 0) { - const float subnormal = - (1.0f / 16384) * (static_cast(mantissa) * (1.0f / 1024)); - return sign ? -subnormal : subnormal; - } - - // Normalized: convert the representation directly (faster than ldexp/tables). - const uint32_t biased_exp32 = biased_exp + (127 - 15); - const uint32_t mantissa32 = mantissa << (23 - 10); - const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32; - - float result; - CopySameSize(&bits32, &result); - return result; -#else - return static_cast(f16); -#endif -} - -HWY_API float16_t F16FromF32(float f32) { -#ifdef HWY_EMULATE_FLOAT16 - uint32_t bits32; - CopySameSize(&f32, &bits32); - const uint32_t sign = bits32 >> 31; - const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF; - const uint32_t mantissa32 = bits32 & 0x7FFFFF; - - const int32_t exp = HWY_MIN(static_cast(biased_exp32) - 127, 15); - - // Tiny or zero => zero. 
- float16_t out; - if (exp < -24) { - // restore original sign - const uint16_t bits = static_cast(sign << 15); - CopySameSize(&bits, &out); - return out; - } - - uint32_t biased_exp16, mantissa16; - - // exp = [-24, -15] => subnormal - if (exp < -14) { - biased_exp16 = 0; - const uint32_t sub_exp = static_cast(-14 - exp); - HWY_DASSERT(1 <= sub_exp && sub_exp < 11); - mantissa16 = static_cast((1u << (10 - sub_exp)) + - (mantissa32 >> (13 + sub_exp))); - } else { - // exp = [-14, 15] - biased_exp16 = static_cast(exp + 15); - HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31); - mantissa16 = mantissa32 >> 13; - } - - HWY_DASSERT(mantissa16 < 1024); - const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16; - HWY_DASSERT(bits16 < 0x10000); - const uint16_t narrowed = static_cast(bits16); // big-endian safe - CopySameSize(&narrowed, &out); - return out; -#else - return static_cast(f32); -#endif -} - -HWY_API float F32FromBF16(bfloat16_t bf) { - uint16_t bits16; - CopyBytes<2>(&bf, &bits16); - uint32_t bits = bits16; - bits <<= 16; - float f; - CopySameSize(&bits, &f); - return f; -} - -HWY_API float F32FromF16Mem(const void* ptr) { - float16_t f16; - CopyBytes<2>(ptr, &f16); - return F32FromF16(f16); -} - -HWY_API float F32FromBF16Mem(const void* ptr) { - bfloat16_t bf; - CopyBytes<2>(ptr, &bf); - return F32FromBF16(bf); -} - -HWY_API bfloat16_t BF16FromF32(float f) { - uint32_t bits; - CopySameSize(&f, &bits); - const uint16_t bits16 = static_cast(bits >> 16); - bfloat16_t bf; - CopySameSize(&bits16, &bf); - return bf; -} - -using float32_t = float; -using float64_t = double; - -#pragma pack(push, 1) - -// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it: -// https://reviews.llvm.org/D86310 -struct alignas(16) uint128_t { - uint64_t lo; // little-endian layout - uint64_t hi; -}; - -// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key -// field is to be compared (Lt128Upper instead of Lt128). -struct alignas(16) K64V64 { - uint64_t value; // little-endian layout - uint64_t key; -}; - -// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier -// than when considering both to be a 64-bit key. -struct alignas(8) K32V32 { - uint32_t value; // little-endian layout - uint32_t key; -}; - -#pragma pack(pop) - -#ifdef HWY_EMULATE_FLOAT16 - -static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a, - const float16_t& b) { - return F32FromF16(a) < F32FromF16(b); -} -// Required for std::greater. -static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a, - const float16_t& b) { - return F32FromF16(a) > F32FromF16(b); -} -static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a, - const float16_t& b) { - return F32FromF16(a) == F32FromF16(b); -} - -#endif // HWY_EMULATE_FLOAT16 - -static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a, - const uint128_t& b) { - return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi; -} -// Required for std::greater. -static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a, - const uint128_t& b) { - return b < a; -} -static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a, - const uint128_t& b) { - return a.lo == b.lo && a.hi == b.hi; -} - -static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a, - const K64V64& b) { - return a.key < b.key; -} -// Required for std::greater. 
-static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a, - const K64V64& b) { - return b < a; -} -static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a, - const K64V64& b) { - return a.key == b.key; -} - -static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a, - const K32V32& b) { - return a.key < b.key; -} -// Required for std::greater. -static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a, - const K32V32& b) { - return b < a; -} -static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a, - const K32V32& b) { - return a.key == b.key; -} - -//------------------------------------------------------------------------------ -// Controlling overload resolution (SFINAE) - -template -struct EnableIfT {}; -template <> -struct EnableIfT { - using type = void; -}; - -template -using EnableIf = typename EnableIfT::type; - -template -struct IsSameT { - enum { value = 0 }; -}; - -template -struct IsSameT { - enum { value = 1 }; -}; - -template -HWY_API constexpr bool IsSame() { - return IsSameT::value; -} - -template -struct IfT { - using type = Then; -}; - -template -struct IfT { - using type = Else; -}; - -template -using If = typename IfT::type; - -// Insert into template/function arguments to enable this overload only for -// vectors of exactly, at most (LE), or more than (GT) this many bytes. -// -// As an example, checking for a total size of 16 bytes will match both -// Simd and Simd. -#define HWY_IF_V_SIZE(T, kN, bytes) \ - hwy::EnableIf* = nullptr -#define HWY_IF_V_SIZE_LE(T, kN, bytes) \ - hwy::EnableIf* = nullptr -#define HWY_IF_V_SIZE_GT(T, kN, bytes) \ - hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr - -#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr -#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr -#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr - -#define HWY_IF_UNSIGNED(T) hwy::EnableIf()>* = nullptr -#define HWY_IF_SIGNED(T) \ - hwy::EnableIf() && !IsFloat() && !IsSpecialFloat()>* = \ - nullptr -#define HWY_IF_FLOAT(T) hwy::EnableIf()>* = nullptr -#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf()>* = nullptr -#define HWY_IF_FLOAT3264(T) hwy::EnableIf()>* = nullptr -#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf()>* = nullptr -#define HWY_IF_SPECIAL_FLOAT(T) \ - hwy::EnableIf()>* = nullptr -#define HWY_IF_NOT_SPECIAL_FLOAT(T) \ - hwy::EnableIf()>* = nullptr -#define HWY_IF_FLOAT_OR_SPECIAL(T) \ - hwy::EnableIf() || hwy::IsSpecialFloat()>* = nullptr -#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \ - hwy::EnableIf() && !hwy::IsSpecialFloat()>* = nullptr - -#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf* = nullptr -#define HWY_IF_NOT_T_SIZE(T, bytes) \ - hwy::EnableIf* = nullptr -// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds -// too similar. If you want the opposite of this (2 or 4 bytes), ask for those -// bits explicitly (0x14) instead of attempting to 'negate' 0x102. -#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \ - hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr - -// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double -// overloads. -#define HWY_IF_UI16(T) \ - hwy::EnableIf() || IsSame()>* = nullptr -#define HWY_IF_UI32(T) \ - hwy::EnableIf() || IsSame()>* = nullptr -#define HWY_IF_UI64(T) \ - hwy::EnableIf() || IsSame()>* = nullptr - -#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \ - hwy::EnableIf* = nullptr - -// Empty struct used as a size tag type. 
-template -struct SizeTag {}; - -template -struct RemoveConstT { - using type = T; -}; -template -struct RemoveConstT { - using type = T; -}; - -template -using RemoveConst = typename RemoveConstT::type; - -template -struct RemoveRefT { - using type = T; -}; -template -struct RemoveRefT { - using type = T; -}; -template -struct RemoveRefT { - using type = T; -}; - -template -using RemoveRef = typename RemoveRefT::type; - -//------------------------------------------------------------------------------ -// Type relations - -namespace detail { - -template -struct Relations; -template <> -struct Relations { - using Unsigned = uint8_t; - using Signed = int8_t; - using Wide = uint16_t; - enum { is_signed = 0, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint8_t; - using Signed = int8_t; - using Wide = int16_t; - enum { is_signed = 1, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint16_t; - using Signed = int16_t; - using Float = float16_t; - using Wide = uint32_t; - using Narrow = uint8_t; - enum { is_signed = 0, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint16_t; - using Signed = int16_t; - using Float = float16_t; - using Wide = int32_t; - using Narrow = int8_t; - enum { is_signed = 1, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint32_t; - using Signed = int32_t; - using Float = float; - using Wide = uint64_t; - using Narrow = uint16_t; - enum { is_signed = 0, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint32_t; - using Signed = int32_t; - using Float = float; - using Wide = int64_t; - using Narrow = int16_t; - enum { is_signed = 1, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint64_t; - using Signed = int64_t; - using Float = double; - using Wide = uint128_t; - using Narrow = uint32_t; - enum { is_signed = 0, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint64_t; - using Signed = int64_t; - using Float = double; - using Narrow = int32_t; - enum { is_signed = 1, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint128_t; - using Narrow = uint64_t; - enum { is_signed = 0, is_float = 0, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint16_t; - using Signed = int16_t; - using Float = float16_t; - using Wide = float; - enum { is_signed = 1, is_float = 1, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint16_t; - using Signed = int16_t; - using Wide = float; - enum { is_signed = 1, is_float = 1, is_bf16 = 1 }; -}; -template <> -struct Relations { - using Unsigned = uint32_t; - using Signed = int32_t; - using Float = float; - using Wide = double; - using Narrow = float16_t; - enum { is_signed = 1, is_float = 1, is_bf16 = 0 }; -}; -template <> -struct Relations { - using Unsigned = uint64_t; - using Signed = int64_t; - using Float = double; - using Narrow = float; - enum { is_signed = 1, is_float = 1, is_bf16 = 0 }; -}; - -template -struct TypeFromSize; -template <> -struct TypeFromSize<1> { - using Unsigned = uint8_t; - using Signed = int8_t; -}; -template <> -struct TypeFromSize<2> { - using Unsigned = uint16_t; - using Signed = int16_t; - using Float = float16_t; -}; -template <> -struct TypeFromSize<4> { - using Unsigned = uint32_t; - using Signed = int32_t; - using Float = float; -}; -template <> -struct TypeFromSize<8> { - 
using Unsigned = uint64_t; - using Signed = int64_t; - using Float = double; -}; -template <> -struct TypeFromSize<16> { - using Unsigned = uint128_t; -}; - -} // namespace detail - -// Aliases for types of a different category, but the same size. -template -using MakeUnsigned = typename detail::Relations::Unsigned; -template -using MakeSigned = typename detail::Relations::Signed; -template -using MakeFloat = typename detail::Relations::Float; - -// Aliases for types of the same category, but different size. -template -using MakeWide = typename detail::Relations::Wide; -template -using MakeNarrow = typename detail::Relations::Narrow; - -// Obtain type from its size [bytes]. -template -using UnsignedFromSize = typename detail::TypeFromSize::Unsigned; -template -using SignedFromSize = typename detail::TypeFromSize::Signed; -template -using FloatFromSize = typename detail::TypeFromSize::Float; - -// Avoid confusion with SizeTag where the parameter is a lane size. -using UnsignedTag = SizeTag<0>; -using SignedTag = SizeTag<0x100>; // integer -using FloatTag = SizeTag<0x200>; -using SpecialTag = SizeTag<0x300>; - -template > -constexpr auto TypeTag() - -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> { - return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>(); -} - -// For when we only want to distinguish FloatTag from everything else. -using NonFloatTag = SizeTag<0x400>; - -template > -constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> { - return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>(); -} - -//------------------------------------------------------------------------------ -// Type traits - -template -HWY_API constexpr bool IsFloat3264() { - return IsSame() || IsSame(); -} - -template -HWY_API constexpr bool IsFloat() { - // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or - // from a float, not compared. Include float16_t in case HWY_HAVE_FLOAT16=1. - return IsSame() || IsFloat3264(); -} - -// These types are often special-cased and not supported in all ops. -template -HWY_API constexpr bool IsSpecialFloat() { - return IsSame() || IsSame(); -} - -template -HWY_API constexpr bool IsSigned() { - return T(0) > T(-1); -} -template <> -constexpr bool IsSigned() { - return true; -} -template <> -constexpr bool IsSigned() { - return true; -} - -// Largest/smallest representable integer values. -template -HWY_API constexpr T LimitsMax() { - static_assert(!IsFloat(), "Only for integer types"); - using TU = MakeUnsigned; - return static_cast(IsSigned() ? (static_cast(~0ull) >> 1) - : static_cast(~0ull)); -} -template -HWY_API constexpr T LimitsMin() { - static_assert(!IsFloat(), "Only for integer types"); - return IsSigned() ? T(-1) - LimitsMax() : T(0); -} - -// Largest/smallest representable value (integer or float). This naming avoids -// confusion with numeric_limits::min() (the smallest positive value). -// Cannot be constexpr because we use CopySameSize for [b]float16_t. 
-template -HWY_API T LowestValue() { - return LimitsMin(); -} -template <> -HWY_INLINE bfloat16_t LowestValue() { - const uint16_t kBits = 0xFF7F; // -1.1111111 x 2^127 - bfloat16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float16_t LowestValue() { - const uint16_t kBits = 0xFBFF; // -1.1111111111 x 2^15 - float16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float LowestValue() { - return -3.402823466e+38F; -} -template <> -HWY_INLINE double LowestValue() { - return -1.7976931348623158e+308; -} - -template -HWY_API T HighestValue() { - return LimitsMax(); -} -template <> -HWY_INLINE bfloat16_t HighestValue() { - const uint16_t kBits = 0x7F7F; // 1.1111111 x 2^127 - bfloat16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float16_t HighestValue() { - const uint16_t kBits = 0x7BFF; // 1.1111111111 x 2^15 - float16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float HighestValue() { - return 3.402823466e+38F; -} -template <> -HWY_INLINE double HighestValue() { - return 1.7976931348623158e+308; -} - -// Difference between 1.0 and the next representable value. Equal to -// 1 / (1ULL << MantissaBits()), but hard-coding ensures precision. -template -HWY_API T Epsilon() { - return 1; -} -template <> -HWY_INLINE bfloat16_t Epsilon() { - const uint16_t kBits = 0x3C00; // 0.0078125 - bfloat16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float16_t Epsilon() { - const uint16_t kBits = 0x1400; // 0.0009765625 - float16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float Epsilon() { - return 1.192092896e-7f; -} -template <> -HWY_INLINE double Epsilon() { - return 2.2204460492503131e-16; -} - -// Returns width in bits of the mantissa field in IEEE binary16/32/64. -template -constexpr int MantissaBits() { - static_assert(sizeof(T) == 0, "Only instantiate the specializations"); - return 0; -} -template <> -constexpr int MantissaBits() { - return 7; -} -template <> -constexpr int MantissaBits() { - return 10; -} -template <> -constexpr int MantissaBits() { - return 23; -} -template <> -constexpr int MantissaBits() { - return 52; -} - -// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation with -// the largest possible (biased) exponent field. Used by IsInf. -template -constexpr MakeSigned MaxExponentTimes2() { - return -(MakeSigned{1} << (MantissaBits() + 1)); -} - -// Returns bitmask of the sign bit in IEEE binary16/32/64. -template -constexpr MakeUnsigned SignMask() { - return MakeUnsigned{1} << (sizeof(T) * 8 - 1); -} - -// Returns bitmask of the exponent field in IEEE binary16/32/64. -template -constexpr MakeUnsigned ExponentMask() { - return (~(MakeUnsigned{1} << MantissaBits()) + 1) & ~SignMask(); -} - -// Returns bitmask of the mantissa field in IEEE binary16/32/64. -template -constexpr MakeUnsigned MantissaMask() { - return (MakeUnsigned{1} << MantissaBits()) - 1; -} - -// Returns 1 << mantissa_bits as a floating-point number. All integers whose -// absolute value are less than this can be represented exactly. 
-template -HWY_INLINE T MantissaEnd() { - static_assert(sizeof(T) == 0, "Only instantiate the specializations"); - return 0; -} -template <> -HWY_INLINE bfloat16_t MantissaEnd() { - const uint16_t kBits = 0x4300; // 1.0 x 2^7 - bfloat16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float16_t MantissaEnd() { - const uint16_t kBits = 0x6400; // 1.0 x 2^10 - float16_t ret; - CopySameSize(&kBits, &ret); - return ret; -} -template <> -HWY_INLINE float MantissaEnd() { - return 8388608.0f; // 1 << 23 -} -template <> -HWY_INLINE double MantissaEnd() { - // floating point literal with p52 requires C++17. - return 4503599627370496.0; // 1 << 52 -} - -// Returns width in bits of the exponent field in IEEE binary16/32/64. -template -constexpr int ExponentBits() { - // Exponent := remaining bits after deducting sign and mantissa. - return 8 * sizeof(T) - 1 - MantissaBits(); -} - -// Returns largest value of the biased exponent field in IEEE binary16/32/64, -// right-shifted so that the LSB is bit zero. Example: 0xFF for float. -// This is expressed as a signed integer for more efficient comparison. -template -constexpr MakeSigned MaxExponentField() { - return (MakeSigned{1} << ExponentBits()) - 1; -} - -//------------------------------------------------------------------------------ -// Helper functions - -template -constexpr inline T1 DivCeil(T1 a, T2 b) { - return (a + b - 1) / b; -} - -// Works for any `align`; if a power of two, compiler emits ADD+AND. -constexpr inline size_t RoundUpTo(size_t what, size_t align) { - return DivCeil(what, align) * align; -} - -// Undefined results for x == 0. -HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) { -#if HWY_COMPILER_MSVC - unsigned long index; // NOLINT - _BitScanForward(&index, x); - return index; -#else // HWY_COMPILER_MSVC - return static_cast(__builtin_ctz(x)); -#endif // HWY_COMPILER_MSVC -} - -HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) { -#if HWY_COMPILER_MSVC -#if HWY_ARCH_X86_64 - unsigned long index; // NOLINT - _BitScanForward64(&index, x); - return index; -#else // HWY_ARCH_X86_64 - // _BitScanForward64 not available - uint32_t lsb = static_cast(x & 0xFFFFFFFF); - unsigned long index; // NOLINT - if (lsb == 0) { - uint32_t msb = static_cast(x >> 32u); - _BitScanForward(&index, msb); - return 32 + index; - } else { - _BitScanForward(&index, lsb); - return index; - } -#endif // HWY_ARCH_X86_64 -#else // HWY_COMPILER_MSVC - return static_cast(__builtin_ctzll(x)); -#endif // HWY_COMPILER_MSVC -} - -// Undefined results for x == 0. 
-HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) { -#if HWY_COMPILER_MSVC - unsigned long index; // NOLINT - _BitScanReverse(&index, x); - return 31 - index; -#else // HWY_COMPILER_MSVC - return static_cast(__builtin_clz(x)); -#endif // HWY_COMPILER_MSVC -} - -HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) { -#if HWY_COMPILER_MSVC -#if HWY_ARCH_X86_64 - unsigned long index; // NOLINT - _BitScanReverse64(&index, x); - return 63 - index; -#else // HWY_ARCH_X86_64 - // _BitScanReverse64 not available - const uint32_t msb = static_cast(x >> 32u); - unsigned long index; // NOLINT - if (msb == 0) { - const uint32_t lsb = static_cast(x & 0xFFFFFFFF); - _BitScanReverse(&index, lsb); - return 63 - index; - } else { - _BitScanReverse(&index, msb); - return 31 - index; - } -#endif // HWY_ARCH_X86_64 -#else // HWY_COMPILER_MSVC - return static_cast(__builtin_clzll(x)); -#endif // HWY_COMPILER_MSVC -} - -HWY_API size_t PopCount(uint64_t x) { -#if HWY_COMPILER_GCC // includes clang - return static_cast(__builtin_popcountll(x)); - // This instruction has a separate feature flag, but is often called from - // non-SIMD code, so we don't want to require dynamic dispatch. It was first - // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro - // for AVX, so check for that. -#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__) - return _mm_popcnt_u64(x); -#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__) - return _mm_popcnt_u32(static_cast(x & 0xFFFFFFFFu)) + - _mm_popcnt_u32(static_cast(x >> 32)); -#else - x -= ((x >> 1) & 0x5555555555555555ULL); - x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL)); - x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL); - x += (x >> 8); - x += (x >> 16); - x += (x >> 32); - return static_cast(x & 0x7Fu); -#endif -} - -// Skip HWY_API due to GCC "function not considered for inlining". Previously -// such errors were caused by underlying type mismatches, but it's not clear -// what is still mismatched despite all the casts. -template -/*HWY_API*/ constexpr size_t FloorLog2(TI x) { - return x == TI{1} - ? 0 - : static_cast(FloorLog2(static_cast(x >> 1)) + 1); -} - -template -/*HWY_API*/ constexpr size_t CeilLog2(TI x) { - return x == TI{1} - ? 
0 - : static_cast(FloorLog2(static_cast(x - 1)) + 1); -} - -template -HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) { - return t + static_cast(n); -} - -template -HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t, - size_t n) { - using TU = MakeUnsigned; - return static_cast( - static_cast(static_cast(t) + static_cast(n)) & - hwy::LimitsMax()); -} - -#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64 -#pragma intrinsic(_umul128) -#endif - -// 64 x 64 = 128 bit multiplication -HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) { -#if defined(__SIZEOF_INT128__) - __uint128_t product = (__uint128_t)a * (__uint128_t)b; - *upper = (uint64_t)(product >> 64); - return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL); -#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 - return _umul128(a, b, upper); -#else - constexpr uint64_t kLo32 = 0xFFFFFFFFU; - const uint64_t lo_lo = (a & kLo32) * (b & kLo32); - const uint64_t hi_lo = (a >> 32) * (b & kLo32); - const uint64_t lo_hi = (a & kLo32) * (b >> 32); - const uint64_t hi_hi = (a >> 32) * (b >> 32); - const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi; - *upper = (hi_lo >> 32) + (t >> 32) + hi_hi; - return (t << 32) | (lo_lo & kLo32); -#endif -} - -// Prevents the compiler from eliding the computations that led to "output". -template -HWY_API void PreventElision(T&& output) { -#if HWY_COMPILER_MSVC - // MSVC does not support inline assembly anymore (and never supported GCC's - // RTL constraints). Self-assignment with #pragma optimize("off") might be - // expected to prevent elision, but it does not with MSVC 2015. Type-punning - // with volatile pointers generates inefficient code on MSVC 2017. - static std::atomic> dummy; - dummy.store(output, std::memory_order_relaxed); -#else - // Works by indicating to the compiler that "output" is being read and - // modified. The +r constraint avoids unnecessary writes to memory, but only - // works for built-in types (typically FuncOutput). - asm volatile("" : "+r"(output) : : "memory"); -#endif -} - -} // namespace hwy - -#endif // HIGHWAY_HWY_BASE_H_ diff --git a/deps/highway/include/hwy/cache_control.h b/deps/highway/include/hwy/cache_control.h deleted file mode 100644 index 6e7665dd..00000000 --- a/deps/highway/include/hwy/cache_control.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_ -#define HIGHWAY_HWY_CACHE_CONTROL_H_ - -#include "hwy/base.h" - -// Requires SSE2; fails to compile on 32-bit Clang 7 (see -// https://github.com/gperftools/gperftools/issues/946). -#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32) -#undef HWY_DISABLE_CACHE_CONTROL -#define HWY_DISABLE_CACHE_CONTROL -#endif - -// intrin.h is sufficient on MSVC and already included by base.h. 
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC -#include // SSE2 -#include // _mm_prefetch -#endif - -namespace hwy { - -// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size. -#define HWY_STREAM_MULTIPLE 16 - -// The following functions may also require an attribute. -#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC -#define HWY_ATTR_CACHE __attribute__((target("sse2"))) -#else -#define HWY_ATTR_CACHE -#endif - -// Windows.h #defines this, which causes infinite recursion. Temporarily -// undefine to avoid conflict with our function. -// TODO(janwas): remove when this function is removed. -#pragma push_macro("LoadFence") -#undef LoadFence - -// Delays subsequent loads until prior loads are visible. Beware of potentially -// differing behavior across architectures and vendors: on Intel but not -// AMD CPUs, also serves as a full fence (waits for all prior instructions to -// complete). -HWY_INLINE HWY_ATTR_CACHE void LoadFence() { -#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) - _mm_lfence(); -#endif -} - -// TODO(janwas): remove when this function is removed. (See above.) -#pragma pop_macro("LoadFence") - -// Ensures values written by previous `Stream` calls are visible on the current -// core. This is NOT sufficient for synchronizing across cores; when `Stream` -// outputs are to be consumed by other core(s), the producer must publish -// availability (e.g. via mutex or atomic_flag) after `FlushStream`. -HWY_INLINE HWY_ATTR_CACHE void FlushStream() { -#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) - _mm_sfence(); -#endif -} - -// Optionally begins loading the cache line containing "p" to reduce latency of -// subsequent actual loads. -template -HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) { -#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) - _mm_prefetch(reinterpret_cast(p), _MM_HINT_T0); -#elif HWY_COMPILER_GCC // includes clang - // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not - // desirable, so use the default 3 (keep in caches). - __builtin_prefetch(p, /*write=*/0, /*hint=*/3); -#else - (void)p; -#endif -} - -// Invalidates and flushes the cache line containing "p", if possible. -HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) { -#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) - _mm_clflush(p); -#else - (void)p; -#endif -} - -// When called inside a spin-loop, may reduce power consumption. -HWY_INLINE HWY_ATTR_CACHE void Pause() { -#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) - _mm_pause(); -#endif -} - -} // namespace hwy - -#endif // HIGHWAY_HWY_CACHE_CONTROL_H_ diff --git a/deps/highway/include/hwy/detect_compiler_arch.h b/deps/highway/include/hwy/detect_compiler_arch.h deleted file mode 100644 index 081b6fff..00000000 --- a/deps/highway/include/hwy/detect_compiler_arch.h +++ /dev/null @@ -1,281 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_ -#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_ - -// Detects compiler and arch from predefined macros. Zero dependencies for -// inclusion by foreach_target.h. - -// Add to #if conditions to prevent IDE from graying out code. -#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \ - (defined Q_CREATOR_RUN) || (defined __CLANGD__) || \ - (defined GROK_ELLIPSIS_BUILD) -#define HWY_IDE 1 -#else -#define HWY_IDE 0 -#endif - -//------------------------------------------------------------------------------ -// Compiler - -// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like -// MSVC in other aspects (e.g. HWY_DIAGNOSTICS). -#if defined(_MSC_VER) && !defined(__clang__) -#define HWY_COMPILER_MSVC _MSC_VER -#else -#define HWY_COMPILER_MSVC 0 -#endif - -#if defined(_MSC_VER) && defined(__clang__) -#define HWY_COMPILER_CLANGCL _MSC_VER -#else -#define HWY_COMPILER_CLANGCL 0 -#endif - -#ifdef __INTEL_COMPILER -#define HWY_COMPILER_ICC __INTEL_COMPILER -#else -#define HWY_COMPILER_ICC 0 -#endif - -#ifdef __INTEL_LLVM_COMPILER -#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER -#else -#define HWY_COMPILER_ICX 0 -#endif - -// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU -// compiler extensions (eg. Clang, Intel...) -#ifdef __GNUC__ -#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__) -#else -#define HWY_COMPILER_GCC 0 -#endif - -// Clang or clang-cl, not GCC. -#ifdef __clang__ -// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or -// an invalid version number, deduce it from the presence of warnings. -// Originally based on -// https://github.com/simd-everywhere/simde/blob/47d6e603de9d04ee05cdfbc57cf282a02be1bf2a/simde/simde-detect-clang.h#L59. -// Please send updates below to them as well, thanks! -#if defined(__apple_build_version__) || __clang_major__ >= 999 -#if __has_attribute(nouwtable) // no new warnings in 16.0 -#define HWY_COMPILER_CLANG 1600 -#elif __has_warning("-Warray-parameter") -#define HWY_COMPILER_CLANG 1500 -#elif __has_warning("-Wbitwise-instead-of-logical") -#define HWY_COMPILER_CLANG 1400 -#elif __has_warning("-Wreserved-identifier") -#define HWY_COMPILER_CLANG 1300 -#elif __has_warning("-Wformat-insufficient-args") -#define HWY_COMPILER_CLANG 1200 -#elif __has_warning("-Wimplicit-const-int-float-conversion") -#define HWY_COMPILER_CLANG 1100 -#elif __has_warning("-Wmisleading-indentation") -#define HWY_COMPILER_CLANG 1000 -#elif defined(__FILE_NAME__) -#define HWY_COMPILER_CLANG 900 -#elif __has_warning("-Wextra-semi-stmt") || \ - __has_builtin(__builtin_rotateleft32) -#define HWY_COMPILER_CLANG 800 -// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently -// based on Clang 7, but does not support the warning we test. -// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and -// https://trac.macports.org/wiki/XcodeVersionInfo. -#elif __has_warning("-Wc++98-compat-extra-semi") || \ - (defined(__apple_build_version__) && __apple_build_version__ >= 10010000) -#define HWY_COMPILER_CLANG 700 -#else // Anything older than 7.0 is not recommended for Highway. 
-#define HWY_COMPILER_CLANG 600 -#endif // __has_warning chain -#define HWY_COMPILER3_CLANG (HWY_COMPILER_CLANG * 100) -#else // use normal version -#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__) -#define HWY_COMPILER3_CLANG \ - (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) -#endif -#else // Not clang -#define HWY_COMPILER_CLANG 0 -#define HWY_COMPILER3_CLANG 0 -#endif - -#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && !HWY_COMPILER_ICC -#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC -#else -#define HWY_COMPILER_GCC_ACTUAL 0 -#endif - -// More than one may be nonzero, but we want at least one. -#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \ - HWY_COMPILER_GCC + HWY_COMPILER_CLANG) -#error "Unsupported compiler" -#endif - -// We should only detect one of these (only clang/clangcl overlap) -#if 1 < \ - (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \ - !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG)) -#error "Detected multiple compilers" -#endif - -#ifdef __has_builtin -#define HWY_HAS_BUILTIN(name) __has_builtin(name) -#else -#define HWY_HAS_BUILTIN(name) 0 -#endif - -#ifdef __has_attribute -#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name) -#else -#define HWY_HAS_ATTRIBUTE(name) 0 -#endif - -#ifdef __has_cpp_attribute -#define HWY_HAS_CPP_ATTRIBUTE(name) __has_cpp_attribute(name) -#else -#define HWY_HAS_CPP_ATTRIBUTE(name) 0 -#endif - -#ifdef __has_feature -#define HWY_HAS_FEATURE(name) __has_feature(name) -#else -#define HWY_HAS_FEATURE(name) 0 -#endif - -//------------------------------------------------------------------------------ -// Architecture - -#if defined(__i386__) || defined(_M_IX86) -#define HWY_ARCH_X86_32 1 -#else -#define HWY_ARCH_X86_32 0 -#endif - -#if defined(__x86_64__) || defined(_M_X64) -#define HWY_ARCH_X86_64 1 -#else -#define HWY_ARCH_X86_64 0 -#endif - -#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64 -#error "Cannot have both x86-32 and x86-64" -#endif - -#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64 -#define HWY_ARCH_X86 1 -#else -#define HWY_ARCH_X86 0 -#endif - -#if defined(__powerpc64__) || defined(_M_PPC) || defined(__powerpc__) -#define HWY_ARCH_PPC 1 -#else -#define HWY_ARCH_PPC 0 -#endif - -// aarch32 is currently not supported; please raise an issue if you want it. -#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64) -#define HWY_ARCH_ARM_A64 1 -#else -#define HWY_ARCH_ARM_A64 0 -#endif - -#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7) -#define HWY_ARCH_ARM_V7 1 -#else -#define HWY_ARCH_ARM_V7 0 -#endif - -#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7 -#error "Cannot have both A64 and V7" -#endif - -// Any *supported* version of Arm, i.e. 7 or later -#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7 -#define HWY_ARCH_ARM 1 -#else -#define HWY_ARCH_ARM 0 -#endif - -// Older than Armv7 (e.g. armel aka Armv5) => we do not support SIMD. -#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM -#define HWY_ARCH_ARM_OLD 1 -#else -#define HWY_ARCH_ARM_OLD 0 -#endif - -#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__) -#define HWY_ARCH_WASM 1 -#else -#define HWY_ARCH_WASM 0 -#endif - -#ifdef __riscv -#define HWY_ARCH_RVV 1 -#else -#define HWY_ARCH_RVV 0 -#endif - -// It is an error to detect multiple architectures at the same time, but OK to -// detect none of the above. 
-#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \ - HWY_ARCH_WASM + HWY_ARCH_RVV) > 1 -#error "Must not detect more than one architecture" -#endif - -#if defined(_WIN32) || defined(_WIN64) -#define HWY_OS_WIN 1 -#else -#define HWY_OS_WIN 0 -#endif - -#if defined(linux) || defined(__linux__) -#define HWY_OS_LINUX 1 -#else -#define HWY_OS_LINUX 0 -#endif - -//------------------------------------------------------------------------------ -// Endianness - -#if HWY_COMPILER_MSVC -#if HWY_ARCH_PPC && defined(_XBOX_VER) && _XBOX_VER >= 200 -// XBox 360 is big-endian -#define HWY_IS_LITTLE_ENDIAN 0 -#define HWY_IS_BIG_ENDIAN 1 -#else -// All other targets supported by MSVC are little-endian -#define HWY_IS_LITTLE_ENDIAN 1 -#define HWY_IS_BIG_ENDIAN 0 -#endif // HWY_ARCH_PPC && defined(_XBOX_VER) && _XBOX_VER >= 200 -#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \ - __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define HWY_IS_LITTLE_ENDIAN 1 -#define HWY_IS_BIG_ENDIAN 0 -#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \ - __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define HWY_IS_LITTLE_ENDIAN 0 -#define HWY_IS_BIG_ENDIAN 1 -#else -#error "Unable to detect endianness or unsupported byte order" -#endif - -#if (HWY_IS_LITTLE_ENDIAN + HWY_IS_BIG_ENDIAN) != 1 -#error "Must only detect one byte order" -#endif - -#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_ diff --git a/deps/highway/include/hwy/detect_targets.h b/deps/highway/include/hwy/detect_targets.h deleted file mode 100644 index c99fc277..00000000 --- a/deps/highway/include/hwy/detect_targets.h +++ /dev/null @@ -1,644 +0,0 @@ -// Copyright 2021 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_ -#define HIGHWAY_HWY_DETECT_TARGETS_H_ - -// Defines targets and chooses which to enable. - -#include "hwy/detect_compiler_arch.h" - -//------------------------------------------------------------------------------ -// Optional configuration - -// See g3doc/quick_reference.md for documentation of these macros. - -// Uncomment to override the default baseline determined from predefined macros: -// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR) - -// Uncomment to override the default blocklist: -// #define HWY_BROKEN_TARGETS HWY_AVX3 - -// Uncomment to definitely avoid generating those target(s): -// #define HWY_DISABLED_TARGETS HWY_SSE4 - -// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating -// AVX2 target for VMs which support AVX2 but not the other instruction sets) -// #define HWY_DISABLE_BMI2_FMA - -// Uncomment to enable these on MSVC even if the predefined macros are not set. -// #define HWY_WANT_SSE2 1 -// #define HWY_WANT_SSSE3 1 -// #define HWY_WANT_SSE4 1 - -//------------------------------------------------------------------------------ -// Targets - -// Unique bit value for each target. A lower value is "better" (e.g. 
more lanes) -// than a higher value within the same group/platform - see HWY_STATIC_TARGET. -// -// All values are unconditionally defined so we can test HWY_TARGETS without -// first checking the HWY_ARCH_*. -// -// The C99 preprocessor evaluates #if expressions using intmax_t types. This -// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on -// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now -// avoid overflow when computing HWY_TARGETS (subtracting one instead of -// left-shifting 2^62), but still do not use bit 63 because it is the sign bit. - -// --------------------------- x86: 15 targets (+ one fallback) -// Bits 0..3 reserved (4 targets) -#define HWY_AVX3_SPR (1LL << 4) -// Bit 5 reserved (likely AVX10.2 with 256-bit vectors) -// Currently HWY_AVX3_DL plus a special case for CompressStore (10x as fast). -// We may later also use VPCONFLICT. -#define HWY_AVX3_ZEN4 (1LL << 6) // see HWY_WANT_AVX3_ZEN4 below - -// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2, -// VAES, BITALG, GFNI). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is -// only in Tiger Lake? -#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below -#define HWY_AVX3 (1LL << 8) // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL -#define HWY_AVX2 (1LL << 9) // HWY_SSE4 plus BMI2 + F16 + FMA -// Bit 10: reserved -#define HWY_SSE4 (1LL << 11) // SSE4.2 plus AES + CLMUL -#define HWY_SSSE3 (1LL << 12) // S-SSE3 -// Bit 13: reserved for SSE3 -#define HWY_SSE2 (1LL << 14) -// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for -// dynamic dispatch. All x86 target bits must be lower or equal to -// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use -// HWY_MAX_DYNAMIC_TARGETS in total. -#define HWY_HIGHEST_TARGET_BIT_X86 14 - -// --------------------------- Arm: 15 targets (+ one fallback) -// Bits 15..23 reserved (9 targets) -#define HWY_SVE2_128 (1LL << 24) // specialized target (e.g. Arm N2) -#define HWY_SVE_256 (1LL << 25) // specialized target (e.g. Arm V1) -#define HWY_SVE2 (1LL << 26) -#define HWY_SVE (1LL << 27) -#define HWY_NEON (1LL << 28) // Implies support for AES -#define HWY_NEON_WITHOUT_AES (1LL << 29) -#define HWY_HIGHEST_TARGET_BIT_ARM 29 - -// --------------------------- RISC-V: 9 targets (+ one fallback) -// Bits 30..36 reserved (7 targets) -#define HWY_RVV (1LL << 37) -// Bit 38 reserved -#define HWY_HIGHEST_TARGET_BIT_RVV 38 - -// --------------------------- Future expansion: 4 targets -// Bits 39..42 reserved - -// --------------------------- IBM Power: 9 targets (+ one fallback) -// Bits 43..46 reserved (4 targets) -#define HWY_PPC10 (1LL << 47) // v3.1 -#define HWY_PPC9 (1LL << 48) // v3.0 -#define HWY_PPC8 (1LL << 49) // v2.07 -// Bits 50..51 reserved for prior VSX/AltiVec (2 targets) -#define HWY_HIGHEST_TARGET_BIT_PPC 51 - -// --------------------------- WebAssembly: 9 targets (+ one fallback) -// Bits 52..57 reserved (6 targets) -#define HWY_WASM_EMU256 (1LL << 58) // Experimental -#define HWY_WASM (1LL << 59) -// Bits 60 reserved -#define HWY_HIGHEST_TARGET_BIT_WASM 60 - -// --------------------------- Emulation: 2 targets - -#define HWY_EMU128 (1LL << 61) -// We do not add/left-shift, so this will not overflow to a negative number. -#define HWY_SCALAR (1LL << 62) -#define HWY_HIGHEST_TARGET_BIT_SCALAR 62 - -// Do not use bit 63 - would be confusing to have negative numbers. 
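// Aside: since a lower bit value denotes a "better" target, the best target in
// any mask can be isolated by clearing all but its least-significant set bit,
// which is how HWY_STATIC_TARGET is derived from HWY_ENABLED_BASELINE further
// below. A minimal sketch of that idea (BestTarget is an illustrative helper,
// not a Highway API):
//
//   constexpr long long BestTarget(long long targets) {
//     return targets & (-targets);  // keep only the lowest set bit
//   }
//   static_assert(BestTarget(HWY_SSE4 | HWY_AVX2 | HWY_SCALAR) == HWY_AVX2,
//                 "AVX2 (bit 9) outranks SSE4 (bit 11) and SCALAR (bit 62)");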
- -//------------------------------------------------------------------------------ -// Set default blocklists - -// Disabled means excluded from enabled at user's request. A separate config -// macro allows disabling without deactivating the blocklist below. -#ifndef HWY_DISABLED_TARGETS -#define HWY_DISABLED_TARGETS 0 -#endif - -// Broken means excluded from enabled due to known compiler issues. We define -// separate HWY_BROKEN_* and then OR them together (more than one might apply). - -// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid -// SSE4 codegen (possibly only for msan), so disable all those targets. -#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) - -#define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1)) -// This entails a major speed reduction, so warn unless the user explicitly -// opts in to scalar-only. -#if !defined(HWY_COMPILE_ONLY_SCALAR) -#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.") -#endif - -#else -#define HWY_BROKEN_CLANG6 0 -#endif - -// 32-bit may fail to compile AVX2/3. -#if HWY_ARCH_X86_32 -#define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1)) -#else -#define HWY_BROKEN_32BIT 0 -#endif - -// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16 -#if HWY_COMPILER_MSVC != 0 -#define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1)) -#else -#define HWY_BROKEN_MSVC 0 -#endif - -// AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC -// 2021. -#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \ - (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021) -#define HWY_BROKEN_AVX3_DL_ZEN4 (HWY_AVX3_DL | HWY_AVX3_ZEN4) -#else -#define HWY_BROKEN_AVX3_DL_ZEN4 0 -#endif - -// AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021. -#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) || \ - (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \ - (HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021) -#define HWY_BROKEN_AVX3_SPR (HWY_AVX3_SPR) -#else -#define HWY_BROKEN_AVX3_SPR 0 -#endif - -// armv7be has not been tested and is not yet supported. -#if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN -#define HWY_BROKEN_ARM7_BIG_ENDIAN (HWY_NEON | HWY_NEON_WITHOUT_AES) -#else -#define HWY_BROKEN_ARM7_BIG_ENDIAN 0 -#endif - -// armv7-a without a detected vfpv4 is not supported -// (for example Cortex-A8, Cortex-A9) -// vfpv4 always have neon half-float _and_ FMA. -#if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \ - !defined(__ARM_VFPV4__) && \ - !((__ARM_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1)) -#define HWY_BROKEN_ARM7_WITHOUT_VFP4 (HWY_NEON | HWY_NEON_WITHOUT_AES) -#else -#define HWY_BROKEN_ARM7_WITHOUT_VFP4 0 -#endif - -// SVE[2] require recent clang or gcc versions. 
-#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \ - (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) -#define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128) -#else -#define HWY_BROKEN_SVE 0 -#endif - -#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100) -// GCC 10 supports the -mcpu=power10 option but does not support the PPC10 -// vector intrinsics -#define HWY_BROKEN_PPC10 (HWY_PPC10) -#elif HWY_ARCH_PPC && HWY_IS_BIG_ENDIAN && \ - ((HWY_COMPILER3_CLANG && HWY_COMPILER3_CLANG < 160001) || \ - (HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_COMPILER_GCC_ACTUAL <= 1203) || \ - (HWY_COMPILER_GCC_ACTUAL >= 1300 && HWY_COMPILER_GCC_ACTUAL <= 1301)) -// GCC 12.0 through 12.3 and GCC 13.0 through 13.1 have a compiler bug where the -// vsldoi instruction is sometimes incorrectly optimized out (and this causes -// some of the Highway unit tests to fail on big-endian PPC10). Details about -// this compiler bug can be found at -// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069, and this bug will be -// fixed in the upcoming GCC 12.4 and 13.2 releases. - -// Clang 16.0.0 and earlier (but not Clang 16.0.1 and later) have a compiler -// bug in the LLVM DAGCombiner that causes a zero-extend followed by an -// element insert into a vector, followed by a vector shuffle to be incorrectly -// optimized on big-endian PPC (and which caused some of the Highway unit tests -// to fail on big-endian PPC10). - -// Details about this bug, which has already been fixed in Clang 16.0.1 and -// later, can be found at https://github.com/llvm/llvm-project/issues/61315. -#define HWY_BROKEN_PPC10 (HWY_PPC10) -#else -#define HWY_BROKEN_PPC10 0 -#endif - -// Allow the user to override this without any guarantee of success. -#ifndef HWY_BROKEN_TARGETS - -#define HWY_BROKEN_TARGETS \ - (HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \ - HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \ - HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \ - HWY_BROKEN_SVE | HWY_BROKEN_PPC10) - -#endif // HWY_BROKEN_TARGETS - -// Enabled means not disabled nor blocklisted. -#define HWY_ENABLED(targets) \ - ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS))) - -// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3: -// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate -// from HWY_BROKEN_TARGETS because it affects the fallback target, which must -// always be enabled. If 1, we instead choose HWY_SCALAR even without -// HWY_COMPILE_ONLY_SCALAR being set. -#if !defined(HWY_BROKEN_EMU128) // allow overriding -#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203) || \ - defined(HWY_NO_LIBCXX) -#define HWY_BROKEN_EMU128 1 -#else -#define HWY_BROKEN_EMU128 0 -#endif -#endif // HWY_BROKEN_EMU128 - -//------------------------------------------------------------------------------ -// Detect baseline targets using predefined macros - -// Baseline means the targets for which the compiler is allowed to generate -// instructions, implying the target CPU would have to support them. This does -// not take the blocklist into account. - -#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128 -#define HWY_BASELINE_SCALAR HWY_SCALAR -#else -#define HWY_BASELINE_SCALAR HWY_EMU128 -#endif - -// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with -// HWY_TARGET == HWY_BASELINE_SCALAR. 
- -#if HWY_ARCH_WASM && defined(__wasm_simd128__) -#if defined(HWY_WANT_WASM2) -#define HWY_BASELINE_WASM HWY_WASM_EMU256 -#else -#define HWY_BASELINE_WASM HWY_WASM -#endif // HWY_WANT_WASM2 -#else -#define HWY_BASELINE_WASM 0 -#endif - -// GCC or Clang. -#if HWY_ARCH_PPC && HWY_COMPILER_GCC && defined(__ALTIVEC__) && \ - defined(__VSX__) && defined(__POWER8_VECTOR__) && \ - (defined(__CRYPTO__) || defined(HWY_DISABLE_PPC8_CRYPTO)) -#define HWY_BASELINE_PPC8 HWY_PPC8 -#else -#define HWY_BASELINE_PPC8 0 -#endif - -#if HWY_BASELINE_PPC8 != 0 && defined(__POWER9_VECTOR__) -#define HWY_BASELINE_PPC9 HWY_PPC9 -#else -#define HWY_BASELINE_PPC9 0 -#endif - -#if HWY_BASELINE_PPC9 != 0 && \ - (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__)) -#define HWY_BASELINE_PPC10 HWY_PPC10 -#else -#define HWY_BASELINE_PPC10 0 -#endif - -#define HWY_BASELINE_SVE2 0 -#define HWY_BASELINE_SVE 0 -#define HWY_BASELINE_NEON 0 - -#if HWY_ARCH_ARM - -#if defined(__ARM_FEATURE_SVE2) -#undef HWY_BASELINE_SVE2 // was 0, will be re-defined -// If user specified -msve-vector-bits=128, they assert the vector length is -// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops). -#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128 -#define HWY_BASELINE_SVE2 HWY_SVE2_128 -// Otherwise we're not sure what the vector length will be. The baseline must be -// unconditionally valid, so we can only assume HWY_SVE2. However, when running -// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will -// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS. -#else -#define HWY_BASELINE_SVE2 HWY_SVE2 -#endif // __ARM_FEATURE_SVE_BITS -#endif // __ARM_FEATURE_SVE2 - -#if defined(__ARM_FEATURE_SVE) -#undef HWY_BASELINE_SVE // was 0, will be re-defined -// See above. If user-specified vector length matches our optimization, use it. -#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256 -#define HWY_BASELINE_SVE HWY_SVE_256 -#else -#define HWY_BASELINE_SVE HWY_SVE -#endif // __ARM_FEATURE_SVE_BITS -#endif // __ARM_FEATURE_SVE - -// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both. -#if defined(__ARM_NEON__) || defined(__ARM_NEON) -#undef HWY_BASELINE_NEON -#if defined(__ARM_FEATURE_AES) -#define HWY_BASELINE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES) -#else -#define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES) -#endif -#endif - -#endif // HWY_ARCH_ARM - -// Special handling for MSVC because it has fewer predefined macros: -#if HWY_COMPILER_MSVC - -#if HWY_ARCH_X86_32 -#if _M_IX86_FP >= 2 -#define HWY_CHECK_SSE2 1 -#else -#define HWY_CHECK_SSE2 0 -#endif -#elif HWY_ARCH_X86_64 -#define HWY_CHECK_SSE2 1 -#else -#define HWY_CHECK_SSE2 0 -#endif - -// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is: -// https://stackoverflow.com/questions/18563978/. -#if defined(__AVX__) -#define HWY_CHECK_SSSE3 1 -#define HWY_CHECK_SSE4 1 -#else -#define HWY_CHECK_SSSE3 0 -#define HWY_CHECK_SSE4 0 -#endif - -// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume -// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is. 
-#define HWY_CHECK_PCLMUL_AES 1 -#define HWY_CHECK_BMI2_FMA 1 -#define HWY_CHECK_F16C 1 - -#else // non-MSVC - -#if defined(__SSE2__) -#define HWY_CHECK_SSE2 1 -#else -#define HWY_CHECK_SSE2 0 -#endif - -#if defined(__SSSE3__) -#define HWY_CHECK_SSSE3 1 -#else -#define HWY_CHECK_SSSE3 0 -#endif - -#if defined(__SSE4_1__) && defined(__SSE4_2__) -#define HWY_CHECK_SSE4 1 -#else -#define HWY_CHECK_SSE4 0 -#endif - -// If these are disabled, they should not gate the availability of SSE4/AVX2. -#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__)) -#define HWY_CHECK_PCLMUL_AES 1 -#else -#define HWY_CHECK_PCLMUL_AES 0 -#endif - -#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__)) -#define HWY_CHECK_BMI2_FMA 1 -#else -#define HWY_CHECK_BMI2_FMA 0 -#endif - -#if defined(HWY_DISABLE_F16C) || defined(__F16C__) -#define HWY_CHECK_F16C 1 -#else -#define HWY_CHECK_F16C 0 -#endif - -#endif // non-MSVC - -#if HWY_ARCH_X86 && (HWY_WANT_SSE2 || HWY_CHECK_SSE2) -#define HWY_BASELINE_SSE2 HWY_SSE2 -#else -#define HWY_BASELINE_SSE2 0 -#endif - -#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3) -#define HWY_BASELINE_SSSE3 HWY_SSSE3 -#else -#define HWY_BASELINE_SSSE3 0 -#endif - -#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES)) -#define HWY_BASELINE_SSE4 HWY_SSE4 -#else -#define HWY_BASELINE_SSE4 0 -#endif - -#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \ - defined(__AVX2__) -#define HWY_BASELINE_AVX2 HWY_AVX2 -#else -#define HWY_BASELINE_AVX2 0 -#endif - -// Require everything in AVX2 plus AVX-512 flags (also set by MSVC) -#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \ - defined(__AVX512DQ__) && defined(__AVX512VL__) -#define HWY_BASELINE_AVX3 HWY_AVX3 -#else -#define HWY_BASELINE_AVX3 0 -#endif - -// TODO(janwas): not yet known whether these will be set by MSVC -#if HWY_BASELINE_AVX3 != 0 && defined(__AVX512VNNI__) && defined(__VAES__) && \ - defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \ - defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \ - defined(__AVX512BITALG__) -#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL -#else -#define HWY_BASELINE_AVX3_DL 0 -#endif - -// The ZEN4-optimized AVX3 target is numerically lower than AVX3_DL and is thus -// considered better. Do not enable it unless the user explicitly requests it - -// we do not want to choose the ZEN4 path on Intel because it could be slower. -#if defined(HWY_WANT_AVX3_ZEN4) && HWY_BASELINE_AVX3_DL != 0 -#define HWY_BASELINE_AVX3_ZEN4 HWY_AVX3_ZEN4 -#else -#define HWY_BASELINE_AVX3_ZEN4 0 -#endif - -#if HWY_BASELINE_AVX3_DL != 0 && defined(__AVX512FP16__) -#define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR -#else -#define HWY_BASELINE_AVX3_SPR 0 -#endif - -// RVV requires intrinsics 0.11 or later, see #1156. -#if HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 11000 -#define HWY_BASELINE_RVV HWY_RVV -#else -#define HWY_BASELINE_RVV 0 -#endif - -// Allow the user to override this without any guarantee of success. 
-#ifndef HWY_BASELINE_TARGETS -#define HWY_BASELINE_TARGETS \ - (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \ - HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_SVE2 | \ - HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | \ - HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \ - HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | \ - HWY_BASELINE_AVX3_SPR | HWY_BASELINE_RVV) -#endif // HWY_BASELINE_TARGETS - -//------------------------------------------------------------------------------ -// Choose target for static dispatch - -#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS) -#if HWY_ENABLED_BASELINE == 0 -#error "At least one baseline target must be defined and enabled" -#endif - -// Best baseline, used for static dispatch. This is the least-significant 1-bit -// within HWY_ENABLED_BASELINE and lower bit values imply "better". -#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE) - -// Start by assuming static dispatch. If we later use dynamic dispatch, this -// will be defined to other targets during the multiple-inclusion, and finally -// return to the initial value. Defining this outside begin/end_target ensures -// inl headers successfully compile by themselves (required by Bazel). -#define HWY_TARGET HWY_STATIC_TARGET - -//------------------------------------------------------------------------------ -// Choose targets for dynamic dispatch according to one of four policies - -#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \ - defined(HWY_COMPILE_ONLY_STATIC)) -#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?" -#endif -// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE. - -// Clang, GCC and MSVC allow runtime dispatch on x86. -#if HWY_ARCH_X86 -#define HWY_HAVE_RUNTIME_DISPATCH 1 -// On Arm/PPC, currently only GCC does, and we require Linux to detect CPU -// capabilities. -#elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && HWY_COMPILER_GCC_ACTUAL && \ - HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H) -#define HWY_HAVE_RUNTIME_DISPATCH 1 -#else -#define HWY_HAVE_RUNTIME_DISPATCH 0 -#endif - -// AVX3_DL is not widely available yet. To reduce code size and compile time, -// only include it in the set of attainable targets (for dynamic dispatch) if -// the user opts in, OR it is in the baseline (we check whether enabled below). 
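// For example, a build that wants the AVX3_DL code path compiled for dynamic
// dispatch even though it is not part of the baseline could pass the following
// (illustrative) compiler flag; since only defined() is tested below, any
// definition of the macro suffices:
//
//   -DHWY_WANT_AVX3_DL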
-#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL) -#define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL) -#else -#define HWY_ATTAINABLE_AVX3_DL 0 -#endif - -#if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH -#define HWY_ATTAINABLE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES) -#elif HWY_ARCH_ARM // static dispatch, or HWY_ARCH_ARM_V7 -#define HWY_ATTAINABLE_NEON (HWY_BASELINE_NEON) -#else -#define HWY_ATTAINABLE_NEON 0 -#endif - -#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \ - (HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256))) -#define HWY_ATTAINABLE_SVE (HWY_SVE | HWY_SVE_256) -#else -#define HWY_ATTAINABLE_SVE 0 -#endif - -#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \ - (HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128))) -#define HWY_ATTAINABLE_SVE2 (HWY_SVE2 | HWY_SVE2_128) -#else -#define HWY_ATTAINABLE_SVE2 0 -#endif - -#if HWY_ARCH_PPC && defined(__ALTIVEC__) && \ - (!HWY_COMPILER_CLANG || HWY_BASELINE_PPC8 != 0) -#define HWY_ATTAINABLE_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10) -#else -#define HWY_ATTAINABLE_PPC 0 -#endif - -// Attainable means enabled and the compiler allows intrinsics (even when not -// allowed to autovectorize). Used in 3 and 4. -#if HWY_ARCH_X86 -#define HWY_ATTAINABLE_TARGETS \ - HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \ - HWY_AVX2 | HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL | HWY_AVX3_ZEN4 | \ - HWY_AVX3_SPR) -#elif HWY_ARCH_ARM -#define HWY_ATTAINABLE_TARGETS \ - HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON | HWY_ATTAINABLE_SVE | \ - HWY_ATTAINABLE_SVE2) -#elif HWY_ARCH_PPC -#define HWY_ATTAINABLE_TARGETS \ - HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_PPC) -#else -#define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE) -#endif // HWY_ARCH_* - -// 1) For older compilers: avoid SIMD intrinsics, but still support all ops. -#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128 -#undef HWY_STATIC_TARGET -#define HWY_STATIC_TARGET HWY_EMU128 // override baseline -#define HWY_TARGETS HWY_EMU128 - -// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but -// we currently still support it for backwards compatibility. -#elif defined(HWY_COMPILE_ONLY_SCALAR) || \ - (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128) -#undef HWY_STATIC_TARGET -#define HWY_STATIC_TARGET HWY_SCALAR // override baseline -#define HWY_TARGETS HWY_SCALAR - -// 2) For forcing static dispatch without code changes (removing HWY_EXPORT) -#elif defined(HWY_COMPILE_ONLY_STATIC) -#define HWY_TARGETS HWY_STATIC_TARGET - -// 3) For tests: include all attainable targets (in particular: scalar) -#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST) -#define HWY_TARGETS HWY_ATTAINABLE_TARGETS - -// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by -// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET -// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one -// sets all lower bits (better targets), then we also include the static target. -#else -#define HWY_TARGETS \ - (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET)) - -#endif // target policy - -// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being -// one of the dynamic targets. This also implies HWY_TARGETS != 0 and -// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0. 
-#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0 -#error "Logic error: best baseline should be included in dynamic targets" -#endif - -#endif // HIGHWAY_HWY_DETECT_TARGETS_H_ diff --git a/deps/highway/include/hwy/foreach_target.h b/deps/highway/include/hwy/foreach_target.h deleted file mode 100644 index ca3e5a24..00000000 --- a/deps/highway/include/hwy/foreach_target.h +++ /dev/null @@ -1,340 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_ -#define HIGHWAY_HWY_FOREACH_TARGET_H_ - -// Re-includes the translation unit zero or more times to compile for any -// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that -// highway.h defines the corresponding macro/namespace. - -#include "hwy/detect_targets.h" - -// *_inl.h may include other headers, which requires include guards to prevent -// repeated inclusion. The guards must be reset after compiling each target, so -// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE, -// defining it if undefined and vice versa. This macro is initially undefined -// so that IDEs don't gray out the contents of each header. -#ifdef HWY_TARGET_TOGGLE -#error "This macro must not be defined outside foreach_target.h" -#endif - -#ifdef HWY_HIGHWAY_INCLUDED // highway.h include guard -// Trigger fixup at the bottom of this header. -#define HWY_ALREADY_INCLUDED - -// The next highway.h must re-include set_macros-inl.h because the first -// highway.h chose the static target instead of what we will set below. -#undef HWY_SET_MACROS_PER_TARGET -#endif - -// Disable HWY_EXPORT in user code until we have generated all targets. Note -// that a subsequent highway.h will not override this definition. -#undef HWY_ONCE -#define HWY_ONCE (0 || HWY_IDE) - -// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE; -// also skip if only 1 target defined (no re-inclusion will be necessary). 
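// For reference, a translation unit that wants per-target code generation
// typically defines HWY_TARGET_INCLUDE as its own path before including this
// header (a sketch; the path is a placeholder):
//
//   #undef HWY_TARGET_INCLUDE
//   #define HWY_TARGET_INCLUDE "my_project/my_simd_file.cc"
//   #include "hwy/foreach_target.h"  // must come before highway.h
//   #include "hwy/highway.h"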
-#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET) - -#if !defined(HWY_TARGET_INCLUDE) -#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h" -#endif - -// ------------------------------ HWY_ARCH_X86 - -#if (HWY_TARGETS & HWY_SSE2) && (HWY_STATIC_TARGET != HWY_SSE2) -#undef HWY_TARGET -#define HWY_TARGET HWY_SSE2 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3) -#undef HWY_TARGET -#define HWY_TARGET HWY_SSSE3 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4) -#undef HWY_TARGET -#define HWY_TARGET HWY_SSE4 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2) -#undef HWY_TARGET -#define HWY_TARGET HWY_AVX2 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3) -#undef HWY_TARGET -#define HWY_TARGET HWY_AVX3 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL) -#undef HWY_TARGET -#define HWY_TARGET HWY_AVX3_DL -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_AVX3_ZEN4) && (HWY_STATIC_TARGET != HWY_AVX3_ZEN4) -#undef HWY_TARGET -#define HWY_TARGET HWY_AVX3_ZEN4 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_AVX3_SPR) && (HWY_STATIC_TARGET != HWY_AVX3_SPR) -#undef HWY_TARGET -#define HWY_TARGET HWY_AVX3_SPR -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -// ------------------------------ HWY_ARCH_ARM - -#if (HWY_TARGETS & HWY_NEON_WITHOUT_AES) && \ - (HWY_STATIC_TARGET != HWY_NEON_WITHOUT_AES) -#undef HWY_TARGET -#define HWY_TARGET HWY_NEON_WITHOUT_AES -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON) -#undef HWY_TARGET -#define HWY_TARGET HWY_NEON -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE) -#undef HWY_TARGET -#define HWY_TARGET HWY_SVE -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2) -#undef HWY_TARGET -#define HWY_TARGET HWY_SVE2 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256) -#undef HWY_TARGET -#define HWY_TARGET HWY_SVE_256 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif 
-#endif - -#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128) -#undef HWY_TARGET -#define HWY_TARGET HWY_SVE2_128 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -// ------------------------------ HWY_ARCH_WASM - -#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256) -#undef HWY_TARGET -#define HWY_TARGET HWY_WASM_EMU256 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM) -#undef HWY_TARGET -#define HWY_TARGET HWY_WASM -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -// ------------------------------ HWY_ARCH_PPC - -#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8) -#undef HWY_TARGET -#define HWY_TARGET HWY_PPC8 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_PPC9) && (HWY_STATIC_TARGET != HWY_PPC9) -#undef HWY_TARGET -#define HWY_TARGET HWY_PPC9 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_PPC10) && (HWY_STATIC_TARGET != HWY_PPC10) -#undef HWY_TARGET -#define HWY_TARGET HWY_PPC10 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -// ------------------------------ HWY_ARCH_RVV - -#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV) -#undef HWY_TARGET -#define HWY_TARGET HWY_RVV -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -// ------------------------------ Scalar - -#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128) -#undef HWY_TARGET -#define HWY_TARGET HWY_EMU128 -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR) -#undef HWY_TARGET -#define HWY_TARGET HWY_SCALAR -#include HWY_TARGET_INCLUDE -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif -#endif - -#endif // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET) - -// Now that all but the static target have been generated, re-enable HWY_EXPORT. -#undef HWY_ONCE -#define HWY_ONCE 1 - -// If we re-include once per enabled target, the translation unit's -// implementation would have to be skipped via #if to avoid redefining symbols. -// We instead skip the re-include for HWY_STATIC_TARGET, and generate its -// implementation when resuming compilation of the translation unit. -#undef HWY_TARGET -#define HWY_TARGET HWY_STATIC_TARGET - -#ifdef HWY_ALREADY_INCLUDED -// Revert the previous toggle to prevent redefinitions for the static target. -#ifdef HWY_TARGET_TOGGLE -#undef HWY_TARGET_TOGGLE -#else -#define HWY_TARGET_TOGGLE -#endif - -// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored. 
-#ifdef HWY_SET_MACROS_PER_TARGET -#undef HWY_SET_MACROS_PER_TARGET -#else -#define HWY_SET_MACROS_PER_TARGET -#endif -#endif - -#endif // HIGHWAY_HWY_FOREACH_TARGET_H_ diff --git a/deps/highway/include/hwy/highway.h b/deps/highway/include/hwy/highway.h deleted file mode 100644 index 99d74619..00000000 --- a/deps/highway/include/hwy/highway.h +++ /dev/null @@ -1,435 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Main header required before using vector types. - -// IWYU pragma: begin_exports -#include "hwy/base.h" -#include "hwy/detect_compiler_arch.h" -#include "hwy/highway_export.h" -#include "hwy/targets.h" -// IWYU pragma: end_exports - -// This include guard is checked by foreach_target, so avoid the usual _H_ -// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included -// after/outside this include guard. -#ifndef HWY_HIGHWAY_INCLUDED -#define HWY_HIGHWAY_INCLUDED - -namespace hwy { - -// API version (https://semver.org/); keep in sync with CMakeLists.txt. -#define HWY_MAJOR 1 -#define HWY_MINOR 0 -#define HWY_PATCH 6 - -//------------------------------------------------------------------------------ -// Shorthand for tags (defined in shared-inl.h) used to select overloads. -// Note that ScalableTag is preferred over HWY_FULL, and CappedTag over -// HWY_CAPPED(T, N). - -// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of -// registers in the group, and is ignored on targets that do not support groups. -#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag -#define HWY_FULL2(T, LMUL) \ - hwy::HWY_NAMESPACE::ScalableTag -#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3 -// Workaround for MSVC grouping __VA_ARGS__ into a single argument -#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren -// Trailing comma avoids -pedantic false alarm -#define HWY_CHOOSE_FULL(...) \ - HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, )) -#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__) - -// Vector of up to MAX_N lanes. It's better to use full vectors where possible. -#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag - -//------------------------------------------------------------------------------ -// Export user functions for static/dynamic dispatch - -// Evaluates to 0 inside a translation unit if it is generating anything but the -// static target (the last one if multiple targets are enabled). Used to prevent -// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only -// compile once anyway, so this is 1 unless it is or has been included. -#ifndef HWY_ONCE -#define HWY_ONCE 1 -#endif - -// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for -// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is -// defined), and can be used to deduce the return type of Choose*. 
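// For example (a sketch; MyFunction stands for a user function defined inside
// HWY_NAMESPACE), if the static target is AVX2, HWY_STATIC_DISPATCH(MyFunction)
// expands to N_AVX2::MyFunction, so the baseline version can be called directly
// without a dispatch table:
//
//   auto result = HWY_STATIC_DISPATCH(MyFunction)(arg1, arg2);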
-#if HWY_STATIC_TARGET == HWY_SCALAR -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_EMU128 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_RVV -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_WASM_EMU256 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_WASM -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_NEON -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_SVE -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_SVE2 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_SVE_256 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_SVE2_128 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_PPC8 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_PPC9 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_PPC10 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_SSE2 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_SSSE3 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_SSE4 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_AVX2 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_AVX3 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_AVX3_DL -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4 -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME -#elif HWY_STATIC_TARGET == HWY_AVX3_SPR -#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME -#endif - -// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or -// nullptr is that target was not compiled. -#if HWY_TARGETS & HWY_EMU128 -#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME -#elif HWY_TARGETS & HWY_SCALAR -#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME -#else -// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at -// runtime, fall back to the baseline with HWY_STATIC_DISPATCH(). 
-#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME) -#endif - -#if HWY_TARGETS & HWY_WASM_EMU256 -#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME -#else -#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_WASM -#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME -#else -#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_RVV -#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME -#else -#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_NEON_WITHOUT_AES -#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME -#else -#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_NEON -#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME -#else -#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_SVE -#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME -#else -#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_SVE2 -#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME -#else -#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_SVE_256 -#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME -#else -#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_SVE2_128 -#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME -#else -#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_PPC8 -#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME -#else -#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_PPC9 -#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME -#else -#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_PPC10 -#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME -#else -#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_SSE2 -#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME -#else -#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_SSSE3 -#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME -#else -#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_SSE4 -#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME -#else -#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_AVX2 -#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME -#else -#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_AVX3 -#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME -#else -#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_AVX3_DL -#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME -#else -#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_AVX3_ZEN4 -#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME -#else -#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr -#endif - -#if HWY_TARGETS & HWY_AVX3_SPR -#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME -#else -#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr -#endif - -// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall -// apparently cannot be an array. Use a function pointer instead, which has the -// disadvantage that we call the static (not best) target on the first call to -// any HWY_DYNAMIC_DISPATCH. 
-#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915 -#define HWY_DISPATCH_WORKAROUND 1 -#else -#define HWY_DISPATCH_WORKAROUND 0 -#endif - -// Provides a static member function which is what is called during the first -// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of -// this function are the first entry in the tables created by HWY_EXPORT. -template -struct FunctionCache { - public: - typedef RetType(FunctionType)(Args...); - -#if HWY_DISPATCH_WORKAROUND - template - static RetType ChooseAndCall(Args... args) { - ChosenTarget& chosen_target = GetChosenTarget(); - chosen_target.Update(SupportedTargets()); - return (*func)(args...); - } -#else - // A template function that when instantiated has the same signature as the - // function being called. This function initializes the bit array of targets - // supported by the current CPU and then calls the appropriate entry within - // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any - // exported functions, even those defined by different translation units, - // will dispatch directly to the best available target. - template - static RetType ChooseAndCall(Args... args) { - ChosenTarget& chosen_target = GetChosenTarget(); - chosen_target.Update(SupportedTargets()); - return (table[chosen_target.GetIndex()])(args...); - } -#endif // HWY_DISPATCH_WORKAROUND -}; - -// Used to deduce the template parameters RetType and Args from a function. -template -FunctionCache DeduceFunctionCache(RetType (*)(Args...)) { - return FunctionCache(); -} - -#define HWY_DISPATCH_TABLE(FUNC_NAME) \ - HWY_CONCAT(FUNC_NAME, HighwayDispatchTable) - -// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by -// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This -// static array must be defined at the same namespace level as the function -// it is exporting. -// After being exported, it can be called from other parts of the same source -// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper -// like in the following example: -// -// #include "hwy/highway.h" -// HWY_BEFORE_NAMESPACE(); -// namespace skeleton { -// namespace HWY_NAMESPACE { -// -// void MyFunction(int a, char b, const char* c) { ... } -// -// // NOLINTNEXTLINE(google-readability-namespace-comments) -// } // namespace HWY_NAMESPACE -// } // namespace skeleton -// HWY_AFTER_NAMESPACE(); -// -// namespace skeleton { -// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope. -// -// void MyFunction(int a, char b, const char* c) { -// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c); -// } -// } // namespace skeleton -// - -#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0) - -// Simplified version for IDE or the dynamic dispatch case with only one target. -// This case still uses a table, although of a single element, to provide the -// same compile error conditions as with the dynamic dispatch case when multiple -// targets are being compiled. -#define HWY_EXPORT(FUNC_NAME) \ - HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \ - HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)} -#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME) -#define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME) - -#else - -// Simplified version for MSVC 2017: function pointer instead of table. 
-#if HWY_DISPATCH_WORKAROUND - -#define HWY_EXPORT(FUNC_NAME) \ - static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \ - FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \ - /* The first entry in the table initializes the global cache and \ - * calls the function from HWY_STATIC_TARGET. */ \ - &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \ - FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \ - HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \ - HWY_CHOOSE_FALLBACK(FUNC_NAME), \ - } - -#else - -// Dynamic dispatch case with one entry per dynamic target plus the fallback -// target and the initialization wrapper. -#define HWY_EXPORT(FUNC_NAME) \ - static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \ - FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \ - /* The first entry in the table initializes the global cache and \ - * calls the appropriate function. */ \ - &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \ - FUNC_NAME)))::ChooseAndCall, \ - HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \ - HWY_CHOOSE_FALLBACK(FUNC_NAME), \ - } - -#endif // HWY_DISPATCH_WORKAROUND - -#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \ - (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])) -#define HWY_DYNAMIC_POINTER(FUNC_NAME) \ - (HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]) - -#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0) - -// DEPRECATED names; please use HWY_HAVE_* instead. -#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64 -#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16 -#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64 - -} // namespace hwy - -#endif // HWY_HIGHWAY_INCLUDED - -//------------------------------------------------------------------------------ - -// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want -// to include them once per target, which is ensured by the toggle check. -// Because ops/*.h are included under it, they do not need their own guard. -#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE) -#ifdef HWY_HIGHWAY_PER_TARGET -#undef HWY_HIGHWAY_PER_TARGET -#else -#define HWY_HIGHWAY_PER_TARGET -#endif - -// These define ops inside namespace hwy::HWY_NAMESPACE. 
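// To illustrate what code written against these per-target ops looks like
// (a sketch; AddArrays is a user-defined example and the loop assumes n is a
// multiple of Lanes(d)):
//
//   HWY_BEFORE_NAMESPACE();
//   namespace hwy { namespace HWY_NAMESPACE {
//   void AddArrays(const float* a, const float* b, float* out, size_t n) {
//     const ScalableTag<float> d;
//     for (size_t i = 0; i < n; i += Lanes(d)) {
//       StoreU(Add(LoadU(d, a + i), LoadU(d, b + i)), d, out + i);
//     }
//   }
//   }  // namespace HWY_NAMESPACE
//   }  // namespace hwy
//   HWY_AFTER_NAMESPACE();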
-#if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 -#include "hwy/ops/x86_128-inl.h" -#elif HWY_TARGET == HWY_AVX2 -#include "hwy/ops/x86_256-inl.h" -#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ - HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR -#include "hwy/ops/x86_512-inl.h" -#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \ - HWY_TARGET == HWY_PPC10 -#include "hwy/ops/ppc_vsx-inl.h" -#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES -#include "hwy/ops/arm_neon-inl.h" -#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \ - HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 -#include "hwy/ops/arm_sve-inl.h" -#elif HWY_TARGET == HWY_WASM_EMU256 -#include "hwy/ops/wasm_256-inl.h" -#elif HWY_TARGET == HWY_WASM -#include "hwy/ops/wasm_128-inl.h" -#elif HWY_TARGET == HWY_RVV -#include "hwy/ops/rvv-inl.h" -#elif HWY_TARGET == HWY_EMU128 -#include "hwy/ops/emu128-inl.h" -#elif HWY_TARGET == HWY_SCALAR -#include "hwy/ops/scalar-inl.h" -#else -#pragma message("HWY_TARGET does not match any known target") -#endif // HWY_TARGET - -#include "hwy/ops/generic_ops-inl.h" - -#endif // HWY_HIGHWAY_PER_TARGET diff --git a/deps/highway/include/hwy/highway_export.h b/deps/highway/include/hwy/highway_export.h deleted file mode 100644 index 30edc17d..00000000 --- a/deps/highway/include/hwy/highway_export.h +++ /dev/null @@ -1,74 +0,0 @@ -// Pseudo-generated file to handle both cmake & bazel build system. - -// Initial generation done using cmake code: -// include(GenerateExportHeader) -// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME -// hwy/highway_export.h) -// code reformatted using clang-format --style=Google - -#ifndef HWY_DLLEXPORT_H -#define HWY_DLLEXPORT_H - -#if !defined(HWY_SHARED_DEFINE) -#define HWY_DLLEXPORT -#define HWY_CONTRIB_DLLEXPORT -#define HWY_TEST_DLLEXPORT -#else // !HWY_SHARED_DEFINE - -#ifndef HWY_DLLEXPORT -#if defined(hwy_EXPORTS) -/* We are building this library */ -#ifdef _WIN32 -#define HWY_DLLEXPORT __declspec(dllexport) -#else -#define HWY_DLLEXPORT __attribute__((visibility("default"))) -#endif -#else // defined(hwy_EXPORTS) -/* We are using this library */ -#ifdef _WIN32 -#define HWY_DLLEXPORT __declspec(dllimport) -#else -#define HWY_DLLEXPORT __attribute__((visibility("default"))) -#endif -#endif // defined(hwy_EXPORTS) -#endif // HWY_DLLEXPORT - -#ifndef HWY_CONTRIB_DLLEXPORT -#if defined(hwy_contrib_EXPORTS) -/* We are building this library */ -#ifdef _WIN32 -#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport) -#else -#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default"))) -#endif -#else // defined(hwy_contrib_EXPORTS) -/* We are using this library */ -#ifdef _WIN32 -#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport) -#else -#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default"))) -#endif -#endif // defined(hwy_contrib_EXPORTS) -#endif // HWY_CONTRIB_DLLEXPORT - -#ifndef HWY_TEST_DLLEXPORT -#if defined(hwy_test_EXPORTS) -/* We are building this library */ -#ifdef _WIN32 -#define HWY_TEST_DLLEXPORT __declspec(dllexport) -#else -#define HWY_TEST_DLLEXPORT __attribute__((visibility("default"))) -#endif -#else // defined(hwy_test_EXPORTS) -/* We are using this library */ -#ifdef _WIN32 -#define HWY_TEST_DLLEXPORT __declspec(dllimport) -#else -#define HWY_TEST_DLLEXPORT __attribute__((visibility("default"))) -#endif -#endif // defined(hwy_test_EXPORTS) -#endif // HWY_TEST_DLLEXPORT - -#endif // !HWY_SHARED_DEFINE - -#endif /* 
HWY_DLLEXPORT_H */ diff --git a/deps/highway/include/hwy/nanobenchmark.h b/deps/highway/include/hwy/nanobenchmark.h deleted file mode 100644 index 46bfc4b0..00000000 --- a/deps/highway/include/hwy/nanobenchmark.h +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_ -#define HIGHWAY_HWY_NANOBENCHMARK_H_ - -// Benchmarks functions of a single integer argument with realistic branch -// prediction hit rates. Uses a robust estimator to summarize the measurements. -// The precision is about 0.2%. -// -// Examples: see nanobenchmark_test.cc. -// -// Background: Microbenchmarks such as http://github.com/google/benchmark -// can measure elapsed times on the order of a microsecond. Shorter functions -// are typically measured by repeating them thousands of times and dividing -// the total elapsed time by this count. Unfortunately, repetition (especially -// with the same input parameter!) influences the runtime. In time-critical -// code, it is reasonable to expect warm instruction/data caches and TLBs, -// but a perfect record of which branches will be taken is unrealistic. -// Unless the application also repeatedly invokes the measured function with -// the same parameter, the benchmark is measuring something very different - -// a best-case result, almost as if the parameter were made a compile-time -// constant. This may lead to erroneous conclusions about branch-heavy -// algorithms outperforming branch-free alternatives. -// -// Our approach differs in three ways. Adding fences to the timer functions -// reduces variability due to instruction reordering, improving the timer -// resolution to about 40 CPU cycles. However, shorter functions must still -// be invoked repeatedly. For more realistic branch prediction performance, -// we vary the input parameter according to a user-specified distribution. -// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the -// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the -// central tendency of the measurement samples with the "half sample mode", -// which is more robust to outliers and skewed data than the mean or median. - -#include -#include - -#include "hwy/highway_export.h" -#include "hwy/timer.h" - -// Enables sanity checks that verify correct operation at the cost of -// longer benchmark runs. -#ifndef NANOBENCHMARK_ENABLE_CHECKS -#define NANOBENCHMARK_ENABLE_CHECKS 0 -#endif - -#define NANOBENCHMARK_CHECK_ALWAYS(condition) \ - while (!(condition)) { \ - fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \ - abort(); \ - } - -#if NANOBENCHMARK_ENABLE_CHECKS -#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition) -#else -#define NANOBENCHMARK_CHECK(condition) -#endif - -namespace hwy { - -// Returns 1, but without the compiler knowing what the value is. This prevents -// optimizing out code. 
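// A usage sketch: multiplying a constant by Unpredictable1() yields the same
// value at runtime, but prevents the compiler from treating it as a
// compile-time constant (kSize is an illustrative name):
//
//   const size_t kSize = static_cast<size_t>(Unpredictable1()) * 256;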
-HWY_DLLEXPORT int Unpredictable1(); - -// Input influencing the function being measured (e.g. number of bytes to copy). -using FuncInput = size_t; - -// "Proof of work" returned by Func to ensure the compiler does not elide it. -using FuncOutput = uint64_t; - -// Function to measure: either 1) a captureless lambda or function with two -// arguments or 2) a lambda with capture, in which case the first argument -// is reserved for use by MeasureClosure. -using Func = FuncOutput (*)(const void*, FuncInput); - -// Internal parameters that determine precision/resolution/measuring time. -struct Params { - // Best-case precision, expressed as a divisor of the timer resolution. - // Larger => more calls to Func and higher precision. - size_t precision_divisor = 1024; - - // Ratio between full and subset input distribution sizes. Cannot be less - // than 2; larger values increase measurement time but more faithfully - // model the given input distribution. - size_t subset_ratio = 2; - - // Together with the estimated Func duration, determines how many times to - // call Func before checking the sample variability. Larger values increase - // measurement time, memory/cache use and precision. - double seconds_per_eval = 4E-3; - - // The minimum number of samples before estimating the central tendency. - size_t min_samples_per_eval = 7; - - // The mode is better than median for estimating the central tendency of - // skewed/fat-tailed distributions, but it requires sufficient samples - // relative to the width of half-ranges. - size_t min_mode_samples = 64; - - // Maximum permissible variability (= median absolute deviation / center). - double target_rel_mad = 0.002; - - // Abort after this many evals without reaching target_rel_mad. This - // prevents infinite loops. - size_t max_evals = 9; - - // Whether to print additional statistics to stdout. - bool verbose = true; -}; - -// Measurement result for each unique input. -struct Result { - FuncInput input; - - // Robust estimate (mode or median) of duration. - float ticks; - - // Measure of variability (median absolute deviation relative to "ticks"). - float variability; -}; - -// Precisely measures the number of ticks elapsed when calling "func" with the -// given inputs, shuffled to ensure realistic branch prediction hit rates. -// -// "func" returns a 'proof of work' to ensure its computations are not elided. -// "arg" is passed to Func, or reserved for internal use by MeasureClosure. -// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to -// "func". The values should be chosen to maximize coverage of "func". This -// represents a distribution, so a value's frequency should reflect its -// probability in the real application. Order does not matter; for example, a -// uniform distribution over [0, 4) could be represented as {3,0,2,1}. -// Returns how many Result were written to "results": one per unique input, or -// zero if the measurement failed (an error message goes to stderr). -HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg, - const FuncInput* inputs, size_t num_inputs, - Result* results, const Params& p = Params()); - -// Calls operator() of the given closure (lambda function). -template -static FuncOutput CallClosure(const Closure* f, const FuncInput input) { - return (*f)(input); -} - -// Same as Measure, except "closure" is typically a lambda function of -// FuncInput -> FuncOutput with a capture list. 
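// A usage sketch (DoWork and the input values are illustrative; see
// nanobenchmark_test.cc for complete examples):
//
//   const FuncInput inputs[] = {1, 2, 3, 4};
//   Result results[4];
//   const size_t num_results = MeasureClosure(
//       [&](FuncInput size) -> FuncOutput { return DoWork(size); },
//       inputs, 4, results);
//   // num_results is one per unique input, or zero if the measurement failed.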
-template -static inline size_t MeasureClosure(const Closure& closure, - const FuncInput* inputs, - const size_t num_inputs, Result* results, - const Params& p = Params()) { - return Measure(reinterpret_cast(&CallClosure), - reinterpret_cast(&closure), inputs, num_inputs, - results, p); -} - -} // namespace hwy - -#endif // HIGHWAY_HWY_NANOBENCHMARK_H_ diff --git a/deps/highway/include/hwy/ops/arm_neon-inl.h b/deps/highway/include/hwy/ops/arm_neon-inl.h deleted file mode 100644 index 4dbdf64c..00000000 --- a/deps/highway/include/hwy/ops/arm_neon-inl.h +++ /dev/null @@ -1,8625 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// 128-bit Arm NEON vectors and operations. -// External include guard in highway.h - see comment there. - -// Arm NEON intrinsics are documented at: -// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon] - -#include "hwy/ops/shared-inl.h" - -HWY_BEFORE_NAMESPACE(); - -// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with -// the same target attribute as our code, see #834. -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") -#include // NOLINT(build/include_order) -HWY_DIAGNOSTICS(pop) - -// Must come after arm_neon.h. -namespace hwy { -namespace HWY_NAMESPACE { - -namespace detail { // for code folding and Raw128 - -// Macros used to define single and double function calls for multiple types -// for full and half vectors. These macros are undefined at the end of the file. - -// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function. -#define HWY_NEON_BUILD_TPL_1 -#define HWY_NEON_BUILD_TPL_2 -#define HWY_NEON_BUILD_TPL_3 - -// HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can -// extend it to int32x4x2_t packs. -#define HWY_NEON_BUILD_RET_1(type, size) Vec128 -#define HWY_NEON_BUILD_RET_2(type, size) Vec128 -#define HWY_NEON_BUILD_RET_3(type, size) Vec128 - -// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives. -#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128 a -#define HWY_NEON_BUILD_PARAM_2(type, size) \ - const Vec128 a, const Vec128 b -#define HWY_NEON_BUILD_PARAM_3(type, size) \ - const Vec128 a, const Vec128 b, \ - const Vec128 c - -// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying -// function. -#define HWY_NEON_BUILD_ARG_1 a.raw -#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw -#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw - -// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after -// the __VA_ARGS__ have been expanded. This allows "func" to be a macro on -// itself like with some of the library "functions" such as vshlq_u8. For -// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as -// "a, b" (without the quotes) will end up expanding "vshlq_u8(a, b)" if needed. 
-// Directly writing vshlq_u8(MY_PARAMS) would fail since vshlq_u8() macro -// expects two arguments. -#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__) - -// Main macro definition that defines a single function for the given type and -// size of vector, using the underlying (prefix##infix##suffix) function and -// the template, return type, parameters and arguments defined by the "args" -// parameters passed here (see HWY_NEON_BUILD_* macros defined before). -#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ - HWY_CONCAT(HWY_NEON_BUILD_TPL_, args) \ - HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size) \ - name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) { \ - return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)( \ - HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args)); \ - } - -// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function -// called "name" using the set of neon functions starting with the given -// "prefix" for all the variants of certain types, as specified next to each -// macro. For example, the prefix "vsub" can be used to define the operator- -// using args=2. - -// uint8_t -#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \ - HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args) \ - HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args) \ - HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args) \ - HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args) - -// int8_t -#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \ - HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args) \ - HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args) \ - HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args) \ - HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args) - -// uint16_t -#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \ - HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args) \ - HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args) \ - HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args) - -// int16_t -#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \ - HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args) \ - HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args) \ - HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args) - -// uint32_t -#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \ - HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args) \ - HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args) - -// int32_t -#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \ - HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args) \ - HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args) - -// uint64_t -#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \ - HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) - -// int64_t -#define 
HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \ - HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) - -#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC -#define HWY_NEON_HAVE_BFLOAT16 1 -#else -#define HWY_NEON_HAVE_BFLOAT16 0 -#endif - -// bfloat16_t -#if HWY_NEON_HAVE_BFLOAT16 -#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(bfloat16, 8, name, prefix##q, infix, bf16, args) \ - HWY_NEON_DEF_FUNCTION(bfloat16, 4, name, prefix, infix, bf16, args) \ - HWY_NEON_DEF_FUNCTION(bfloat16, 2, name, prefix, infix, bf16, args) \ - HWY_NEON_DEF_FUNCTION(bfloat16, 1, name, prefix, infix, bf16, args) -#else -#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) -#endif - -// Used for conversion instructions if HWY_NEON_HAVE_FLOAT16C. -#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \ - args) \ - HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args) \ - HWY_NEON_DEF_FUNCTION(float16, 4, name, prefix, infix, f16, args) \ - HWY_NEON_DEF_FUNCTION(float16, 2, name, prefix, infix, f16, args) \ - HWY_NEON_DEF_FUNCTION(float16, 1, name, prefix, infix, f16, args) - -// float16_t -#if HWY_HAVE_FLOAT16 -#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, args) -#else -#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) -#endif - -// float -#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \ - HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args) \ - HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args) - -// double -#if HWY_HAVE_FLOAT64 -#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \ - HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args) -#else -#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) -#endif - -// Helper macros to define for more than one type. 
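Before the combined helpers below, a hand-expanded example may help: for the full 16-lane u8 case, HWY_NEON_DEF_FUNCTION_UINT_8(operator+, vadd, _, 2) reduces to roughly the following (approximate; the real code is produced by the HWY_NEON_BUILD_*_2 macros above).

// Approximate expansion of HWY_NEON_DEF_FUNCTION(uint8, 16, operator+, vaddq, _, u8, 2):
// the return and parameter types come from HWY_NEON_BUILD_RET_2/PARAM_2, the
// arguments from HWY_NEON_BUILD_ARG_2, and prefix##infix##suffix forms vaddq_u8.
HWY_API Vec128<uint8_t, 16> operator+(const Vec128<uint8_t, 16> a,
                                      const Vec128<uint8_t, 16> b) {
  return Vec128<uint8_t, 16>(vaddq_u8(a.raw, b.raw));
}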
-// uint8_t, uint16_t and uint32_t -#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args) - -// int8_t, int16_t and int32_t -#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args) - -// uint8_t, uint16_t, uint32_t and uint64_t -#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) - -// int8_t, int16_t, int32_t and int64_t -#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) - -// All int*_t and uint*_t up to 64 -#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args) - -#define HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args) - -#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) - -// All previous types. -#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) - -#define HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) - -#define HWY_NEON_DEF_FUNCTION_UIF_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) - -#define HWY_NEON_DEF_FUNCTION_UIF_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args) - -// For vzip1/2 -#define HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \ - HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) -#define HWY_NEON_DEF_FUNCTION_FULL_UIF_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) - -// For eor3q, which is only defined for full vectors. 
-#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \ - HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \ - HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \ - HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \ - HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \ - HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \ - HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args) -// Emulation of some intrinsics on armv7. -#if HWY_ARCH_ARM_V7 -#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] -#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] -#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] -#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0] -#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0] -#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0] -#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0] -#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0] -#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0] -#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0] -#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0] -#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0] -#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0] -#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0] -#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1] -#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1] -#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1] -#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1] -#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1] -#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1] -#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1] -#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1] -#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1] -#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1] -#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1] -#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1] -#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1] -#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1] -#define vzip1_s8(x, y) vzip_s8(x, y).val[0] -#define vzip1_u8(x, y) vzip_u8(x, y).val[0] -#define vzip1_s16(x, y) vzip_s16(x, y).val[0] -#define vzip1_u16(x, y) vzip_u16(x, y).val[0] -#define vzip1_f32(x, y) vzip_f32(x, y).val[0] -#define vzip1_u32(x, y) vzip_u32(x, y).val[0] -#define vzip1_s32(x, y) vzip_s32(x, y).val[0] -#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0] -#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0] -#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0] -#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0] -#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0] -#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0] -#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0] -#define vzip2_s8(x, y) vzip_s8(x, y).val[1] -#define vzip2_u8(x, y) vzip_u8(x, y).val[1] -#define vzip2_s16(x, y) vzip_s16(x, y).val[1] -#define vzip2_u16(x, y) vzip_u16(x, y).val[1] -#define vzip2_s32(x, y) vzip_s32(x, y).val[1] -#define vzip2_u32(x, y) vzip_u32(x, y).val[1] -#define vzip2_f32(x, y) vzip_f32(x, y).val[1] -#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1] -#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1] -#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1] -#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1] -#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1] -#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1] -#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1] -#endif - -// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 -// overloads for all vector types, even those (bfloat16_t) where the -// underlying vector is the same as others (uint16_t). 
-template -struct Tuple2; -template -struct Tuple3; -template -struct Tuple4; - -template <> -struct Tuple2 { - uint8x16x2_t raw; -}; -template -struct Tuple2 { - uint8x8x2_t raw; -}; -template <> -struct Tuple2 { - int8x16x2_t raw; -}; -template -struct Tuple2 { - int8x8x2_t raw; -}; -template <> -struct Tuple2 { - uint16x8x2_t raw; -}; -template -struct Tuple2 { - uint16x4x2_t raw; -}; -template <> -struct Tuple2 { - int16x8x2_t raw; -}; -template -struct Tuple2 { - int16x4x2_t raw; -}; -template <> -struct Tuple2 { - uint32x4x2_t raw; -}; -template -struct Tuple2 { - uint32x2x2_t raw; -}; -template <> -struct Tuple2 { - int32x4x2_t raw; -}; -template -struct Tuple2 { - int32x2x2_t raw; -}; -template <> -struct Tuple2 { - uint64x2x2_t raw; -}; -template -struct Tuple2 { - uint64x1x2_t raw; -}; -template <> -struct Tuple2 { - int64x2x2_t raw; -}; -template -struct Tuple2 { - int64x1x2_t raw; -}; - -template <> -struct Tuple2 { -#if HWY_NEON_HAVE_FLOAT16C - float16x8x2_t raw; -#else - uint16x8x2_t raw; -#endif -}; -template -struct Tuple2 { -#if HWY_NEON_HAVE_FLOAT16C - float16x4x2_t raw; -#else - uint16x4x2_t raw; -#endif -}; -template <> -struct Tuple2 { -#if HWY_NEON_HAVE_BFLOAT16 - bfloat16x8x2_t raw; -#else - uint16x8x2_t raw; -#endif -}; -template -struct Tuple2 { -#if HWY_NEON_HAVE_BFLOAT16 - bfloat16x4x2_t raw; -#else - uint16x4x2_t raw; -#endif -}; - -template <> -struct Tuple2 { - float32x4x2_t raw; -}; -template -struct Tuple2 { - float32x2x2_t raw; -}; -#if HWY_HAVE_FLOAT64 -template <> -struct Tuple2 { - float64x2x2_t raw; -}; -template -struct Tuple2 { - float64x1x2_t raw; -}; -#endif // HWY_HAVE_FLOAT64 - -template <> -struct Tuple3 { - uint8x16x3_t raw; -}; -template -struct Tuple3 { - uint8x8x3_t raw; -}; -template <> -struct Tuple3 { - int8x16x3_t raw; -}; -template -struct Tuple3 { - int8x8x3_t raw; -}; -template <> -struct Tuple3 { - uint16x8x3_t raw; -}; -template -struct Tuple3 { - uint16x4x3_t raw; -}; -template <> -struct Tuple3 { - int16x8x3_t raw; -}; -template -struct Tuple3 { - int16x4x3_t raw; -}; -template <> -struct Tuple3 { - uint32x4x3_t raw; -}; -template -struct Tuple3 { - uint32x2x3_t raw; -}; -template <> -struct Tuple3 { - int32x4x3_t raw; -}; -template -struct Tuple3 { - int32x2x3_t raw; -}; -template <> -struct Tuple3 { - uint64x2x3_t raw; -}; -template -struct Tuple3 { - uint64x1x3_t raw; -}; -template <> -struct Tuple3 { - int64x2x3_t raw; -}; -template -struct Tuple3 { - int64x1x3_t raw; -}; - -template <> -struct Tuple3 { -#if HWY_NEON_HAVE_FLOAT16C - float16x8x3_t raw; -#else - uint16x8x3_t raw; -#endif -}; -template -struct Tuple3 { -#if HWY_NEON_HAVE_FLOAT16C - float16x4x3_t raw; -#else - uint16x4x3_t raw; -#endif -}; -template <> -struct Tuple3 { -#if HWY_NEON_HAVE_BFLOAT16 - bfloat16x8x3_t raw; -#else - uint16x8x3_t raw; -#endif -}; -template -struct Tuple3 { -#if HWY_NEON_HAVE_BFLOAT16 - bfloat16x4x3_t raw; -#else - uint16x4x3_t raw; -#endif -}; - -template <> -struct Tuple3 { - float32x4x3_t raw; -}; -template -struct Tuple3 { - float32x2x3_t raw; -}; -#if HWY_HAVE_FLOAT64 -template <> -struct Tuple3 { - float64x2x3_t raw; -}; -template -struct Tuple3 { - float64x1x3_t raw; -}; -#endif // HWY_HAVE_FLOAT64 - -template <> -struct Tuple4 { - uint8x16x4_t raw; -}; -template -struct Tuple4 { - uint8x8x4_t raw; -}; -template <> -struct Tuple4 { - int8x16x4_t raw; -}; -template -struct Tuple4 { - int8x8x4_t raw; -}; -template <> -struct Tuple4 { - uint16x8x4_t raw; -}; -template -struct Tuple4 { - uint16x4x4_t raw; -}; -template <> -struct Tuple4 
{ - int16x8x4_t raw; -}; -template -struct Tuple4 { - int16x4x4_t raw; -}; -template <> -struct Tuple4 { - uint32x4x4_t raw; -}; -template -struct Tuple4 { - uint32x2x4_t raw; -}; -template <> -struct Tuple4 { - int32x4x4_t raw; -}; -template -struct Tuple4 { - int32x2x4_t raw; -}; -template <> -struct Tuple4 { - uint64x2x4_t raw; -}; -template -struct Tuple4 { - uint64x1x4_t raw; -}; -template <> -struct Tuple4 { - int64x2x4_t raw; -}; -template -struct Tuple4 { - int64x1x4_t raw; -}; - -template <> -struct Tuple4 { -#if HWY_NEON_HAVE_FLOAT16C - float16x8x4_t raw; -#else - uint16x8x4_t raw; -#endif -}; -template -struct Tuple4 { -#if HWY_NEON_HAVE_FLOAT16C - float16x4x4_t raw; -#else - uint16x4x4_t raw; -#endif -}; -template <> -struct Tuple4 { -#if HWY_NEON_HAVE_BFLOAT16 - bfloat16x8x4_t raw; -#else - uint16x8x4_t raw; -#endif -}; -template -struct Tuple4 { -#if HWY_NEON_HAVE_BFLOAT16 - bfloat16x4x4_t raw; -#else - uint16x4x4_t raw; -#endif -}; - -template <> -struct Tuple4 { - float32x4x4_t raw; -}; -template -struct Tuple4 { - float32x2x4_t raw; -}; -#if HWY_HAVE_FLOAT64 -template <> -struct Tuple4 { - float64x2x4_t raw; -}; -template -struct Tuple4 { - float64x1x4_t raw; -}; -#endif // HWY_HAVE_FLOAT64 - -template -struct Raw128; - -// 128 -template <> -struct Raw128 { - using type = uint8x16_t; -}; - -template <> -struct Raw128 { - using type = uint16x8_t; -}; - -template <> -struct Raw128 { - using type = uint32x4_t; -}; - -template <> -struct Raw128 { - using type = uint64x2_t; -}; - -template <> -struct Raw128 { - using type = int8x16_t; -}; - -template <> -struct Raw128 { - using type = int16x8_t; -}; - -template <> -struct Raw128 { - using type = int32x4_t; -}; - -template <> -struct Raw128 { - using type = int64x2_t; -}; - -template <> -struct Raw128 { -#if HWY_NEON_HAVE_FLOAT16C - using type = float16x8_t; -#else - using type = uint16x8_t; -#endif -}; - -template <> -struct Raw128 { -#if HWY_NEON_HAVE_BFLOAT16 - using type = bfloat16x8_t; -#else - using type = uint16x8_t; -#endif -}; - -template <> -struct Raw128 { - using type = float32x4_t; -}; - -#if HWY_HAVE_FLOAT64 -template <> -struct Raw128 { - using type = float64x2_t; -}; -#endif // HWY_HAVE_FLOAT64 - -// 64 -template <> -struct Raw128 { - using type = uint8x8_t; -}; - -template <> -struct Raw128 { - using type = uint16x4_t; -}; - -template <> -struct Raw128 { - using type = uint32x2_t; -}; - -template <> -struct Raw128 { - using type = uint64x1_t; -}; - -template <> -struct Raw128 { - using type = int8x8_t; -}; - -template <> -struct Raw128 { - using type = int16x4_t; -}; - -template <> -struct Raw128 { - using type = int32x2_t; -}; - -template <> -struct Raw128 { - using type = int64x1_t; -}; - -template <> -struct Raw128 { -#if HWY_NEON_HAVE_FLOAT16C - using type = float16x4_t; -#else - using type = uint16x4_t; -#endif -}; - -template <> -struct Raw128 { -#if HWY_NEON_HAVE_BFLOAT16 - using type = bfloat16x4_t; -#else - using type = uint16x4_t; -#endif -}; - -template <> -struct Raw128 { - using type = float32x2_t; -}; - -#if HWY_HAVE_FLOAT64 -template <> -struct Raw128 { - using type = float64x1_t; -}; -#endif // HWY_HAVE_FLOAT64 - -// 32 (same as 64) -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct 
Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -// 16 (same as 64) -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -// 8 (same as 64) -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -} // namespace detail - -template -class Vec128 { - public: - using Raw = typename detail::Raw128::type; - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = N; // only for DFromV - - HWY_INLINE Vec128() {} - Vec128(const Vec128&) = default; - Vec128& operator=(const Vec128&) = default; - HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {} - - // Compound assignment. Only usable if there is a corresponding non-member - // binary operator overload. For example, only f32 and f64 support division. - HWY_INLINE Vec128& operator*=(const Vec128 other) { - return *this = (*this * other); - } - HWY_INLINE Vec128& operator/=(const Vec128 other) { - return *this = (*this / other); - } - HWY_INLINE Vec128& operator+=(const Vec128 other) { - return *this = (*this + other); - } - HWY_INLINE Vec128& operator-=(const Vec128 other) { - return *this = (*this - other); - } - HWY_INLINE Vec128& operator&=(const Vec128 other) { - return *this = (*this & other); - } - HWY_INLINE Vec128& operator|=(const Vec128 other) { - return *this = (*this | other); - } - HWY_INLINE Vec128& operator^=(const Vec128 other) { - return *this = (*this ^ other); - } - - Raw raw; -}; - -template -using Vec64 = Vec128; - -template -using Vec32 = Vec128; - -template -using Vec16 = Vec128; - -// FF..FF or 0. -template -class Mask128 { - // Arm C Language Extensions return and expect unsigned type. - using Raw = typename detail::Raw128, N>::type; - - public: - using PrivateT = T; // only for DFromM - static constexpr size_t kPrivateN = N; // only for DFromM - - HWY_INLINE Mask128() {} - Mask128(const Mask128&) = default; - Mask128& operator=(const Mask128&) = default; - HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {} - - Raw raw; -}; - -template -using Mask64 = Mask128; - -template -using DFromV = Simd; - -template -using DFromM = Simd; - -template -using TFromV = typename V::PrivateT; - -// ------------------------------ Set - -namespace detail { -// We want to route any combination of N/kPow2 to the intrinsics depending on -// whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is -// unconditional and currently does not accept inputs (such as whether the -// vector is 64 or 128-bit). Thus we are not able to use HWY_IF_V_SIZE_D for -// SFINAE. We instead define a private NativeSet which receives a Simd<> whose -// kPow2 has already been folded into its N. 
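Concretely (an approximate hand-expansion, assuming the usual Simd<T, N, kPow2> tag layout): for 2-lane u32, the NativeSet generated by the HWY_SET macros just below reduces to a single vdup, and the public Set defined afterwards forwards to it with the folded tag.

// Approximate expansion of
// HWY_NEON_DEF_FUNCTION(uint32, 2, NativeSet, vdup, _n_, u32, HWY_SET):
HWY_API Vec128<uint32_t, 2> NativeSet(Simd<uint32_t, 2, 0> /* tag */,
                                      uint32_t t) {
  return Vec128<uint32_t, 2>(vdup_n_u32(t));
}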
-#define HWY_NEON_BUILD_TPL_HWY_SET -#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128 -#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \ - Simd /* tag */, type##_t t -#define HWY_NEON_BUILD_ARG_HWY_SET t - -HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET) -HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET) -#if !HWY_HAVE_FLOAT16 -HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET) -#endif - -#undef HWY_NEON_BUILD_TPL_HWY_SET -#undef HWY_NEON_BUILD_RET_HWY_SET -#undef HWY_NEON_BUILD_PARAM_HWY_SET -#undef HWY_NEON_BUILD_ARG_HWY_SET - -#if !HWY_NEON_HAVE_BFLOAT16 -// BF16: return u16. -template -HWY_API Vec128 NativeSet(D d, bfloat16_t t) { - uint16_t tu; - CopyBytes(&t, &tu); - return Vec128(Set(RebindToUnsigned(), tu).raw); -} -#endif // !HWY_NEON_HAVE_BFLOAT16 - -} // namespace detail - -// Full vector. Cannot yet use VFromD because that is defined in terms of Set. -// Do not use a typename T = TFromD argument because T will be deduced from -// the actual argument type, which can differ from TFromD. -template -HWY_INLINE Vec128> Set(D /* tag */, T t) { - return detail::NativeSet(Full128>(), static_cast>(t)); -} - -// Partial vector: create 64-bit and return wrapper. -template -HWY_API Vec128, MaxLanes(D())> Set(D /* tag */, T t) { - const Full64> dfull; - return Vec128, MaxLanes(D())>( - detail::NativeSet(dfull, static_cast>(t)).raw); -} - -template -using VFromD = decltype(Set(D(), TFromD())); - -template -HWY_API VFromD Zero(D d) { - // Default ctor also works for bfloat16_t and float16_t. - return Set(d, TFromD{}); -} - -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") -#if HWY_COMPILER_GCC_ACTUAL -HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") -#endif - -template -HWY_API VFromD Undefined(D /*tag*/) { - VFromD v; - return v; -} - -HWY_DIAGNOSTICS(pop) - -namespace detail { - -template -HWY_INLINE VFromD Iota0(D d) { - const RebindToUnsigned du; -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(8))); - constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7}; - const VFromD vu8_iota0(reinterpret_cast(kU8Iota0)); -#else - alignas(8) static constexpr uint8_t kU8Iota0[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - const VFromD vu8_iota0( - Load(Full64>(), kU8Iota0).raw); -#endif - return BitCast(d, vu8_iota0); -} - -template -HWY_INLINE VFromD Iota0(D d) { - const RebindToUnsigned du; -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16))); - constexpr GccU8RawVectType kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - const VFromD vu8_iota0(reinterpret_cast(kU8Iota0)); -#else - alignas(16) static constexpr uint8_t kU8Iota0[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const auto vu8_iota0 = Load(du, kU8Iota0); -#endif - return BitCast(d, vu8_iota0); -} - -template -HWY_INLINE VFromD Iota0(D d) { - using T = TFromD; -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef T GccRawVectType __attribute__((__vector_size__(8))); - constexpr GccRawVectType kIota0 = {T{0}, T{1}, T{2}, static_cast(3)}; - return VFromD(reinterpret_cast::Raw>(kIota0)); -#else - alignas(8) static constexpr T kIota0[4] = {T{0}, T{1}, T{2}, - static_cast(3)}; - return Load(d, kIota0); -#endif -} - -template -HWY_INLINE VFromD Iota0(D d) { - using T = TFromD; -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef T GccRawVectType 
__attribute__((__vector_size__(16))); - constexpr GccRawVectType kIota0 = {T{0}, T{1}, T{2}, static_cast(3), - T{4}, T{5}, T{6}, static_cast(7)}; - return VFromD(reinterpret_cast::Raw>(kIota0)); -#else - alignas(16) static constexpr T kU16Iota0[8] = { - T{0}, T{1}, T{2}, static_cast(3), T{4}, T{5}, T{6}, static_cast(7)}; - return Load(d, kIota0); -#endif -} - -template -HWY_INLINE VFromD Iota0(D d) { - const RebindToUnsigned du; -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8))); - constexpr GccU32RawVectType kU32Iota0 = {0, 1}; - const VFromD vu32_iota0( - reinterpret_cast(kU32Iota0)); -#else - alignas(8) static constexpr uint32_t kU32Iota0[2] = {0, 1}; - const VFromD vu32_iota0{ - Load(Full64>(), kU32Iota0).raw}; -#endif - return BitCast(d, vu32_iota0); -} - -template -HWY_INLINE VFromD Iota0(D d) { - const RebindToUnsigned du; -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); - constexpr GccU32RawVectType kU32Iota0 = {0, 1, 2, 3}; - const VFromD vu32_iota0( - reinterpret_cast(kU32Iota0)); -#else - alignas(16) static constexpr uint32_t kU32Iota0[4] = {0, 1, 2, 3}; - const auto vu32_iota0 = Load(du, kU32Iota0); -#endif - return BitCast(d, vu32_iota0); -} - -template -HWY_INLINE VFromD Iota0(D d) { -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef float GccF32RawVectType __attribute__((__vector_size__(8))); - constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f}; - return VFromD(reinterpret_cast(kF32Iota0)); -#else - alignas(8) static constexpr float kF32Iota0[2] = {0.0f, 1.0f}; - return VFromD{ - Load(Full64>(), kF32Iota0).raw}; -#endif -} - -template -HWY_INLINE VFromD Iota0(D d) { -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef float GccF32RawVectType __attribute__((__vector_size__(16))); - constexpr GccF32RawVectType kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f}; - return VFromD(reinterpret_cast(kF32Iota0)); -#else - alignas(16) static constexpr float kF32Iota0[4] = {0.0f, 1.0f, 2.0f, 3.0f}; - return Load(d, kF32Iota0); -#endif -} - -template -HWY_INLINE VFromD Iota0(D d) { - return Zero(d); -} - -template -HWY_INLINE VFromD Iota0(D d) { - const RebindToUnsigned du; -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef uint64_t GccU64RawVectType __attribute__((__vector_size__(16))); - constexpr GccU64RawVectType kU64Iota0 = {0, 1}; - const VFromD vu64_iota0( - reinterpret_cast(kU64Iota0)); -#else - alignas(16) static constexpr uint64_t kU64Iota0[4] = {0, 1}; - const auto vu64_iota0 = Load(du, kU64Iota0); -#endif - return BitCast(d, vu64_iota0); -} - -#if HWY_HAVE_FLOAT64 -template -HWY_INLINE VFromD Iota0(D d) { -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL - typedef double GccF64RawVectType __attribute__((__vector_size__(16))); - constexpr GccF64RawVectType kF64Iota0 = {0.0, 1.0}; - return VFromD(reinterpret_cast(kF64Iota0)); -#else - alignas(16) static constexpr double kF64Iota0[4] = {0.0, 1.0}; - return Load(d, kF64Iota0); -#endif -} -#endif // HWY_HAVE_FLOAT64 - -#if HWY_COMPILER_MSVC -template -static HWY_INLINE V MaskOutIota(V v) { - constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV); - constexpr uint64_t kU64MaskOutMask = - hwy::LimitsMax>(); - - const DFromV d; - const Repartition du8; - using VU8 = VFromD; - const auto mask_out_mask = - BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask)))); - return v & mask_out_mask; -} -template -static HWY_INLINE V MaskOutIota(V v) { - return v; -} -#endif - -} // 
namespace detail - -template -HWY_API VFromD Iota(D d, const T2 first) { - const auto result_iota = - detail::Iota0(d) + Set(d, static_cast>(first)); -#if HWY_COMPILER_MSVC - return detail::MaskOutIota(result_iota); -#else - return result_iota; -#endif -} - -// ------------------------------ Tuple (VFromD) -#include "hwy/ops/tuple-inl.h" - -// ------------------------------ Combine - -// Full result -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_u8(lo.raw, hi.raw)); -} -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_u16(lo.raw, hi.raw)); -} -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_u32(lo.raw, hi.raw)); -} -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_u64(lo.raw, hi.raw)); -} - -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_s8(lo.raw, hi.raw)); -} -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_s16(lo.raw, hi.raw)); -} -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_s32(lo.raw, hi.raw)); -} -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_s64(lo.raw, hi.raw)); -} - -template -HWY_API Vec128 Combine(D d, Vec64 hi, - Vec64 lo) { -#if HWY_HAVE_FLOAT16 - (void)d; - return Vec128(vcombine_f16(lo.raw, hi.raw)); -#else - const RebindToUnsigned du; - const Half duh; - return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo))); -#endif -} - -template -HWY_API Vec128 Combine(D d, Vec64 hi, - Vec64 lo) { -#if HWY_NEON_HAVE_BFLOAT16 - (void)d; - return Vec128(vcombine_bf16(lo.raw, hi.raw)); -#else - const RebindToUnsigned du; - const Half duh; - return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo))); -#endif -} - -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { - return Vec128(vcombine_f32(lo.raw, hi.raw)); -} -#if HWY_HAVE_FLOAT64 -template -HWY_API Vec128 Combine(D /* tag */, Vec64 hi, - Vec64 lo) { - return Vec128(vcombine_f64(lo.raw, hi.raw)); -} -#endif // HWY_HAVE_FLOAT64 - -// ------------------------------ BitCast - -namespace detail { - -// Converts from Vec128 to Vec128 using the -// vreinterpret*_u8_*() set of functions. -#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 -#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \ - Vec128 -#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128 v -#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw - -// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined. 
-template -HWY_INLINE Vec128 BitCastToByte(Vec128 v) { - return v; -} - -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, - HWY_CAST_TO_U8) -HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_, - HWY_CAST_TO_U8) - -HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) -HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) -HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) -HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) - -#if !HWY_HAVE_FLOAT16 -#if HWY_NEON_HAVE_FLOAT16C -HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_, - HWY_CAST_TO_U8) -#else -template -HWY_INLINE Vec128 BitCastToByte(Vec128 v) { - return BitCastToByte(Vec128(v.raw)); -} -#endif // HWY_NEON_HAVE_FLOAT16C -#endif // !HWY_HAVE_FLOAT16 - -#if !HWY_NEON_HAVE_BFLOAT16 -template -HWY_INLINE Vec128 BitCastToByte(Vec128 v) { - return BitCastToByte(Vec128(v.raw)); -} -#endif // !HWY_NEON_HAVE_BFLOAT16 - -#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 -#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8 -#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8 -#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 - -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD v) { - return v; -} - -// 64-bit or less: - -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - VFromD> v) { - return VFromD(vreinterpret_s8_u8(v.raw)); -} -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - VFromD> v) { - return VFromD(vreinterpret_u16_u8(v.raw)); -} -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - VFromD> v) { - return VFromD(vreinterpret_s16_u8(v.raw)); -} -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - VFromD> v) { - return VFromD(vreinterpret_u32_u8(v.raw)); -} -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - VFromD> v) { - return VFromD(vreinterpret_s32_u8(v.raw)); -} - -template -HWY_INLINE Vec64 BitCastFromByte(D /* tag */, Vec64 v) { - return Vec64(vreinterpret_u64_u8(v.raw)); -} -template -HWY_INLINE Vec64 BitCastFromByte(D /* tag */, Vec64 v) { - return Vec64(vreinterpret_s64_u8(v.raw)); -} - -template -HWY_INLINE VFromD BitCastFromByte(D d, VFromD> v) { -#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C - (void)d; - return VFromD(vreinterpret_f16_u8(v.raw)); -#else - const RebindToUnsigned du; - return VFromD(BitCastFromByte(du, v).raw); -#endif -} - -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - VFromD> v) { - return VFromD(vreinterpret_f32_u8(v.raw)); -} - -#if HWY_HAVE_FLOAT64 -template -HWY_INLINE Vec64 BitCastFromByte(D /* tag */, Vec64 v) { - return Vec64(vreinterpret_f64_u8(v.raw)); -} -#endif // HWY_HAVE_FLOAT64 - -// 128-bit full: - -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_s8_u8(v.raw)); -} -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_u16_u8(v.raw)); -} -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_s16_u8(v.raw)); -} -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_u32_u8(v.raw)); -} -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_s32_u8(v.raw)); -} -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_u64_u8(v.raw)); -} -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return 
Vec128(vreinterpretq_s64_u8(v.raw)); -} - -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { -#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_FLOAT16C - return Vec128(vreinterpretq_f16_u8(v.raw)); -#else - return Vec128(BitCastFromByte(RebindToUnsigned(), v).raw); -#endif -} - -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_f32_u8(v.raw)); -} - -#if HWY_HAVE_FLOAT64 -template -HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { - return Vec128(vreinterpretq_f64_u8(v.raw)); -} -#endif // HWY_HAVE_FLOAT64 - -// Special case for bfloat16_t, which may have the same Raw as uint16_t. -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - VFromD> v) { - return VFromD(BitCastFromByte(RebindToUnsigned(), v).raw); -} - -} // namespace detail - -template -HWY_API VFromD BitCast(D d, - Vec128().MaxLanes()> v) { - return detail::BitCastFromByte(d, detail::BitCastToByte(v)); -} - -// ------------------------------ ResizeBitCast - -// <= 8 byte vector to <= 8 byte vector -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - const Repartition du8; - return BitCast(d, VFromD{detail::BitCastToByte(v).raw}); -} - -// 16-byte vector to 16-byte vector: same as BitCast -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - return BitCast(d, v); -} - -// 16-byte vector to <= 8-byte vector -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - const DFromV d_from; - const Half dh_from; - return ResizeBitCast(d, LowerHalf(dh_from, v)); -} - -// <= 8-bit vector to 16-byte vector -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - const Full64> d_full64_from; - const Full128> d_full128_from; - return BitCast(d, Combine(d_full128_from, Zero(d_full64_from), - ResizeBitCast(d_full64_from, v))); -} - -// ------------------------------ GetLane - -namespace detail { -#define HWY_NEON_BUILD_TPL_HWY_GET template -#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t -#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128 v -#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane - -HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET) - -#undef HWY_NEON_BUILD_TPL_HWY_GET -#undef HWY_NEON_BUILD_RET_HWY_GET -#undef HWY_NEON_BUILD_PARAM_HWY_GET -#undef HWY_NEON_BUILD_ARG_HWY_GET - -} // namespace detail - -template -HWY_API TFromV GetLane(const V v) { - return detail::GetLane<0>(v); -} - -// ------------------------------ ExtractLane - -// Requires one overload per vector length because GetLane<3> is a compile error -// if v is a uint32x2_t. 
-template -HWY_API T ExtractLane(const Vec128 v, size_t i) { - HWY_DASSERT(i == 0); - (void)i; - return detail::GetLane<0>(v); -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::GetLane<0>(v); - case 1: - return detail::GetLane<1>(v); - } - } -#endif - alignas(16) T lanes[2]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::GetLane<0>(v); - case 1: - return detail::GetLane<1>(v); - case 2: - return detail::GetLane<2>(v); - case 3: - return detail::GetLane<3>(v); - } - } -#endif - alignas(16) T lanes[4]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::GetLane<0>(v); - case 1: - return detail::GetLane<1>(v); - case 2: - return detail::GetLane<2>(v); - case 3: - return detail::GetLane<3>(v); - case 4: - return detail::GetLane<4>(v); - case 5: - return detail::GetLane<5>(v); - case 6: - return detail::GetLane<6>(v); - case 7: - return detail::GetLane<7>(v); - } - } -#endif - alignas(16) T lanes[8]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::GetLane<0>(v); - case 1: - return detail::GetLane<1>(v); - case 2: - return detail::GetLane<2>(v); - case 3: - return detail::GetLane<3>(v); - case 4: - return detail::GetLane<4>(v); - case 5: - return detail::GetLane<5>(v); - case 6: - return detail::GetLane<6>(v); - case 7: - return detail::GetLane<7>(v); - case 8: - return detail::GetLane<8>(v); - case 9: - return detail::GetLane<9>(v); - case 10: - return detail::GetLane<10>(v); - case 11: - return detail::GetLane<11>(v); - case 12: - return detail::GetLane<12>(v); - case 13: - return detail::GetLane<13>(v); - case 14: - return detail::GetLane<14>(v); - case 15: - return detail::GetLane<15>(v); - } - } -#endif - alignas(16) T lanes[16]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -// ------------------------------ InsertLane - -namespace detail { -#define HWY_NEON_BUILD_TPL_HWY_INSERT template -#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128 -#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \ - Vec128 v, type##_t t -#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane - -HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT) - -#undef HWY_NEON_BUILD_TPL_HWY_INSERT -#undef HWY_NEON_BUILD_RET_HWY_INSERT -#undef HWY_NEON_BUILD_PARAM_HWY_INSERT -#undef HWY_NEON_BUILD_ARG_HWY_INSERT - -} // namespace detail - -// Requires one overload per vector length because InsertLane<3> may be a -// compile error. 
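A brief usage sketch of the lane accessors defined above and below (assumes the usual hwy::HWY_NAMESPACE context; tag and values are illustrative). With a compile-time-constant index, the __builtin_constant_p fast path collapses the dispatch to a single GetLane<k>/InsertLane<k>.

void LaneExample() {
  const Full128<int32_t> d;                 // 4 x int32
  Vec128<int32_t, 4> v = Iota(d, 0);        // {0, 1, 2, 3}
  const int32_t third = ExtractLane(v, 2);  // 2; constant index folds to GetLane<2>
  v = InsertLane(v, 2, third + 40);         // {0, 1, 42, 3}
  (void)v;
}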
- -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { - HWY_DASSERT(i == 0); - (void)i; - return Set(DFromV(), t); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[2]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[4]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - case 4: - return detail::InsertLane<4>(v, t); - case 5: - return detail::InsertLane<5>(v, t); - case 6: - return detail::InsertLane<6>(v, t); - case 7: - return detail::InsertLane<7>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[8]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - case 4: - return detail::InsertLane<4>(v, t); - case 5: - return detail::InsertLane<5>(v, t); - case 6: - return detail::InsertLane<6>(v, t); - case 7: - return detail::InsertLane<7>(v, t); - case 8: - return detail::InsertLane<8>(v, t); - case 9: - return detail::InsertLane<9>(v, t); - case 10: - return detail::InsertLane<10>(v, t); - case 11: - return detail::InsertLane<11>(v, t); - case 12: - return detail::InsertLane<12>(v, t); - case 13: - return detail::InsertLane<13>(v, t); - case 14: - return detail::InsertLane<14>(v, t); - case 15: - return detail::InsertLane<15>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[16]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -// ================================================== ARITHMETIC - -// ------------------------------ Addition -HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2) - -// ------------------------------ Subtraction -HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2) - -// ------------------------------ SumsOf8 - -HWY_API Vec128 SumsOf8(const Vec128 v) { - return Vec128(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw)))); -} -HWY_API Vec64 SumsOf8(const Vec64 v) { - return Vec64(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw)))); -} - -// ------------------------------ SaturatedAdd - -#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB -#undef HWY_NATIVE_I32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I32_SATURATED_ADDSUB -#endif - -#ifdef 
HWY_NATIVE_U32_SATURATED_ADDSUB -#undef HWY_NATIVE_U32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U32_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB -#undef HWY_NATIVE_I64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I64_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB -#undef HWY_NATIVE_U64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U64_SATURATED_ADDSUB -#endif - -// Returns a + b clamped to the destination range. -HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedAdd, vqadd, _, 2) - -// ------------------------------ SaturatedSub - -// Returns a - b clamped to the destination range. -HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2) - -// ------------------------------ Average - -// Returns (a + b + 1) / 2 -HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2) -HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2) - -// ------------------------------ Neg - -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below - -#if !HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Neg(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask()))); -} -#endif // !HWY_HAVE_FLOAT16 - -// There is no vneg for bf16, but we can cast to f16 (emulated or native). -template -HWY_API Vec128 Neg(const Vec128 v) { - const DFromV d; - const Rebind df16; - return BitCast(d, Neg(BitCast(df16, v))); -} - -HWY_API Vec64 Neg(const Vec64 v) { -#if HWY_ARCH_ARM_A64 - return Vec64(vneg_s64(v.raw)); -#else - return Zero(DFromV()) - v; -#endif -} - -HWY_API Vec128 Neg(const Vec128 v) { -#if HWY_ARCH_ARM_A64 - return Vec128(vnegq_s64(v.raw)); -#else - return Zero(DFromV()) - v; -#endif -} - -// ------------------------------ ShiftLeft - -// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported). -#pragma push_macro("HWY_NEON_DEF_FUNCTION") -#undef HWY_NEON_DEF_FUNCTION -#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ - template \ - HWY_API Vec128 name(const Vec128 v) { \ - return kBits == 0 ? v \ - : Vec128(HWY_NEON_EVAL( \ - prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \ - } - -HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored) - -HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored) -HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored) - -#pragma pop_macro("HWY_NEON_DEF_FUNCTION") - -// ------------------------------ RotateRight (ShiftRight, Or) -template -HWY_API Vec128 RotateRight(const Vec128 v) { - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a -// mechanism for checking for extensions to Armv8. 
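A scalar model of the RotateRight composition above may be useful (assumes 0 < bits < lane width; the kBits == 0 case simply returns the input unchanged):

#include <stdint.h>

// Rotate right expressed exactly as Or(ShiftRight, ShiftLeft).
constexpr uint32_t RotateRight32(uint32_t x, unsigned bits) {
  return (x >> bits) | (x << (32u - bits));  // valid for 0 < bits < 32
}
static_assert(RotateRight32(0x00000001u, 1) == 0x80000000u,
              "the LSB wraps around to the MSB");
static_assert(RotateRight32(0x12345678u, 8) == 0x78123456u, "byte rotation");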
- -// ------------------------------ Shl - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); -} -template -HWY_API Vec128 operator<<(Vec128 v, - Vec128 bits) { - return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); -} - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); -} -template -HWY_API Vec128 operator<<(Vec128 v, - Vec128 bits) { - return Vec128(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw))); -} - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw))); -} -template -HWY_API Vec128 operator<<(Vec128 v, - Vec128 bits) { - return Vec128(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw))); -} - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw))); -} -HWY_API Vec64 operator<<(Vec64 v, Vec64 bits) { - return Vec64(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); -} - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s8(v.raw, bits.raw)); -} -template -HWY_API Vec128 operator<<(Vec128 v, - Vec128 bits) { - return Vec128(vshl_s8(v.raw, bits.raw)); -} - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s16(v.raw, bits.raw)); -} -template -HWY_API Vec128 operator<<(Vec128 v, - Vec128 bits) { - return Vec128(vshl_s16(v.raw, bits.raw)); -} - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s32(v.raw, bits.raw)); -} -template -HWY_API Vec128 operator<<(Vec128 v, - Vec128 bits) { - return Vec128(vshl_s32(v.raw, bits.raw)); -} - -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s64(v.raw, bits.raw)); -} -HWY_API Vec64 operator<<(Vec64 v, Vec64 bits) { - return Vec64(vshl_s64(v.raw, bits.raw)); -} - -// ------------------------------ Shr (Neg) - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - const RebindToSigned> di; - const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec128(vshlq_u8(v.raw, neg_bits)); -} -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { - const RebindToSigned> di; - const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec128(vshl_u8(v.raw, neg_bits)); -} - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - const RebindToSigned> di; - const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec128(vshlq_u16(v.raw, neg_bits)); -} -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { - const RebindToSigned> di; - const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec128(vshl_u16(v.raw, neg_bits)); -} - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - const RebindToSigned> di; - const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec128(vshlq_u32(v.raw, neg_bits)); -} -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { - const RebindToSigned> di; - const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec128(vshl_u32(v.raw, neg_bits)); -} - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - const RebindToSigned> di; - const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec128(vshlq_u64(v.raw, neg_bits)); -} -HWY_API Vec64 operator>>(Vec64 v, Vec64 bits) { - const RebindToSigned> di; - const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw; - return Vec64(vshl_u64(v.raw, neg_bits)); -} - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s8(v.raw, Neg(bits).raw)); -} 
-template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { - return Vec128(vshl_s8(v.raw, Neg(bits).raw)); -} - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s16(v.raw, Neg(bits).raw)); -} -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { - return Vec128(vshl_s16(v.raw, Neg(bits).raw)); -} - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s32(v.raw, Neg(bits).raw)); -} -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { - return Vec128(vshl_s32(v.raw, Neg(bits).raw)); -} - -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - return Vec128(vshlq_s64(v.raw, Neg(bits).raw)); -} -HWY_API Vec64 operator>>(Vec64 v, Vec64 bits) { - return Vec64(vshl_s64(v.raw, Neg(bits).raw)); -} - -// ------------------------------ ShiftLeftSame (Shl) - -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, int bits) { - return v << Set(DFromV(), static_cast(bits)); -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, int bits) { - return v >> Set(DFromV(), static_cast(bits)); -} - -// ------------------------------ Int/float multiplication - -// Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*. -#ifdef HWY_NATIVE_MUL_8 -#undef HWY_NATIVE_MUL_8 -#else -#define HWY_NATIVE_MUL_8 -#endif - -// All except ui64 -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator*, vmul, _, 2) -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2) - -// ------------------------------ Integer multiplication - -// Returns the upper 16 bits of a * b in each lane. -HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { - int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); -#if HWY_ARCH_ARM_A64 - int32x4_t rhi = vmull_high_s16(a.raw, b.raw); -#else - int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); -#endif - return Vec128( - vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi))); -} -HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { - uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); -#if HWY_ARCH_ARM_A64 - uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); -#else - uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); -#endif - return Vec128( - vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi))); -} - -template -HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { - int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw)); - return Vec128(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo))); -} -template -HWY_API Vec128 MulHigh(Vec128 a, - Vec128 b) { - uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw)); - return Vec128(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo))); -} - -HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { - return Vec128(vqrdmulhq_s16(a.raw, b.raw)); -} -template -HWY_API Vec128 MulFixedPoint15(Vec128 a, - Vec128 b) { - return Vec128(vqrdmulh_s16(a.raw, b.raw)); -} - -// ------------------------------ Floating-point division - -// Emulate missing intrinsic -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 -HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) { - const CappedTag d; - const Twice dt; - using VT = VFromD; - return LowerHalf(d, VT(vrecpeq_f64(Combine(dt, v, v).raw))).raw; -} -#endif - -// Approximate reciprocal -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocal, vrecpe, _, 1) - -#if HWY_HAVE_FLOAT64 -#ifdef HWY_NATIVE_F64_APPROX_RECIP -#undef HWY_NATIVE_F64_APPROX_RECIP -#else -#define HWY_NATIVE_F64_APPROX_RECIP -#endif - 
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) -#else // !HWY_HAVE_FLOAT64 -namespace detail { -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalNewtonRaphsonStep, vrecps, _, 2) -} // namespace detail - -template -HWY_API Vec128 operator/(Vec128 a, Vec128 b) { - auto x = ApproximateReciprocal(b); - x *= detail::ReciprocalNewtonRaphsonStep(x, b); - x *= detail::ReciprocalNewtonRaphsonStep(x, b); - x *= detail::ReciprocalNewtonRaphsonStep(x, b); - return a * x; -} -#endif // HWY_HAVE_FLOAT64 - -// ------------------------------ Absolute value of difference. - -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(AbsDiff, vabd, _, 2) -HWY_NEON_DEF_FUNCTION_UI_8_16_32(AbsDiff, vabd, _, 2) // no UI64 - -#ifdef HWY_NATIVE_INTEGER_ABS_DIFF -#undef HWY_NATIVE_INTEGER_ABS_DIFF -#else -#define HWY_NATIVE_INTEGER_ABS_DIFF -#endif - -// ------------------------------ Integer multiply-add - -// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd. -#ifdef HWY_NATIVE_INT_FMA -#undef HWY_NATIVE_INT_FMA -#else -#define HWY_NATIVE_INT_FMA -#endif - -// Wrappers for changing argument order to what intrinsics expect. -namespace detail { -// All except ui64 -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(MulAdd, vmla, _, 3) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(MulAdd, vmla, _, 3) -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(NegMulAdd, vmls, _, 3) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(NegMulAdd, vmls, _, 3) -} // namespace detail - -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return detail::MulAdd(add, mul, x); -} - -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return detail::NegMulAdd(add, mul, x); -} - -// 64-bit integer -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return Add(Mul(mul, x), add); -} - -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return Sub(add, Mul(mul, x)); -} - -// ------------------------------ Floating-point multiply-add variants - -namespace detail { - -#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 -// Wrappers for changing argument order to what intrinsics expect. -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3) -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3) -#else -// Emulate. Matches intrinsics arg order. 
-template -HWY_API Vec128 MulAdd(Vec128 add, Vec128 mul, - Vec128 x) { - return mul * x + add; -} - -template -HWY_API Vec128 NegMulAdd(Vec128 add, Vec128 mul, - Vec128 x) { - return add - mul * x; -} - -#endif // defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 -} // namespace detail - -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return detail::MulAdd(add, mul, x); -} - -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return detail::NegMulAdd(add, mul, x); -} - -template -HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - return MulAdd(mul, x, Neg(sub)); -} - -template -HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - return Neg(MulAdd(mul, x, sub)); -} - -// ------------------------------ Floating-point square root (IfThenZeroElse) - -// Emulate missing intrinsic -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 490 -HWY_INLINE float64x1_t vrsqrte_f64(float64x1_t raw) { - const CappedTag d; - const Twice dt; - using VT = VFromD; - const VFromD v(raw); - return LowerHalf(d, VT(vrsqrteq_f64(Combine(dt, v, v).raw))).raw; -} -#endif - -// Approximate reciprocal square root -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocalSqrt, vrsqrte, _, 1) - -#if HWY_HAVE_FLOAT64 -#ifdef HWY_NATIVE_F64_APPROX_RSQRT -#undef HWY_NATIVE_F64_APPROX_RSQRT -#else -#define HWY_NATIVE_F64_APPROX_RSQRT -#endif - -// Full precision square root -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) -#else // !HWY_HAVE_FLOAT64 -namespace detail { -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalSqrtStep, vrsqrts, _, 2) -} // namespace detail - -template -HWY_API Vec128 Sqrt(const Vec128 v) { - auto recip = ApproximateReciprocalSqrt(v); - - recip *= detail::ReciprocalSqrtStep(v * recip, recip); - recip *= detail::ReciprocalSqrtStep(v * recip, recip); - recip *= detail::ReciprocalSqrtStep(v * recip, recip); - - const auto root = v * recip; - return IfThenZeroElse(v == Zero(Simd()), root); -} -#endif // HWY_HAVE_FLOAT64 - -// ================================================== LOGICAL - -// ------------------------------ Not - -// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. -template -HWY_API Vec128 Not(const Vec128 v) { - const DFromV d; - const Repartition d8; - return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); -} -template -HWY_API Vec128 Not(const Vec128 v) { - const DFromV d; - const Repartition d8; - using V8 = decltype(Zero(d8)); - return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); -} - -// ------------------------------ And -HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2) - -// Uses the u32/64 defined above. -template -HWY_API Vec128 And(const Vec128 a, const Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, BitCast(du, a) & BitCast(du, b)); -} - -// ------------------------------ AndNot - -namespace detail { -// reversed_andnot returns a & ~b. -HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2) -} // namespace detail - -// Returns ~not_mask & mask. -template -HWY_API Vec128 AndNot(const Vec128 not_mask, - const Vec128 mask) { - return detail::reversed_andnot(mask, not_mask); -} - -// Uses the u32/64 defined above. 
-template -HWY_API Vec128 AndNot(const Vec128 not_mask, - const Vec128 mask) { - const DFromV d; - const RebindToUnsigned du; - VFromD ret = - detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask)); - return BitCast(d, ret); -} - -// ------------------------------ Or - -HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2) - -// Uses the u32/64 defined above. -template -HWY_API Vec128 Or(const Vec128 a, const Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, BitCast(du, a) | BitCast(du, b)); -} - -// ------------------------------ Xor - -HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2) - -// Uses the u32/64 defined above. -template -HWY_API Vec128 Xor(const Vec128 a, const Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, BitCast(du, a) ^ BitCast(du, b)); -} - -// ------------------------------ Xor3 -#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3) -HWY_NEON_DEF_FUNCTION_FULL_UI(Xor3, veor3, _, 3) - -// Half vectors are not natively supported. Two Xor are likely more efficient -// than Combine to 128-bit. -template -HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { - return Xor(x1, Xor(x2, x3)); -} - -template -HWY_API Vec128 Xor3(const Vec128 x1, const Vec128 x2, - const Vec128 x3) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3))); -} - -#else -template -HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { - return Xor(x1, Xor(x2, x3)); -} -#endif - -// ------------------------------ Or3 -template -HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { - return Or(o1, Or(o2, o3)); -} - -// ------------------------------ OrAnd -template -HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { - return Or(o, And(a1, a2)); -} - -// ------------------------------ IfVecThenElse -template -HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, - Vec128 no) { - return IfThenElse(MaskFromVec(mask), yes, no); -} - -// ------------------------------ BitwiseIfThenElse - -#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#else -#define HWY_NATIVE_BITWISE_IF_THEN_ELSE -#endif - -template -HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { - return IfVecThenElse(mask, yes, no); -} - -// ------------------------------ Operator overloads (internal-only if float) - -template -HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { - return And(a, b); -} - -template -HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { - return Or(a, b); -} - -template -HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { - return Xor(a, b); -} - -// ------------------------------ I64/U64 AbsDiff - -template -HWY_API Vec128 AbsDiff(const Vec128 a, - const Vec128 b) { - return Max(a, b) - Min(a, b); -} - -template -HWY_API Vec128 AbsDiff(const Vec128 a, - const Vec128 b) { - return Or(SaturatedSub(a, b), SaturatedSub(b, a)); -} - -// ------------------------------ PopulationCount - -#ifdef HWY_NATIVE_POPCNT -#undef HWY_NATIVE_POPCNT -#else -#define HWY_NATIVE_POPCNT -#endif - -namespace detail { - -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, Vec128 v) { - const Full128 d8; - return Vec128(vcntq_u8(BitCast(d8, v).raw)); -} -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, - Vec128 v) { - const Simd d8; - return Vec128(vcnt_u8(BitCast(d8, v).raw)); -} - -// NEON lacks popcount for lane sizes > 1, so take pairwise sums of the bytes. 
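As a concrete instance of the pairwise-sum approach, a standalone plain-NEON popcount for 64-bit lanes (PopCount64 is an illustrative name, not part of this header) counts bits per byte with vcnt and then folds them with widening pairwise adds, mirroring the SizeTag<8> overload below:

#include <arm_neon.h>

uint64x2_t PopCount64(uint64x2_t v) {
  const uint8x16_t bytes = vcntq_u8(vreinterpretq_u8_u64(v));  // popcount per byte
  // Sum adjacent lanes while widening: u8 -> u16 -> u32 -> u64.
  return vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes)));
}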
-template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, Vec128 v) { - const Full128 d8; - const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); - return Vec128(vpaddlq_u8(bytes)); -} -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, - Vec128 v) { - const Repartition> d8; - const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); - return Vec128(vpaddl_u8(bytes)); -} - -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, Vec128 v) { - const Full128 d8; - const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); - return Vec128(vpaddlq_u16(vpaddlq_u8(bytes))); -} -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, - Vec128 v) { - const Repartition> d8; - const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); - return Vec128(vpaddl_u16(vpaddl_u8(bytes))); -} - -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, Vec128 v) { - const Full128 d8; - const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); - return Vec128(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes)))); -} -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, - Vec128 v) { - const Repartition> d8; - const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); - return Vec128(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes)))); -} - -} // namespace detail - -template -HWY_API Vec128 PopulationCount(Vec128 v) { - return detail::PopulationCount(hwy::SizeTag(), v); -} - -// ================================================== SIGN - -// ------------------------------ Abs -// i64 is implemented after BroadcastSignBit. -HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1) -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1) - -// ------------------------------ CopySign -template -HWY_API Vec128 CopySign(Vec128 magn, Vec128 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const DFromV d; - return BitwiseIfThenElse(SignBit(d), sign, magn); -} - -// ------------------------------ CopySignToAbs -template -HWY_API Vec128 CopySignToAbs(Vec128 abs, Vec128 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const DFromV d; - return OrAnd(abs, SignBit(d), sign); -} - -// ------------------------------ BroadcastSignBit - -template -HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - return ShiftRight(v); -} - -// ================================================== MASK - -// ------------------------------ To/from vector - -// Mask and Vec have the same representation (true = FF..FF). -template -HWY_API Mask128 MaskFromVec(const Vec128 v) { - const Simd, N, 0> du; - return Mask128(BitCast(du, v).raw); -} - -template -using MFromD = decltype(MaskFromVec(VFromD())); - -template -HWY_API VFromD VecFromMask(D d, const MFromD m) { - // Raw type of masks is unsigned. 
- const RebindToUnsigned du; - return BitCast(d, VFromD(m.raw)); -} - -// ------------------------------ RebindMask (MaskFromVec) - -template -HWY_API MFromD RebindMask(DTo /* tag */, Mask128 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - return MFromD(m.raw); -} - -// ------------------------------ IfThenElse - -#define HWY_NEON_BUILD_TPL_HWY_IF -#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 -#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ - const Mask128 mask, const Vec128 yes, \ - const Vec128 no -#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw - -HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF) - -#undef HWY_NEON_BUILD_TPL_HWY_IF -#undef HWY_NEON_BUILD_RET_HWY_IF -#undef HWY_NEON_BUILD_PARAM_HWY_IF -#undef HWY_NEON_BUILD_ARG_HWY_IF - -// mask ? yes : 0 -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { - return yes & VecFromMask(DFromV(), mask); -} - -// mask ? 0 : no -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { - return AndNot(VecFromMask(DFromV(), mask), no); -} - -template -HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, - Vec128 no) { - static_assert(IsSigned(), "Only works for signed/float"); - const DFromV d; - const RebindToSigned di; - - Mask128 m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); - return IfThenElse(m, yes, no); -} - -template -HWY_API Vec128 ZeroIfNegative(Vec128 v) { - const auto zero = Zero(DFromV()); - return Max(zero, v); -} - -// ------------------------------ Mask logical - -template -HWY_API Mask128 Not(const Mask128 m) { - return MaskFromVec(Not(VecFromMask(DFromM(), m))); -} - -template -HWY_API Mask128 And(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Or(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); -} - -// ================================================== COMPARE - -// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
- -// ------------------------------ Shuffle2301 (for i64 compares) - -// Swap 32-bit halves in 64-bits -HWY_API Vec64 Shuffle2301(const Vec64 v) { - return Vec64(vrev64_u32(v.raw)); -} -HWY_API Vec64 Shuffle2301(const Vec64 v) { - return Vec64(vrev64_s32(v.raw)); -} -HWY_API Vec64 Shuffle2301(const Vec64 v) { - return Vec64(vrev64_f32(v.raw)); -} -HWY_API Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64q_u32(v.raw)); -} -HWY_API Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64q_s32(v.raw)); -} -HWY_API Vec128 Shuffle2301(const Vec128 v) { - return Vec128(vrev64q_f32(v.raw)); -} - -#define HWY_NEON_BUILD_TPL_HWY_COMPARE -#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 -#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ - const Vec128 a, const Vec128 b -#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw - -// ------------------------------ Equality -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) -#if HWY_ARCH_ARM_A64 -HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) -#else -// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. -HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) -#endif - -// ------------------------------ Strict inequality (signed, float) -#if HWY_ARCH_ARM_A64 -HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE) -#else -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) -#endif -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) - -// ------------------------------ Weak inequality (float) -#if HWY_ARCH_ARM_A64 -HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<=, vcle, _, HWY_COMPARE) -#else -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<=, vcle, _, HWY_COMPARE) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<=, vcle, _, HWY_COMPARE) -#endif -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) - -#undef HWY_NEON_BUILD_TPL_HWY_COMPARE -#undef HWY_NEON_BUILD_RET_HWY_COMPARE -#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE -#undef HWY_NEON_BUILD_ARG_HWY_COMPARE - -// ------------------------------ Armv7 i64 compare (Shuffle2301, Eq) - -#if HWY_ARCH_ARM_V7 - -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - const Simd d32; - const Simd d64; - const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); - const auto cmp64 = cmp32 & Shuffle2301(cmp32); - return MaskFromVec(BitCast(d64, cmp64)); -} - -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - const Simd d32; - const Simd d64; - const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); - const auto cmp64 = cmp32 & Shuffle2301(cmp32); - return MaskFromVec(BitCast(d64, cmp64)); -} - -HWY_API Mask128 operator<(const Vec128 a, - const Vec128 b) { - const int64x2_t sub = vqsubq_s64(a.raw, b.raw); - return MaskFromVec(BroadcastSignBit(Vec128(sub))); -} -HWY_API Mask128 operator<(const Vec64 a, - const Vec64 b) { - const int64x1_t sub = vqsub_s64(a.raw, b.raw); - return MaskFromVec(BroadcastSignBit(Vec64(sub))); -} - -template -HWY_API Mask128 operator<(const Vec128 a, - const Vec128 b) { - const DFromV du; - const RebindToSigned di; - const Vec128 msb = AndNot(a, b) | AndNot(a ^ b, a - b); - return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb)))); -} - -template -HWY_API Mask128 operator<=(const Vec128 a, - const Vec128 b) { - return Not(b < a); -} - 
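The Armv7 unsigned 64-bit Lt above has no native compare to lean on, so it reconstructs the borrow of a - b from the operands' top bits (and the Eq overloads similarly AND a 32-bit compare with its Shuffle2301-swapped copy). A scalar model of the Lt formula, for reference only:

#include <stdint.h>

// Bit 63 of (~a & b) | (~(a ^ b) & (a - b)) is the borrow out of a - b,
// i.e. it is set exactly when a < b as unsigned 64-bit integers.
static inline bool LtU64(uint64_t a, uint64_t b) {
  const uint64_t msb = (~a & b) | (~(a ^ b) & (a - b));
  return (msb >> 63) != 0;
}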
-template -HWY_API Mask128 operator<=(const Vec128 a, - const Vec128 b) { - return Not(b < a); -} - -#endif - -// ------------------------------ operator!= (operator==) - -// Customize HWY_NEON_DEF_FUNCTION to call 2 functions. -#pragma push_macro("HWY_NEON_DEF_FUNCTION") -#undef HWY_NEON_DEF_FUNCTION -// This cannot have _any_ template argument (in x86_128 we can at least have N -// as an argument), otherwise it is not more specialized than rewritten -// operator== in C++20, leading to compile errors. -#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ - HWY_API Mask128 name(Vec128 a, \ - Vec128 b) { \ - return Not(a == b); \ - } - -HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored) - -#pragma pop_macro("HWY_NEON_DEF_FUNCTION") - -// ------------------------------ Reversed comparisons - -template -HWY_API Mask128 operator>(Vec128 a, Vec128 b) { - return operator<(b, a); -} -template -HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { - return operator<=(b, a); -} - -// ------------------------------ FirstN (Iota, Lt) - -template -HWY_API MFromD FirstN(D d, size_t num) { - const RebindToSigned di; // Signed comparisons are cheaper. - using TI = TFromD; - return RebindMask(d, detail::Iota0(di) < Set(di, static_cast(num))); -} - -// ------------------------------ TestBit (Eq) - -#define HWY_NEON_BUILD_TPL_HWY_TESTBIT -#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 -#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ - Vec128 v, Vec128 bit -#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw - -#if HWY_ARCH_ARM_A64 -HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) -#else -// No 64-bit versions on armv7 -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) - -template -HWY_API Mask128 TestBit(Vec128 v, - Vec128 bit) { - return (v & bit) == bit; -} -template -HWY_API Mask128 TestBit(Vec128 v, - Vec128 bit) { - return (v & bit) == bit; -} - -#endif -#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT -#undef HWY_NEON_BUILD_RET_HWY_TESTBIT -#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT -#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT - -// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) -HWY_API Vec128 Abs(const Vec128 v) { -#if HWY_ARCH_ARM_A64 - return Vec128(vabsq_s64(v.raw)); -#else - const auto zero = Zero(DFromV()); - return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -#endif -} -HWY_API Vec64 Abs(const Vec64 v) { -#if HWY_ARCH_ARM_A64 - return Vec64(vabs_s64(v.raw)); -#else - const auto zero = Zero(DFromV()); - return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -#endif -} - -// ------------------------------ Min (IfThenElse, BroadcastSignBit) - -// Unsigned -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) - -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_ARCH_ARM_A64 - return IfThenElse(b < a, b, a); -#else - const DFromV du; - const RebindToSigned di; - return BitCast(du, BitCast(di, a) - BitCast(di, SaturatedSub(a, b))); -#endif -} - -// Signed -HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2) - -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_ARCH_ARM_A64 - return IfThenElse(b < a, b, a); -#else - const Vec128 sign = SaturatedSub(a, b); - return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); -#endif -} - -// Float: IEEE minimumNumber on v8 -#if HWY_ARCH_ARM_A64 - -HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Min, vminnm, _, 2) - -// GCC 6.5 and earlier are missing the 64-bit (non-q) 
intrinsic, so define -// in terms of the 128-bit intrinsic. -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 -namespace detail { - -template -HWY_INLINE V F64Vec64Min(V a, V b) { - const DFromV d; - const Twice dt; - return LowerHalf(d, Min(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b))); -} - -} // namespace detail -#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 - -HWY_API Vec64 Min(Vec64 a, Vec64 b) { -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 - return detail::F64Vec64Min(a, b); -#else - return Vec64(vminnm_f64(a.raw, b.raw)); -#endif -} - -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128(vminnmq_f64(a.raw, b.raw)); -} - -#else -// Armv7: NaN if any is NaN. -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) -#endif // HWY_ARCH_ARM_A64 - -// ------------------------------ Max (IfThenElse, BroadcastSignBit) - -// Unsigned (no u64) -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2) - -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_ARCH_ARM_A64 - return IfThenElse(b < a, a, b); -#else - const DFromV du; - const RebindToSigned di; - return BitCast(du, BitCast(di, b) + BitCast(di, SaturatedSub(a, b))); -#endif -} - -// Signed (no i64) -HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2) - -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_ARCH_ARM_A64 - return IfThenElse(b < a, a, b); -#else - const Vec128 sign = SaturatedSub(a, b); - return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); -#endif -} - -// Float: IEEE minimumNumber on v8 -#if HWY_ARCH_ARM_A64 - -HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Max, vmaxnm, _, 2) - -// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define -// in terms of the 128-bit intrinsic. -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 -namespace detail { - -template -HWY_INLINE V F64Vec64Max(V a, V b) { - const DFromV d; - const Twice dt; - return LowerHalf(d, Max(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b))); -} - -} // namespace detail -#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 - -HWY_API Vec64 Max(Vec64 a, Vec64 b) { -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 - return detail::F64Vec64Max(a, b); -#else - return Vec64(vmaxnm_f64(a.raw, b.raw)); -#endif -} - -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128(vmaxnmq_f64(a.raw, b.raw)); -} - -#else -// Armv7: NaN if any is NaN. 
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) -#endif // HWY_ARCH_ARM_A64 - -// ================================================== MEMORY - -// ------------------------------ Load 128 - -template -HWY_API Vec128 LoadU(D /* tag */, - const uint8_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_u8(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, - const uint16_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_u16(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, - const uint32_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_u32(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, - const uint64_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_u64(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, - const int8_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_s8(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, - const int16_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_s16(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, - const int32_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_s32(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, - const int64_t* HWY_RESTRICT unaligned) { - return Vec128(vld1q_s64(unaligned)); -} -template -HWY_API Vec128 LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) { - return Vec128(vld1q_f32(unaligned)); -} -#if HWY_HAVE_FLOAT64 -template -HWY_API Vec128 LoadU(D /* tag */, - const double* HWY_RESTRICT unaligned) { - return Vec128(vld1q_f64(unaligned)); -} -#endif // HWY_HAVE_FLOAT64 - -// ------------------------------ Load 64 - -template -HWY_API Vec64 LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) { - return Vec64(vld1_u8(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) { - return Vec64(vld1_u16(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const uint32_t* HWY_RESTRICT p) { - return Vec64(vld1_u32(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const uint64_t* HWY_RESTRICT p) { - return Vec64(vld1_u64(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) { - return Vec64(vld1_s8(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) { - return Vec64(vld1_s16(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const int32_t* HWY_RESTRICT p) { - return Vec64(vld1_s32(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) { - return Vec64(vld1_s64(p)); -} -template -HWY_API Vec64 LoadU(D /* tag */, const float* HWY_RESTRICT p) { - return Vec64(vld1_f32(p)); -} -#if HWY_HAVE_FLOAT64 -template -HWY_API Vec64 LoadU(D /* tag */, const double* HWY_RESTRICT p) { - return Vec64(vld1_f64(p)); -} -#endif // HWY_HAVE_FLOAT64 - -// ------------------------------ Load 32 - -// Actual 32-bit broadcast load - used to implement the other lane types -// because reinterpret_cast of the pointer leads to incorrect codegen on GCC. 
-template -HWY_API Vec32 LoadU(D /*tag*/, const uint32_t* HWY_RESTRICT p) { - return Vec32(vld1_dup_u32(p)); -} -template -HWY_API Vec32 LoadU(D /*tag*/, const int32_t* HWY_RESTRICT p) { - return Vec32(vld1_dup_s32(p)); -} -template -HWY_API Vec32 LoadU(D /*tag*/, const float* HWY_RESTRICT p) { - return Vec32(vld1_dup_f32(p)); -} - -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - const Repartition d32; - uint32_t buf; - CopyBytes<4>(p, &buf); - return BitCast(d, LoadU(d32, &buf)); -} - -// ------------------------------ Load 16 - -// Actual 16-bit broadcast load - used to implement the other lane types -// because reinterpret_cast of the pointer leads to incorrect codegen on GCC. -template -HWY_API VFromD LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) { - return VFromD(vld1_dup_u16(p)); -} -template -HWY_API VFromD LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) { - return VFromD(vld1_dup_s16(p)); -} - -// 8-bit x2 -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - const Repartition d16; - uint16_t buf; - CopyBytes<2>(p, &buf); - return BitCast(d, LoadU(d16, &buf)); -} - -// ------------------------------ Load 8 -template -HWY_API VFromD LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) { - return VFromD(vld1_dup_u8(p)); -} -template -HWY_API VFromD LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) { - return VFromD(vld1_dup_s8(p)); -} - -// ------------------------------ Load misc - -// [b]float16_t may use the same Raw as uint16_t, so forward to that. -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du16; - const auto pu16 = reinterpret_cast(p); - return BitCast(d, LoadU(du16, pu16)); -} - -// On Arm, Load is the same as LoadU. -template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { - return LoadU(d, p); -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT aligned) { - return IfThenElseZero(m, Load(d, aligned)); -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const TFromD* HWY_RESTRICT aligned) { - return IfThenElse(m, Load(d, aligned), v); -} - -// 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
-template -HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { - return LoadU(d, p); -} - -// ------------------------------ Store 128 - -template -HWY_API void StoreU(Vec128 v, D /* tag */, - uint8_t* HWY_RESTRICT unaligned) { - vst1q_u8(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - uint16_t* HWY_RESTRICT unaligned) { - vst1q_u16(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - uint32_t* HWY_RESTRICT unaligned) { - vst1q_u32(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - uint64_t* HWY_RESTRICT unaligned) { - vst1q_u64(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - int8_t* HWY_RESTRICT unaligned) { - vst1q_s8(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - int16_t* HWY_RESTRICT unaligned) { - vst1q_s16(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - int32_t* HWY_RESTRICT unaligned) { - vst1q_s32(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - int64_t* HWY_RESTRICT unaligned) { - vst1q_s64(unaligned, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, - float* HWY_RESTRICT unaligned) { - vst1q_f32(unaligned, v.raw); -} -#if HWY_HAVE_FLOAT64 -template -HWY_API void StoreU(Vec128 v, D /* tag */, - double* HWY_RESTRICT unaligned) { - vst1q_f64(unaligned, v.raw); -} -#endif // HWY_HAVE_FLOAT64 - -// ------------------------------ Store 64 - -template -HWY_API void StoreU(Vec64 v, D /* tag */, uint8_t* HWY_RESTRICT p) { - vst1_u8(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, uint16_t* HWY_RESTRICT p) { - vst1_u16(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, uint32_t* HWY_RESTRICT p) { - vst1_u32(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, uint64_t* HWY_RESTRICT p) { - vst1_u64(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, int8_t* HWY_RESTRICT p) { - vst1_s8(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, int16_t* HWY_RESTRICT p) { - vst1_s16(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, int32_t* HWY_RESTRICT p) { - vst1_s32(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, int64_t* HWY_RESTRICT p) { - vst1_s64(p, v.raw); -} -template -HWY_API void StoreU(Vec64 v, D /* tag */, float* HWY_RESTRICT p) { - vst1_f32(p, v.raw); -} -#if HWY_HAVE_FLOAT64 -template -HWY_API void StoreU(Vec64 v, D /* tag */, double* HWY_RESTRICT p) { - vst1_f64(p, v.raw); -} -#endif // HWY_HAVE_FLOAT64 - -// ------------------------------ Store 32 - -template -HWY_API void StoreU(Vec32 v, D, uint32_t* HWY_RESTRICT p) { - vst1_lane_u32(p, v.raw, 0); -} -template -HWY_API void StoreU(Vec32 v, D, int32_t* HWY_RESTRICT p) { - vst1_lane_s32(p, v.raw, 0); -} -template -HWY_API void StoreU(Vec32 v, D, float* HWY_RESTRICT p) { - vst1_lane_f32(p, v.raw, 0); -} - -// Overload 16-bit types directly to avoid ambiguity with [b]float16_t. 
-template , - HWY_IF_T_SIZE(T, 1)> -HWY_API void StoreU(Vec32 v, D d, T* HWY_RESTRICT p) { - Repartition d32; - uint32_t buf = GetLane(BitCast(d32, v)); - CopyBytes<4>(&buf, p); -} - -template -HWY_API void StoreU(Vec32 v, D d, uint16_t* HWY_RESTRICT p) { - Repartition d32; - uint32_t buf = GetLane(BitCast(d32, v)); - CopyBytes<4>(&buf, p); -} - -template -HWY_API void StoreU(Vec32 v, D d, int16_t* HWY_RESTRICT p) { - Repartition d32; - uint32_t buf = GetLane(BitCast(d32, v)); - CopyBytes<4>(&buf, p); -} - -// ------------------------------ Store 16 - -template -HWY_API void StoreU(Vec16 v, D, uint16_t* HWY_RESTRICT p) { - vst1_lane_u16(p, v.raw, 0); -} -template -HWY_API void StoreU(Vec16 v, D, int16_t* HWY_RESTRICT p) { - vst1_lane_s16(p, v.raw, 0); -} - -template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { - const Repartition d16; - const uint16_t buf = GetLane(BitCast(d16, v)); - CopyBytes<2>(&buf, p); -} - -// ------------------------------ Store 8 - -template -HWY_API void StoreU(Vec128 v, D, uint8_t* HWY_RESTRICT p) { - vst1_lane_u8(p, v.raw, 0); -} -template -HWY_API void StoreU(Vec128 v, D, int8_t* HWY_RESTRICT p) { - vst1_lane_s8(p, v.raw, 0); -} - -// [b]float16_t may use the same Raw as uint16_t, so forward to that. -template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du16; - const auto pu16 = reinterpret_cast(p); - return StoreU(BitCast(du16, v), du16, pu16); -} - -HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_GCC_ACTUAL -HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") -#endif - -// On Arm, Store is the same as StoreU. -template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - StoreU(v, d, aligned); -} - -HWY_DIAGNOSTICS(pop) - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - // Treat as unsigned so that we correctly support float16. - const RebindToUnsigned du; - const auto blended = - IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p))); - StoreU(BitCast(d, blended), d, p); -} - -// ------------------------------ Non-temporal stores - -// Same as aligned stores on non-x86. 
- -template -HWY_API void Stream(const VFromD v, D d, TFromD* HWY_RESTRICT aligned) { -#if HWY_ARCH_ARM_A64 -#if HWY_COMPILER_GCC - __builtin_prefetch(aligned, 1, 0); -#elif HWY_COMPILER_MSVC - __prefetch2(aligned, 0x11); -#endif -#endif - Store(v, d, aligned); -} - -// ================================================== CONVERT - -// ------------------------------ ConvertTo - -#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 - -// TODO(janwas): use macro generator instead of handwritten -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_f16_s16(v.raw)); -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_f16_s16(v.raw)); -} - -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_f16_u16(v.raw)); -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_f16_u16(v.raw)); -} - -#endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 - -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_f32_s32(v.raw)); -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_f32_s32(v.raw)); -} - -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_f32_u32(v.raw)); -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_f32_u32(v.raw)); -} - -// Truncates (rounds toward zero). -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_s32_f32(v.raw)); -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_s32_f32(v.raw)); -} - -#if HWY_HAVE_FLOAT64 - -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_f64_s64(v.raw)); -} -template -HWY_API Vec64 ConvertTo(D /* tag */, Vec64 v) { -// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 - return Set(Full64(), static_cast(GetLane(v))); -#else - return Vec64(vcvt_f64_s64(v.raw)); -#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 -} - -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_f64_u64(v.raw)); -} -template -HWY_API Vec64 ConvertTo(D /* tag */, Vec64 v) { - return Vec64(vcvt_f64_u64(v.raw)); -} - -// Truncates (rounds toward zero). -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_s64_f64(v.raw)); -} -template -HWY_API Vec64 ConvertTo(D di, Vec64 v) { - // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. Use the - // 128-bit version to avoid UB from casting double -> int64_t. -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 - const Full128 ddt; - const Twice dit; - return LowerHalf(di, ConvertTo(dit, Combine(ddt, v, v))); -#else - (void)di; - return Vec64(vcvt_s64_f64(v.raw)); -#endif -} - -#endif // HWY_HAVE_FLOAT64 - -#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 - -// Truncates (rounds toward zero). -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_s16_f16(v.raw)); -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_s16_f16(v.raw)); -} - -template -HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { - return Vec128(vcvtq_u16_f16(v.raw)); -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_u16_f16(v.raw)); -} - -#endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 - -// ------------------------------ PromoteTo (ConvertTo) - -// Unsigned: zero-extend to full vector. 
-template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vmovl_u8(v.raw)); -} -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec32 v) { - uint16x8_t a = vmovl_u8(v.raw); - return Vec128(vmovl_u16(vget_low_u16(a))); -} -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vmovl_u16(v.raw)); -} -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vmovl_u32(v.raw)); -} -template -HWY_API Vec128 PromoteTo(D d, Vec64 v) { - return BitCast(d, Vec128(vmovl_u8(v.raw))); -} -template -HWY_API Vec128 PromoteTo(D d, Vec32 v) { - uint16x8_t a = vmovl_u8(v.raw); - return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); -} -template -HWY_API Vec128 PromoteTo(D d, Vec64 v) { - return BitCast(d, Vec128(vmovl_u16(v.raw))); -} -template -HWY_API Vec128 PromoteTo(D d, Vec64 v) { - return BitCast(d, Vec128(vmovl_u32(v.raw))); -} - -// Unsigned: zero-extend to half vector. -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_u16(vmovl_u8(v.raw))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw))))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_u32(vmovl_u16(v.raw))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_u64(vmovl_u32(v.raw))); -} -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { - using VU16 = VFromD>; - return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw)))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw))); - return VFromD(vget_low_s32(vreinterpretq_s32_u32(u32))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw)))); -} -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { - using DU = RebindToUnsigned; - return BitCast(d, VFromD(vget_low_u64(vmovl_u32(v.raw)))); -} - -// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to -// TFromD -template -HWY_API VFromD PromoteTo(D d, V v) { - const Rebind du32; - return PromoteTo(d, PromoteTo(du32, v)); -} - -// Signed: replicate sign bit to full vector. -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vmovl_s8(v.raw)); -} -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec32 v) { - int16x8_t a = vmovl_s8(v.raw); - return Vec128(vmovl_s16(vget_low_s16(a))); -} -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vmovl_s16(v.raw)); -} -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vmovl_s32(v.raw)); -} - -// Signed: replicate sign bit to half vector. -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_s16(vmovl_s8(v.raw))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw))))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_s32(vmovl_s16(v.raw))); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_s64(vmovl_s32(v.raw))); -} - -// I8/I16 to I64: First, promote to I32, and then promote to I64 -template -HWY_API VFromD PromoteTo(D d, V v) { - const Rebind di32; - return PromoteTo(d, PromoteTo(di32, v)); -} - -#if HWY_NEON_HAVE_FLOAT16C - -// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. 
-#ifdef HWY_NATIVE_F16C -#undef HWY_NATIVE_F16C -#else -#define HWY_NATIVE_F16C -#endif - -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vcvt_f32_f16(v.raw)); -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD(vget_low_f32(vcvt_f32_f16(v.raw))); -} - -#endif // HWY_NEON_HAVE_FLOAT16C - -#if HWY_HAVE_FLOAT64 - -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - return Vec128(vcvt_f64_f32(v.raw)); -} - -template -HWY_API Vec64 PromoteTo(D /* tag */, Vec32 v) { - return Vec64(vget_low_f64(vcvt_f64_f32(v.raw))); -} - -template -HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { - const int64x2_t i64 = vmovl_s32(v.raw); - return Vec128(vcvtq_f64_s64(i64)); -} - -template -HWY_API Vec64 PromoteTo(D d, Vec32 v) { - return ConvertTo(d, Vec64(vget_low_s64(vmovl_s32(v.raw)))); -} - -#endif // HWY_HAVE_FLOAT64 - -// ------------------------------ PromoteUpperTo - -#if HWY_ARCH_ARM_A64 - -// Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo. -#ifdef HWY_NATIVE_PROMOTE_UPPER_TO -#undef HWY_NATIVE_PROMOTE_UPPER_TO -#else -#define HWY_NATIVE_PROMOTE_UPPER_TO -#endif - -// Unsigned: zero-extend to full vector. -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vmovl_high_u8(v.raw)); -} -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vmovl_high_u16(v.raw)); -} -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vmovl_high_u32(v.raw)); -} -template -HWY_API Vec128 PromoteUpperTo(D d, Vec128 v) { - return BitCast(d, Vec128(vmovl_high_u8(v.raw))); -} -template -HWY_API Vec128 PromoteUpperTo(D d, Vec128 v) { - return BitCast(d, Vec128(vmovl_high_u16(v.raw))); -} -template -HWY_API Vec128 PromoteUpperTo(D d, Vec128 v) { - return BitCast(d, Vec128(vmovl_high_u32(v.raw))); -} - -// Signed: replicate sign bit to full vector. -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vmovl_high_s8(v.raw)); -} -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vmovl_high_s16(v.raw)); -} -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vmovl_high_s32(v.raw)); -} - -#if HWY_NEON_HAVE_FLOAT16C - -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vcvt_high_f32_f16(v.raw)); -} - -#endif // HWY_NEON_HAVE_FLOAT16C - -template -HWY_API VFromD PromoteUpperTo(D df32, VFromD> v) { - const Repartition du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); -} - -#if HWY_HAVE_FLOAT64 - -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - return Vec128(vcvt_high_f64_f32(v.raw)); -} - -template -HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { - const int64x2_t i64 = vmovl_high_s32(v.raw); - return Vec128(vcvtq_f64_s64(i64)); -} - -#endif // HWY_HAVE_FLOAT64 - -// Generic version for <=64 bit input/output (_high is only for full vectors). 
-template -HWY_API VFromD PromoteUpperTo(D d, V v) { - const Rebind, decltype(d)> dh; - return PromoteTo(d, UpperHalf(dh, v)); -} - -#endif // HWY_ARCH_ARM_A64 - -// ------------------------------ DemoteTo (ConvertTo) - -// From full vector to half or quarter -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovun_s32(v.raw)); -} -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovn_s32(v.raw)); -} -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec128 v) { - const uint16x4_t a = vqmovun_s32(v.raw); - return Vec32(vqmovn_u16(vcombine_u16(a, a))); -} -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovun_s16(v.raw)); -} -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec128 v) { - const int16x4_t a = vqmovn_s32(v.raw); - return Vec32(vqmovn_s16(vcombine_s16(a, a))); -} -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovn_s16(v.raw)); -} -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovn_u32(v.raw)); -} -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec128 v) { - const uint16x4_t a = vqmovn_u32(v.raw); - return Vec32(vqmovn_u16(vcombine_u16(a, a))); -} -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovn_u16(v.raw)); -} - -// From half vector to partial half -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD(vqmovun_s32(vcombine_s32(v.raw, v.raw))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD(vqmovn_s32(vcombine_s32(v.raw, v.raw))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw)); - return VFromD(vqmovn_u16(vcombine_u16(a, a))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD(vqmovun_s16(vcombine_s16(v.raw, v.raw))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw)); - return VFromD(vqmovn_s16(vcombine_s16(a, a))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD(vqmovn_s16(vcombine_s16(v.raw, v.raw))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD(vqmovn_u32(vcombine_u32(v.raw, v.raw))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw)); - return VFromD(vqmovn_u16(vcombine_u16(a, a))); -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD(vqmovn_u16(vcombine_u16(v.raw, v.raw))); -} - -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovn_s64(v.raw)); -} -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovun_s64(v.raw)); -} -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vqmovn_u64(v.raw)); -} -template -HWY_API VFromD DemoteTo(D d, Vec128 v) { - const Rebind di32; - return DemoteTo(d, DemoteTo(di32, v)); -} -template -HWY_API VFromD DemoteTo(D d, Vec128 v) { - const Rebind du32; - return DemoteTo(d, DemoteTo(du32, v)); -} -template -HWY_API VFromD DemoteTo(D d, Vec128 v) { - const Rebind du32; - return DemoteTo(d, DemoteTo(du32, v)); -} - -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { - return Vec32(vqmovn_s64(vcombine_s64(v.raw, v.raw))); -} -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { - return Vec32(vqmovun_s64(vcombine_s64(v.raw, v.raw))); -} -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { - return 
Vec32(vqmovn_u64(vcombine_u64(v.raw, v.raw))); -} -template -HWY_API VFromD DemoteTo(D d, Vec64 v) { - const Rebind di32; - return DemoteTo(d, DemoteTo(di32, v)); -} -template -HWY_API VFromD DemoteTo(D d, Vec64 v) { - const Rebind du32; - return DemoteTo(d, DemoteTo(du32, v)); -} -template -HWY_API VFromD DemoteTo(D d, Vec64 v) { - const Rebind du32; - return DemoteTo(d, DemoteTo(du32, v)); -} - -#if HWY_NEON_HAVE_FLOAT16C - -// We already toggled HWY_NATIVE_F16C above. - -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64{vcvt_f16_f32(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD(vcvt_f16_f32(vcombine_f32(v.raw, v.raw))); -} - -#endif // HWY_NEON_HAVE_FLOAT16C - -template -HWY_API VFromD DemoteTo(D dbf16, VFromD> v) { - const Rebind di32; - const Rebind du32; // for logical shift right - const Rebind du16; - const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); - return BitCast(dbf16, DemoteTo(du16, bits_in_32)); -} - -#if HWY_HAVE_FLOAT64 - -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - return Vec64(vcvt_f32_f64(v.raw)); -} -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { - return Vec32(vcvt_f32_f64(vcombine_f64(v.raw, v.raw))); -} - -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { - const int64x2_t i64 = vcvtq_s64_f64(v.raw); - return Vec64(vqmovn_s64(i64)); -} -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { - // There is no i64x1 -> i32x1 narrow, so Combine to 128-bit. Do so with the - // f64 input already to also avoid the missing vcvt_s64_f64 in GCC 6.4. - const Full128 ddt; - const Full128 dit; - return Vec32(vqmovn_s64(ConvertTo(dit, Combine(ddt, v, v)).raw)); -} - -#endif // HWY_HAVE_FLOAT64 - -HWY_API Vec32 U8FromU32(Vec128 v) { - const uint8x16_t org_v = detail::BitCastToByte(v).raw; - const uint8x16_t w = vuzp1q_u8(org_v, org_v); - return Vec32(vget_low_u8(vuzp1q_u8(w, w))); -} -template -HWY_API Vec128 U8FromU32(Vec128 v) { - const uint8x8_t org_v = detail::BitCastToByte(v).raw; - const uint8x8_t w = vuzp1_u8(org_v, org_v); - return Vec128(vuzp1_u8(w, w)); -} - -// ------------------------------ Round (IfThenElse, mask, logical) - -#if HWY_ARCH_ARM_A64 -// Toward nearest integer -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1) - -// Toward zero, aka truncate -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1) - -// Toward +infinity, aka ceiling -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1) - -// Toward -infinity, aka floor -HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1) -#else - -// ------------------------------ Trunc - -// Armv7 only supports truncation to integer. We can either convert back to -// float (3 floating-point and 2 logic operations) or manipulate the binary32 -// representation, clearing the lowest 23-exp mantissa bits. This requires 9 -// integer operations and 3 constants, which is likely more expensive. - -namespace detail { - -// The original value is already the desired result if NaN or the magnitude is -// large (i.e. the value is already an integer). 
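A scalar model of the Armv7 Trunc fallback defined below may help: only values whose magnitude is below MantissaEnd (2^23 for float) are round-tripped through an integer, which is exactly what the UseInt helper that follows tests. TruncF32 and the 8388608.0f literal are illustrative, not part of this header:

#include <cmath>
#include <cstdint>

float TruncF32(float v) {
  // |v| >= 2^23 means v is already an integer; NaN also falls through here.
  if (!(std::fabs(v) < 8388608.0f)) return v;
  return static_cast<float>(static_cast<int32_t>(v));  // truncates toward zero
}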
-template -HWY_INLINE Mask128 UseInt(const Vec128 v) { - return Abs(v) < Set(Simd(), MantissaEnd()); -} - -} // namespace detail - -template -HWY_API Vec128 Trunc(const Vec128 v) { - const DFromV df; - const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - - return IfThenElse(detail::UseInt(v), int_f, v); -} - -template -HWY_API Vec128 Round(const Vec128 v) { - const DFromV df; - - // Armv7 also lacks a native NearestInt, but we can instead rely on rounding - // (we assume the current mode is nearest-even) after addition with a large - // value such that no mantissa bits remain. We may need a compiler flag for - // precise floating-point to prevent this from being "optimized" out. - const auto max = Set(df, MantissaEnd()); - const auto large = CopySignToAbs(max, v); - const auto added = large + v; - const auto rounded = added - large; - - // Keep original if NaN or the magnitude is large (already an int). - return IfThenElse(Abs(v) < max, rounded, v); -} - -template -HWY_API Vec128 Ceil(const Vec128 v) { - const DFromV df; - const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - - // Truncating a positive non-integer ends up smaller; if so, add 1. - const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); - - return IfThenElse(detail::UseInt(v), int_f - neg1, v); -} - -template -HWY_API Vec128 Floor(const Vec128 v) { - const DFromV df; - const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - - // Truncating a negative non-integer ends up larger; if so, subtract 1. - const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); - - return IfThenElse(detail::UseInt(v), int_f + neg1, v); -} - -#endif - -// ------------------------------ NearestInt (Round) - -#if HWY_ARCH_ARM_A64 - -HWY_API Vec128 NearestInt(const Vec128 v) { - return Vec128(vcvtnq_s32_f32(v.raw)); -} -template -HWY_API Vec128 NearestInt(const Vec128 v) { - return Vec128(vcvtn_s32_f32(v.raw)); -} - -#else - -template -HWY_API Vec128 NearestInt(const Vec128 v) { - const RebindToSigned> di; - return ConvertTo(di, Round(v)); -} - -#endif - -// ------------------------------ Floating-point classification -template -HWY_API Mask128 IsNaN(const Vec128 v) { - return v != v; -} - -template -HWY_API Mask128 IsInf(const Vec128 v) { - const DFromV d; - const RebindToSigned di; - const VFromD vi = BitCast(di, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); -} - -// Returns whether normal/subnormal/zero. -template -HWY_API Mask128 IsFinite(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; // cheaper than unsigned comparison - const VFromD vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, then right so we can compare with the - // max exponent (cannot compare with MaxExponentTimes2 directly because it is - // negative and non-negative floats would be greater). 
- const VFromD exp = - BitCast(di, ShiftRight() + 1>(Add(vu, vu))); - return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); -} - -// ================================================== SWIZZLE - -// ------------------------------ LowerHalf - -// <= 64 bit: just return different type -template -HWY_API Vec128 LowerHalf(Vec128 v) { - return Vec128(v.raw); -} - -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_u8(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_u16(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_u32(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_u64(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_s8(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_s16(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_s32(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_s64(v.raw)); -} -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_f32(v.raw)); -} -#if HWY_HAVE_FLOAT16 -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_f16(v.raw)); -} -#endif // HWY_HAVE_FLOAT16 -#if HWY_HAVE_FLOAT64 -HWY_API Vec64 LowerHalf(Vec128 v) { - return Vec64(vget_low_f64(v.raw)); -} -#endif // HWY_HAVE_FLOAT64 - -template -HWY_API VFromD>> LowerHalf(V v) { - const Full128 du; - const Half> dh; - return BitCast(dh, LowerHalf(BitCast(du, v))); -} - -template -HWY_API VFromD LowerHalf(DH /* tag */, VFromD> v) { - return LowerHalf(v); -} - -// ------------------------------ CombineShiftRightBytes - -// 128-bit -template > -HWY_API Vec128 CombineShiftRightBytes(D d, Vec128 hi, Vec128 lo) { - static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]"); - const Repartition d8; - uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes); - return BitCast(d, Vec128(v8)); -} - -// 64-bit -template > -HWY_API Vec64 CombineShiftRightBytes(D d, Vec64 hi, Vec64 lo) { - static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]"); - const Repartition d8; - uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes); - return BitCast(d, VFromD(v8)); -} - -// <= 32-bit defined after ShiftLeftBytes. - -// ------------------------------ Shift vector by constant #bytes - -namespace detail { - -// Partially specialize because kBytes = 0 and >= size are compile errors; -// callers replace the latter with 0xFF for easier specialization. -template -struct ShiftLeftBytesT { - // Full - template - HWY_INLINE Vec128 operator()(const Vec128 v) { - const Full128 d; - return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d)); - } - - // Partial - template - HWY_INLINE Vec128 operator()(const Vec128 v) { - // Expand to 64-bit so we only use the native EXT instruction. - const Full64 d64; - const auto zero64 = Zero(d64); - const decltype(zero64) v64(v.raw); - return Vec128( - CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw); - } -}; -template <> -struct ShiftLeftBytesT<0> { - template - HWY_INLINE Vec128 operator()(const Vec128 v) { - return v; - } -}; -template <> -struct ShiftLeftBytesT<0xFF> { - template - HWY_INLINE Vec128 operator()(const Vec128 v) { - return Xor(v, v); - } -}; - -template -struct ShiftRightBytesT { - template - HWY_INLINE Vec128 operator()(Vec128 v) { - const DFromV d; - // For < 64-bit vectors, zero undefined lanes so we shift in zeros. - if (d.MaxBytes() < 8) { - constexpr size_t kReg = d.MaxBytes() == 16 ? 
16 : 8; - const Simd dreg; - v = Vec128( - IfThenElseZero(FirstN(dreg, N), VFromD(v.raw)).raw); - } - return CombineShiftRightBytes(d, Zero(d), v); - } -}; -template <> -struct ShiftRightBytesT<0> { - template - HWY_INLINE Vec128 operator()(const Vec128 v) { - return v; - } -}; -template <> -struct ShiftRightBytesT<0xFF> { - template - HWY_INLINE Vec128 operator()(const Vec128 v) { - return Xor(v, v); - } -}; - -} // namespace detail - -template -HWY_API VFromD ShiftLeftBytes(D d, VFromD v) { - return detail::ShiftLeftBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(v); -} - -template -HWY_API Vec128 ShiftLeftBytes(Vec128 v) { - return ShiftLeftBytes(DFromV(), v); -} - -template -HWY_API VFromD ShiftLeftLanes(D d, VFromD v) { - const Repartition d8; - return BitCast(d, ShiftLeftBytes)>(BitCast(d8, v))); -} - -template -HWY_API Vec128 ShiftLeftLanes(Vec128 v) { - return ShiftLeftLanes(DFromV(), v); -} - -// 0x01..0F, kBytes = 1 => 0x0001..0E -template -HWY_API VFromD ShiftRightBytes(D d, VFromD v) { - return detail::ShiftRightBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()( - v); -} - -template -HWY_API VFromD ShiftRightLanes(D d, VFromD v) { - const Repartition d8; - return BitCast( - d, ShiftRightBytes)>(d8, BitCast(d8, v))); -} - -// Calls ShiftLeftBytes -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - constexpr size_t kSize = d.MaxBytes(); - static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); - const Repartition d8; - const Full64 d_full8; - const Repartition, decltype(d_full8)> d_full; - using V64 = VFromD; - const V64 hi64(BitCast(d8, hi).raw); - // Move into most-significant bytes - const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw)); - const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64); - // After casting to full 64-bit vector of correct type, shrink to 32-bit - return VFromD(BitCast(d_full, r).raw); -} - -// ------------------------------ UpperHalf (ShiftRightBytes) - -// Full input -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_u8(v.raw)); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_u16(v.raw)); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_u32(v.raw)); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_u64(v.raw)); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_s8(v.raw)); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_s16(v.raw)); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_s32(v.raw)); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_s64(v.raw)); -} -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_f16(v.raw)); -} -#endif -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_f32(v.raw)); -} -#if HWY_HAVE_FLOAT64 -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64(vget_high_f64(v.raw)); -} -#endif // HWY_HAVE_FLOAT64 - -template -HWY_API VFromD UpperHalf(D dh, VFromD> v) { - const RebindToUnsigned> du; - const Half duh; - return BitCast(dh, UpperHalf(duh, BitCast(du, v))); -} - -// Partial -template -HWY_API VFromD UpperHalf(DH dh, VFromD> v) { - const Twice d; - const RebindToUnsigned du; - const VFromD upper = - ShiftRightBytes(du, BitCast(du, v)); - return 
VFromD(BitCast(d, upper).raw); -} - -// ------------------------------ Broadcast/splat any lane - -#if HWY_ARCH_ARM_A64 -// Unsigned -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 16, "Invalid lane"); - return Vec128(vdupq_laneq_u8(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_u8(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - return Vec128(vdupq_laneq_u16(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_u16(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec128(vdupq_laneq_u32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_u32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - return Vec128(vdupq_laneq_u64(v.raw, kLane)); -} -// Vec64 is defined below. - -// Signed -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 16, "Invalid lane"); - return Vec128(vdupq_laneq_s8(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_s8(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - return Vec128(vdupq_laneq_s16(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_s16(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec128(vdupq_laneq_s32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_s32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - return Vec128(vdupq_laneq_s64(v.raw, kLane)); -} -// Vec64 is defined below. - -// Float -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - return Vec128(vdupq_laneq_f16(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_f16(v.raw, kLane)); -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec128(vdupq_laneq_f32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_f32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - return Vec128(vdupq_laneq_f64(v.raw, kLane)); -} -template -HWY_API Vec64 Broadcast(Vec64 v) { - static_assert(0 <= kLane && kLane < 1, "Invalid lane"); - return v; -} - -#else // !HWY_ARCH_ARM_A64 -// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*. 
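// Standalone sketch of the emulation described above, using raw NEON
// intrinsics (BroadcastLane2 is a hypothetical name chosen for illustration;
// the lane index must be a compile-time constant; arm_neon.h is already
// included by this header): extract one lane to a scalar, then splat it.
static inline uint32x4_t BroadcastLane2(uint32x4_t v) {
  // vgetq_lane_u32 reads lane 2; vdupq_n_u32 broadcasts it to all lanes.
  return vdupq_n_u32(vgetq_lane_u32(v, 2));
}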
- -// Unsigned -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 16, "Invalid lane"); - return Vec128(vdupq_n_u8(vgetq_lane_u8(v.raw, kLane))); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_u8(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - return Vec128(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane))); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_u16(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec128(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane))); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_u32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - return Vec128(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane))); -} -// Vec64 is defined below. - -// Signed -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 16, "Invalid lane"); - return Vec128(vdupq_n_s8(vgetq_lane_s8(v.raw, kLane))); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_s8(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - return Vec128(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane))); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_s16(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec128(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane))); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_s32(v.raw, kLane)); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - return Vec128(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane))); -} -// Vec64 is defined below. - -// Float -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - return Vec128(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane))); -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec128(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane))); -} -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128(vdup_lane_f32(v.raw, kLane)); -} - -#endif // HWY_ARCH_ARM_A64 - -template -HWY_API Vec64 Broadcast(Vec64 v) { - static_assert(0 <= kLane && kLane < 1, "Invalid lane"); - return v; -} -template -HWY_API Vec64 Broadcast(Vec64 v) { - static_assert(0 <= kLane && kLane < 1, "Invalid lane"); - return v; -} - -// ------------------------------ TableLookupLanes - -// Returned by SetTableIndices for use by TableLookupLanes. 
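// Worked example of the byte-index construction used by IndicesFromVec below
// for multi-byte lanes: with T = uint16_t and lane index 3, the two bytes of
// that output lane must come from table bytes 6 and 7, i.e. (3 broadcast to
// both bytes of the lane) << 1 plus the per-byte offsets {0, 1}. Scalar model
// of the overall contract (TableLookupLanesScalar is a hypothetical name,
// shown only for exposition):
template <typename T, size_t N>
void TableLookupLanesScalar(const T (&table)[N], const size_t (&idx)[N],
                            T (&out)[N]) {
  for (size_t i = 0; i < N; ++i) out[i] = table[idx[i]];  // idx selects a lane
}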
-template -struct Indices128 { - typename detail::Raw128::type raw; -}; - -namespace detail { - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - return Iota(d8, 0); -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - return Load(d8, kBroadcastLaneBytes); -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - return Load(d8, kBroadcastLaneBytes); -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; - return Load(d8, kBroadcastLaneBytes); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - return Zero(d8); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - return Load(d8, kByteOffsets); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; - return Load(d8, kByteOffsets); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; - return Load(d8, kByteOffsets); -} - -} // namespace detail - -template -HWY_API Indices128, MaxLanes(D())> IndicesFromVec( - D d, Vec128 vec) { - using T = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const RebindToUnsigned du; - using TU = TFromD; - HWY_DASSERT(AllTrue( - du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); -#endif - - (void)d; - return Indices128, MaxLanes(D())>{BitCast(d, vec).raw}; -} - -template -HWY_API Indices128, MaxLanes(D())> IndicesFromVec( - D d, Vec128 vec) { - using T = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const RebindToUnsigned du; - using TU = TFromD; - HWY_DASSERT(AllTrue( - du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); -#endif - - const Repartition d8; - using V8 = VFromD; - - // Broadcast each lane index to all bytes of T and shift to bytes - const V8 lane_indices = TableLookupBytes( - BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); - constexpr int kIndexShiftAmt = static_cast(FloorLog2(sizeof(T))); - const V8 byte_indices = ShiftLeft(lane_indices); - const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); - return Indices128, MaxLanes(D())>{BitCast(d, sum).raw}; -} - -template -HWY_API Indices128, MaxLanes(D())> SetTableIndices(D d, - const TI* idx) { - const Rebind di; - return IndicesFromVec(d, LoadU(di, idx)); -} - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { - const DFromV d; - const RebindToSigned di; - return BitCast( - d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128{idx.raw}))); -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 
idx) { - const DFromV d; - const Twice dt; -// TableLookupLanes currently requires table and index vectors to be the same -// size, though a half-length index vector would be sufficient here. -#if HWY_IS_MSAN - const Vec128 idx_vec{idx.raw}; - const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; -#else - // We only keep LowerHalf of the result, which is valid in idx. - const Indices128 idx2{idx.raw}; -#endif - return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); -} - -template -HWY_API Vec64 TwoTablesLookupLanes(Vec64 a, Vec64 b, - Indices128 idx) { - const DFromV d; - const Repartition du8; - const auto a_u8 = BitCast(du8, a); - const auto b_u8 = BitCast(du8, b); - const auto idx_u8 = BitCast(du8, Vec64{idx.raw}); - -#if HWY_ARCH_ARM_A64 - const Twice dt_u8; - return BitCast( - d, Vec64{vqtbl1_u8(Combine(dt_u8, b_u8, a_u8).raw, idx_u8.raw)}); -#else - detail::Tuple2 tup = {{{a_u8.raw, b_u8.raw}}}; - return BitCast(d, Vec64{vtbl2_u8(tup.raw, idx_u8.raw)}); -#endif -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { - const DFromV d; - const Repartition du8; - const auto a_u8 = BitCast(du8, a); - const auto b_u8 = BitCast(du8, b); - const auto idx_u8 = BitCast(du8, Vec128{idx.raw}); - -#if HWY_ARCH_ARM_A64 - detail::Tuple2 tup = {{{a_u8.raw, b_u8.raw}}}; - return BitCast(d, Vec128{vqtbl2q_u8(tup.raw, idx_u8.raw)}); -#else - const Half dh; - const Repartition dh_u8; - const auto a_lo_u8 = LowerHalf(dh_u8, a_u8); - const auto a_hi_u8 = UpperHalf(dh_u8, a_u8); - const auto b_lo_u8 = LowerHalf(dh_u8, b_u8); - const auto b_hi_u8 = UpperHalf(dh_u8, b_u8); - const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8); - const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8); - - detail::Tuple4 tup = { - {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}}; - const auto lo_result = - BitCast(dh, Vec64{vtbl4_u8(tup.raw, idx_lo_u8.raw)}); - const auto hi_result = - BitCast(dh, Vec64{vtbl4_u8(tup.raw, idx_hi_u8.raw)}); - return Combine(d, hi_result, lo_result); -#endif -} - -// ------------------------------ Reverse2 (CombineShiftRightBytes) - -// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
-#ifdef HWY_NATIVE_REVERSE2_8 -#undef HWY_NATIVE_REVERSE2_8 -#else -#define HWY_NATIVE_REVERSE2_8 -#endif - -template -HWY_API VFromD Reverse2(D d, VFromD v) { - const RebindToUnsigned du; - return BitCast(d, VFromD(vrev16_u8(BitCast(du, v).raw))); -} -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec128 Reverse2(D d, Vec128 v) { - const RebindToUnsigned du; - return BitCast(d, Vec128(vrev16q_u8(BitCast(du, v).raw))); -} - -template -HWY_API VFromD Reverse2(D d, VFromD v) { - const RebindToUnsigned du; - return BitCast(d, VFromD(vrev32_u16(BitCast(du, v).raw))); -} -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec128 Reverse2(D d, Vec128 v) { - const RebindToUnsigned du; - return BitCast(d, Vec128(vrev32q_u16(BitCast(du, v).raw))); -} - -template -HWY_API VFromD Reverse2(D d, VFromD v) { - const RebindToUnsigned du; - return BitCast(d, VFromD(vrev64_u32(BitCast(du, v).raw))); -} -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec128 Reverse2(D d, Vec128 v) { - const RebindToUnsigned du; - return BitCast(d, Vec128(vrev64q_u32(BitCast(du, v).raw))); -} - -template -HWY_API VFromD Reverse2(D d, VFromD v) { - return CombineShiftRightBytes<8>(d, v, v); -} - -// ------------------------------ Reverse4 (Reverse2) - -template -HWY_API VFromD Reverse4(D d, VFromD v) { - const RebindToUnsigned du; - return BitCast(d, VFromD(vrev32_u8(BitCast(du, v).raw))); -} -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec128 Reverse4(D d, Vec128 v) { - const RebindToUnsigned du; - return BitCast(d, Vec128(vrev32q_u8(BitCast(du, v).raw))); -} - -template -HWY_API VFromD Reverse4(D d, VFromD v) { - const RebindToUnsigned du; - return BitCast(d, VFromD(vrev64_u16(BitCast(du, v).raw))); -} -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec128 Reverse4(D d, Vec128 v) { - const RebindToUnsigned du; - return BitCast(d, Vec128(vrev64q_u16(BitCast(du, v).raw))); -} - -template -HWY_API VFromD Reverse4(D d, VFromD v) { - const RepartitionToWide> duw; - return BitCast(d, Reverse2(duw, BitCast(duw, Reverse2(d, v)))); -} - -template -HWY_API VFromD Reverse4(D /* tag */, VFromD) { - HWY_ASSERT(0); // don't have 8 u64 lanes -} - -// ------------------------------ Reverse8 (Reverse2, Reverse4) - -template -HWY_API VFromD Reverse8(D d, VFromD v) { - const RebindToUnsigned du; - return BitCast(d, VFromD(vrev64_u8(BitCast(du, v).raw))); -} -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec128 Reverse8(D d, Vec128 v) { - const RebindToUnsigned du; - return BitCast(d, Vec128(vrev64q_u8(BitCast(du, v).raw))); -} - -template -HWY_API VFromD Reverse8(D d, VFromD v) { - const Repartition du64; - return BitCast(d, Reverse2(du64, BitCast(du64, Reverse4(d, v)))); -} - -template -HWY_API VFromD Reverse8(D, VFromD) { - HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit -} - -// ------------------------------ Reverse (Reverse2, Reverse4, Reverse8) - -template , HWY_IF_LANES_D(D, 1)> -HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { - return v; -} - -template , HWY_IF_LANES_D(D, 2)> -HWY_API Vec128 Reverse(D d, Vec128 v) { - return Reverse2(d, v); -} - -template , HWY_IF_LANES_D(D, 4)> -HWY_API Vec128 Reverse(D d, Vec128 v) { - return Reverse4(d, v); -} - -template , HWY_IF_LANES_D(D, 8)> -HWY_API Vec128 Reverse(D d, Vec128 v) { - return Reverse8(d, v); -} - -template , HWY_IF_LANES_D(D, 16)> -HWY_API Vec128 Reverse(D d, Vec128 v) { - const Repartition du64; - return BitCast(d, Reverse2(du64, BitCast(du64, Reverse8(d, v)))); -} - -// ------------------------------ ReverseBits - -#if HWY_ARCH_ARM_A64 - -#ifdef HWY_NATIVE_REVERSE_BITS_UI8 -#undef 
HWY_NATIVE_REVERSE_BITS_UI8 -#else -#define HWY_NATIVE_REVERSE_BITS_UI8 -#endif - -HWY_NEON_DEF_FUNCTION_INT_8(ReverseBits, vrbit, _, 1) -HWY_NEON_DEF_FUNCTION_UINT_8(ReverseBits, vrbit, _, 1) - -#endif // HWY_ARCH_ARM_A64 - -// ------------------------------ Other shuffles (TableLookupBytes) - -// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). -// Shuffle0321 rotates one lane to the right (the previous least-significant -// lane is now most-significant). These could also be implemented via -// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. - -// Swap 64-bit halves -template -HWY_API Vec128 Shuffle1032(Vec128 v) { - return CombineShiftRightBytes<8>(DFromV(), v, v); -} -template -HWY_API Vec128 Shuffle01(Vec128 v) { - return CombineShiftRightBytes<8>(DFromV(), v, v); -} - -// Rotate right 32 bits -template -HWY_API Vec128 Shuffle0321(Vec128 v) { - return CombineShiftRightBytes<4>(DFromV(), v, v); -} - -// Rotate left 32 bits -template -HWY_API Vec128 Shuffle2103(Vec128 v) { - return CombineShiftRightBytes<12>(DFromV(), v, v); -} - -// Reverse -template -HWY_API Vec128 Shuffle0123(Vec128 v) { - return Reverse4(DFromV(), v); -} - -// ------------------------------ InterleaveLower - -// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides -// the least-significant lane) and "b". To concatenate two half-width integers -// into one, use ZipLower/Upper instead (also works with scalar). -HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveLower, vzip1, _, 2) -#if HWY_ARCH_ARM_A64 -// N=1 makes no sense (in that case, there would be no upper/lower). -HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveLower, vzip1, _, 2) -#else -// Emulated version for Armv7. -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - const DFromV d; - return CombineShiftRightBytes<8>(d, b, Shuffle01(a)); -} -#endif - -// < 64 bit parts -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - return Vec128(InterleaveLower(Vec64(a.raw), Vec64(b.raw)).raw); -} - -// Additional overload for the optional Simd<> tag. -template -HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { - return InterleaveLower(a, b); -} - -// ------------------------------ InterleaveUpper (UpperHalf) - -// All functions inside detail lack the required D parameter. -namespace detail { -HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveUpper, vzip2, _, 2) - -#if HWY_ARCH_ARM_A64 -// N=1 makes no sense (in that case, there would be no upper/lower). -HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveUpper, vzip2, _, 2) -#else -// Emulated version for Armv7. -template -HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { - const DFromV d; - return CombineShiftRightBytes<8>(d, Shuffle01(b), a); -} -#endif -} // namespace detail - -// Full register -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return detail::InterleaveUpper(a, b); -} - -// Partial -template -HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { - const Half d2; - const VFromD a2(UpperHalf(d2, a).raw); - const VFromD b2(UpperHalf(d2, b).raw); - return InterleaveLower(d, a2, b2); -} - -// ------------------------------ ZipLower/ZipUpper (InterleaveLower) - -// Same as Interleave*, except that the return lanes are double-width integers; -// this is necessary because the single-lane scalar cannot return two values. 
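// Scalar model of ZipLower for 8-bit lanes (ZipLowerScalar is a hypothetical
// name, shown only to make the widening explicit): the pair (a[i], b[i])
// becomes one 16-bit lane with a[i] in the low byte and b[i] in the high byte.
static inline uint16_t ZipLowerScalar(uint8_t a, uint8_t b) {
  return static_cast<uint16_t>(a | (static_cast<uint16_t>(b) << 8));
}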
-template >> -HWY_API VFromD ZipLower(V a, V b) { - return BitCast(DW(), InterleaveLower(a, b)); -} -template , class DW = RepartitionToWide> -HWY_API VFromD ZipLower(DW dw, V a, V b) { - return BitCast(dw, InterleaveLower(D(), a, b)); -} - -template , class DW = RepartitionToWide> -HWY_API VFromD ZipUpper(DW dw, V a, V b) { - return BitCast(dw, InterleaveUpper(D(), a, b)); -} - -// ------------------------------ Per4LaneBlockShuffle -namespace detail { - -#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG - -#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#else -#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#endif - -template -HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t /*x3*/, - const uint32_t /*x2*/, - const uint32_t x1, - const uint32_t x0) { - typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8))); - const GccU32RawVectType raw = {x0, x1}; - return ResizeBitCast(d, Vec64(reinterpret_cast(raw))); -} - -template -HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, - const uint32_t x2, - const uint32_t x1, - const uint32_t x0) { - typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); - const GccU32RawVectType raw = {x0, x1, x2, x3}; - return ResizeBitCast(d, Vec128(reinterpret_cast(raw))); -} -#endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG - -template , 4)> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, - hwy::SizeTag /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - const DFromV d; - const RebindToUnsigned du; - const RepartitionToWide dw; - - const auto evens = BitCast(dw, ConcatEven(d, v, v)); - return BitCast(d, InterleaveLower(dw, evens, evens)); -} - -template , 4)> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, - hwy::SizeTag /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - const DFromV d; - const RebindToUnsigned du; - const RepartitionToWide dw; - - const auto odds = BitCast(dw, ConcatOdd(d, v, v)); - return BitCast(d, InterleaveLower(dw, odds, odds)); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, - hwy::SizeTag<2> /*lane_size_tag*/, - hwy::SizeTag<8> /*vect_size_tag*/, V v) { - const DFromV d; - return InterleaveUpper(d, v, v); -} - -} // namespace detail - -// ------------------------------ SlideUpLanes - -namespace detail { - -template -HWY_INLINE V SlideUpLanes(V v, size_t amt) { - const DFromV d; - using TU = UnsignedFromSize; - const Repartition du; - return BitCast(d, BitCast(du, v) << Set( - du, static_cast(amt * sizeof(TFromV) * 8))); -} - -template -HWY_INLINE V SlideUpLanes(V v, size_t amt) { - const DFromV d; - const Repartition du8; - const auto idx = - Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromV))); - return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); -} - -} // namespace detail - -template -HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return 
ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - case 4: - return ShiftLeftLanes<4>(d, v); - case 5: - return ShiftLeftLanes<5>(d, v); - case 6: - return ShiftLeftLanes<6>(d, v); - case 7: - return ShiftLeftLanes<7>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - case 4: - return ShiftLeftLanes<4>(d, v); - case 5: - return ShiftLeftLanes<5>(d, v); - case 6: - return ShiftLeftLanes<6>(d, v); - case 7: - return ShiftLeftLanes<7>(d, v); - case 8: - return ShiftLeftLanes<8>(d, v); - case 9: - return ShiftLeftLanes<9>(d, v); - case 10: - return ShiftLeftLanes<10>(d, v); - case 11: - return ShiftLeftLanes<11>(d, v); - case 12: - return ShiftLeftLanes<12>(d, v); - case 13: - return ShiftLeftLanes<13>(d, v); - case 14: - return ShiftLeftLanes<14>(d, v); - case 15: - return ShiftLeftLanes<15>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -// ------------------------------ SlideDownLanes - -namespace detail { - -template -HWY_INLINE V SlideDownLanes(V v, size_t amt) { - const DFromV d; - using TU = UnsignedFromSize; - const Repartition du; - return BitCast(d, - BitCast(du, v) << Set( - du, static_cast(TU{0} - amt * sizeof(TFromV) * 8))); -} - -template -HWY_INLINE V SlideDownLanes(V v, size_t amt) { - const DFromV d; - const Repartition di8; - auto idx = Iota(di8, static_cast(amt * sizeof(TFromV))); - idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); - return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); -} - -} // namespace detail - -template -HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return ShiftRightLanes<3>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return 
ShiftRightLanes<3>(d, v); - case 4: - return ShiftRightLanes<4>(d, v); - case 5: - return ShiftRightLanes<5>(d, v); - case 6: - return ShiftRightLanes<6>(d, v); - case 7: - return ShiftRightLanes<7>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return ShiftRightLanes<3>(d, v); - case 4: - return ShiftRightLanes<4>(d, v); - case 5: - return ShiftRightLanes<5>(d, v); - case 6: - return ShiftRightLanes<6>(d, v); - case 7: - return ShiftRightLanes<7>(d, v); - case 8: - return ShiftRightLanes<8>(d, v); - case 9: - return ShiftRightLanes<9>(d, v); - case 10: - return ShiftRightLanes<10>(d, v); - case 11: - return ShiftRightLanes<11>(d, v); - case 12: - return ShiftRightLanes<12>(d, v); - case 13: - return ShiftRightLanes<13>(d, v); - case 14: - return ShiftRightLanes<14>(d, v); - case 15: - return ShiftRightLanes<15>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) - -#if HWY_NEON_HAVE_BFLOAT16 - -template -HWY_API Vec128 ReorderWidenMulAccumulate(D /*d32*/, Vec128 a, - Vec128 b, - const Vec128 sum0, - Vec128& /*sum1*/) { - return Vec128(vbfdotq_f32(sum0.raw, a.raw, b.raw)); -} - -template -HWY_API VFromD ReorderWidenMulAccumulate( - D /*d32*/, VFromD> a, - VFromD> b, const VFromD sum0, - VFromD& /*sum1*/) { - return VFromD(vbfdot_f32(sum0.raw, a.raw, b.raw)); -} - -#else - -template >> -HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, - const VFromD sum0, - VFromD& sum1) { - const RebindToUnsigned du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); -} - -#endif // HWY_NEON_HAVE_BFLOAT16 - -template -HWY_API Vec128 ReorderWidenMulAccumulate(D /*d32*/, Vec128 a, - Vec128 b, - const Vec128 sum0, - Vec128& sum1) { -#if HWY_ARCH_ARM_A64 - sum1 = Vec128(vmlal_high_s16(sum1.raw, a.raw, b.raw)); -#else - const Full64 dh; - sum1 = Vec128( - vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); -#endif - return Vec128( - vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw)); -} - -template -HWY_API Vec64 ReorderWidenMulAccumulate(D d32, Vec64 a, - Vec64 b, - const Vec64 sum0, - Vec64& sum1) { - // vmlal writes into the upper half, which the caller cannot use, so - // split into two halves. 
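  // Data flow for this overload (a sketch of what follows): vmull_s16 widens
  // the four int16 products into one int32x4 {p0, p1, p2, p3}; the lower two
  // products are added to sum0 (returned) and the upper two to sum1, so that
  // RearrangeToOddPlusEven can later combine them into pairwise sums.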
- const Vec128 mul_3210(vmull_s16(a.raw, b.raw)); - const Vec64 mul_32 = UpperHalf(d32, mul_3210); - sum1 += mul_32; - return sum0 + LowerHalf(mul_3210); -} - -template -HWY_API Vec32 ReorderWidenMulAccumulate(D d32, Vec32 a, - Vec32 b, - const Vec32 sum0, - Vec32& sum1) { - const Vec128 mul_xx10(vmull_s16(a.raw, b.raw)); - const Vec64 mul_10(LowerHalf(mul_xx10)); - const Vec32 mul0 = LowerHalf(d32, mul_10); - const Vec32 mul1 = UpperHalf(d32, mul_10); - sum1 += mul1; - return sum0 + mul0; -} - -template -HWY_API Vec128 ReorderWidenMulAccumulate(D /*d32*/, - Vec128 a, - Vec128 b, - const Vec128 sum0, - Vec128& sum1) { -#if HWY_ARCH_ARM_A64 - sum1 = Vec128(vmlal_high_u16(sum1.raw, a.raw, b.raw)); -#else - const Full64 dh; - sum1 = Vec128( - vmlal_u16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); -#endif - return Vec128( - vmlal_u16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw)); -} - -template -HWY_API Vec64 ReorderWidenMulAccumulate(D d32, Vec64 a, - Vec64 b, - const Vec64 sum0, - Vec64& sum1) { - // vmlal writes into the upper half, which the caller cannot use, so - // split into two halves. - const Vec128 mul_3210(vmull_u16(a.raw, b.raw)); - const Vec64 mul_32 = UpperHalf(d32, mul_3210); - sum1 += mul_32; - return sum0 + LowerHalf(mul_3210); -} - -template -HWY_API Vec32 ReorderWidenMulAccumulate(D du32, Vec32 a, - Vec32 b, - const Vec32 sum0, - Vec32& sum1) { - const Vec128 mul_xx10(vmull_u16(a.raw, b.raw)); - const Vec64 mul_10(LowerHalf(mul_xx10)); - const Vec32 mul0 = LowerHalf(du32, mul_10); - const Vec32 mul1 = UpperHalf(du32, mul_10); - sum1 += mul1; - return sum0 + mul0; -} - -// ------------------------------ Combine partial (InterleaveLower) -// < 64bit input, <= 64 bit result -template -HWY_API VFromD Combine(D d, VFromD> hi, VFromD> lo) { - // First double N (only lower halves will be used). - const VFromD hi2(hi.raw); - const VFromD lo2(lo.raw); - // Repartition to two unsigned lanes (each the size of the valid input). - const Simd, 2, 0> du; - return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2))); -} - -// ------------------------------ RearrangeToOddPlusEven (Combine) - -template -HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, - Vec128 sum1) { -#if HWY_NEON_HAVE_BFLOAT16 - (void)sum1; // unused by bf16 ReorderWidenMulAccumulate - return sum0; -#else - return Add(sum0, sum1); -#endif -} - -HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, - Vec128 sum1) { -// vmlal_s16 multiplied the lower half into sum0 and upper into sum1. -#if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want - return Vec128(vpaddq_s32(sum0.raw, sum1.raw)); -#else - const Full128 d; - const Half d64; - const Vec64 hi( - vpadd_s32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw)); - const Vec64 lo( - vpadd_s32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw)); - return Combine(Full128(), hi, lo); -#endif -} - -HWY_API Vec64 RearrangeToOddPlusEven(Vec64 sum0, - Vec64 sum1) { - // vmlal_s16 multiplied the lower half into sum0 and upper into sum1. - return Vec64(vpadd_s32(sum0.raw, sum1.raw)); -} - -HWY_API Vec32 RearrangeToOddPlusEven(Vec32 sum0, - Vec32 sum1) { - // Only one widened sum per register, so add them for sum of odd and even. - return sum0 + sum1; -} - -HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, - Vec128 sum1) { -// vmlal_s16 multiplied the lower half into sum0 and upper into sum1. 
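  // Illustration: sum0 accumulated the products of lanes 0..3 and sum1 those
  // of lanes 4..7, so the pairwise add below returns
  // {sum0[0]+sum0[1], sum0[2]+sum0[3], sum1[0]+sum1[1], sum1[2]+sum1[3]},
  // i.e. the even+odd product sums in lane order.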
-#if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want - return Vec128(vpaddq_u32(sum0.raw, sum1.raw)); -#else - const Full128 d; - const Half d64; - const Vec64 hi( - vpadd_u32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw)); - const Vec64 lo( - vpadd_u32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw)); - return Combine(Full128(), hi, lo); -#endif -} - -HWY_API Vec64 RearrangeToOddPlusEven(Vec64 sum0, - Vec64 sum1) { - // vmlal_u16 multiplied the lower half into sum0 and upper into sum1. - return Vec64(vpadd_u32(sum0.raw, sum1.raw)); -} - -HWY_API Vec32 RearrangeToOddPlusEven(Vec32 sum0, - Vec32 sum1) { - // Only one widened sum per register, so add them for sum of odd and even. - return sum0 + sum1; -} - -// ------------------------------ WidenMulPairwiseAdd - -#if HWY_NEON_HAVE_BFLOAT16 - -template -HWY_API Vec128 WidenMulPairwiseAdd(D d32, Vec128 a, - Vec128 b) { - return Vec128(vbfdotq_f32(Zero(d32).raw, a.raw, b.raw)); -} - -template -HWY_API VFromD WidenMulPairwiseAdd(D d32, - VFromD> a, - VFromD> b) { - return VFromD(vbfdot_f32(Zero(d32).raw, a.raw, b.raw)); -} - -#else -template -HWY_API VFromD WidenMulPairwiseAdd( - D32 df32, VFromD> a, - VFromD> b) { - const RebindToUnsigned du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), - Mul(BitCast(df32, ao), BitCast(df32, bo))); -} -#endif // HWY_NEON_HAVE_BFLOAT16 - -template -HWY_API Vec128 WidenMulPairwiseAdd(D /*d32*/, Vec128 a, - Vec128 b) { - Vec128 sum1; -#if HWY_ARCH_ARM_A64 - sum1 = Vec128(vmull_high_s16(a.raw, b.raw)); -#else - const Full64 dh; - sum1 = Vec128(vmull_s16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); -#endif - Vec128 sum0 = - Vec128(vmull_s16(LowerHalf(a).raw, LowerHalf(b).raw)); - return RearrangeToOddPlusEven(sum0, sum1); -} - -template -HWY_API Vec64 WidenMulPairwiseAdd(D d32, Vec64 a, - Vec64 b) { - // vmlal writes into the upper half, which the caller cannot use, so - // split into two halves. - const Vec128 mul_3210(vmull_s16(a.raw, b.raw)); - const Vec64 mul0 = LowerHalf(mul_3210); - const Vec64 mul1 = UpperHalf(d32, mul_3210); - return RearrangeToOddPlusEven(mul0, mul1); -} - -template -HWY_API Vec32 WidenMulPairwiseAdd(D d32, Vec32 a, - Vec32 b) { - const Vec128 mul_xx10(vmull_s16(a.raw, b.raw)); - const Vec64 mul_10(LowerHalf(mul_xx10)); - const Vec32 mul0 = LowerHalf(d32, mul_10); - const Vec32 mul1 = UpperHalf(d32, mul_10); - return RearrangeToOddPlusEven(mul0, mul1); -} - -template -HWY_API Vec128 WidenMulPairwiseAdd(D /*d32*/, Vec128 a, - Vec128 b) { - Vec128 sum1; -#if HWY_ARCH_ARM_A64 - sum1 = Vec128(vmull_high_u16(a.raw, b.raw)); -#else - const Full64 dh; - sum1 = - Vec128(vmull_u16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); -#endif - Vec128 sum0 = - Vec128(vmull_u16(LowerHalf(a).raw, LowerHalf(b).raw)); - return RearrangeToOddPlusEven(sum0, sum1); -} - -template -HWY_API Vec64 WidenMulPairwiseAdd(D d32, Vec64 a, - Vec64 b) { - // vmlal writes into the upper half, which the caller cannot use, so - // split into two halves. 
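  // Scalar contract implemented here (a hedged restatement, not new
  // behavior): out[i] = uint32_t(a[2*i]) * b[2*i]
  //                     + uint32_t(a[2*i+1]) * b[2*i+1].
  // vmull_u16 supplies all four widened products at once; the pairwise
  // combination is then performed by RearrangeToOddPlusEven.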
- const Vec128 mul_3210(vmull_u16(a.raw, b.raw)); - const Vec64 mul0 = LowerHalf(mul_3210); - const Vec64 mul1 = UpperHalf(d32, mul_3210); - return RearrangeToOddPlusEven(mul0, mul1); -} - -template -HWY_API Vec32 WidenMulPairwiseAdd(D d32, Vec32 a, - Vec32 b) { - const Vec128 mul_xx10(vmull_u16(a.raw, b.raw)); - const Vec64 mul_10(LowerHalf(mul_xx10)); - const Vec32 mul0 = LowerHalf(d32, mul_10); - const Vec32 mul1 = UpperHalf(d32, mul_10); - return RearrangeToOddPlusEven(mul0, mul1); -} - -// ------------------------------ ZeroExtendVector (Combine) - -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { - return Combine(d, Zero(Half()), lo); -} - -// ------------------------------ ConcatLowerLower - -// 64 or 128-bit input: just interleave -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - // Treat half-width input as a single lane and interleave them. - const Repartition, decltype(d)> du; - return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi))); -} - -namespace detail { -#if HWY_ARCH_ARM_A64 -HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveEven, vtrn1, _, 2) -HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveOdd, vtrn2, _, 2) -#else - -// vtrn returns a struct with even and odd result. -#define HWY_NEON_BUILD_TPL_HWY_TRN -#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t -// Pass raw args so we can accept uint16x2 args, for which there is no -// corresponding uint16x2x2 return type. -#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \ - Raw128::type a, Raw128::type b -#define HWY_NEON_BUILD_ARG_HWY_TRN a, b - -// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined -// for full and half vectors. -HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN) -HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN) -HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN) -HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN) -HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN) -HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN) -HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN) -HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN) -HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN) -HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN) -HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN) -HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN) -HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN) -HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN) - -#undef HWY_NEON_BUILD_TPL_HWY_TRN -#undef HWY_NEON_BUILD_RET_HWY_TRN -#undef HWY_NEON_BUILD_PARAM_HWY_TRN -#undef HWY_NEON_BUILD_ARG_HWY_TRN - -#endif // HWY_ARCH_ARM_A64 -} // namespace detail - -// <= 32-bit input/output -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - // Treat half-width input as two lanes and take every second one. 
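  // Example with two u16 lanes per 32-bit input (values illustrative):
  // lo = {L0, L1}, hi = {H0, H1}. Taking every second half-width lane yields
  // {L0, H0}, i.e. the lower half of lo followed by the lower half of hi.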
- const Repartition, decltype(d)> du; -#if HWY_ARCH_ARM_A64 - return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi))); -#else - using VU = VFromD; - return BitCast( - d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) - .val[0])); -#endif -} - -// ------------------------------ ConcatUpperUpper - -// 64 or 128-bit input: just interleave -template -HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { - // Treat half-width input as a single lane and interleave them. - const Repartition, decltype(d)> du; - return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi))); -} - -// <= 32-bit input/output -template -HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { - // Treat half-width input as two lanes and take every second one. - const Repartition, decltype(d)> du; -#if HWY_ARCH_ARM_A64 - return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi))); -#else - using VU = VFromD; - return BitCast( - d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) - .val[1])); -#endif -} - -// ------------------------------ ConcatLowerUpper (ShiftLeftBytes) - -// 64 or 128-bit input: extract from concatenated -template -HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { - return CombineShiftRightBytes(d, hi, lo); -} - -// <= 32-bit input/output -template -HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { - constexpr size_t kSize = d.MaxBytes(); - const Repartition d8; - const Full64 d8x8; - const Full64> d64; - using V8x8 = VFromD; - const V8x8 hi8x8(BitCast(d8, hi).raw); - // Move into most-significant bytes - const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw)); - const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8); - // Back to original lane type, then shrink N. - return VFromD(BitCast(d64, r).raw); -} - -// ------------------------------ ConcatUpperLower - -// Works for all N. -template -HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { - return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); -} - -// ------------------------------ ConcatOdd (InterleaveUpper) - -namespace detail { -// There is no vuzpq_u64. -HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2) -HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2) -} // namespace detail - -// Full/half vector -template -HWY_API VFromD ConcatOdd(D /* tag */, VFromD hi, VFromD lo) { - return detail::ConcatOdd(lo, hi); -} - -// 8-bit x4 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec32 ConcatOdd(D d, Vec32 hi, Vec32 lo) { - const Twice d2; - const Repartition dw2; - const VFromD hi2(hi.raw); - const VFromD lo2(lo.raw); - const VFromD Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2)); - // Compact into two pairs of u8, skipping the invalid x lanes. Could also use - // vcopy_lane_u16, but that's A64-only. 
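  // Lane trace for 4x u8 inputs (illustrative): lo = {l0,l1,l2,l3},
  // hi = {h0,h1,h2,h3}. ConcatOdd on the doubled vectors gives
  // {l1,l3,x,x,h1,h3,x,x}; viewing that as u16 lanes and keeping the even
  // ones compacts it to the bytes {l1,l3,h1,h3}.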
- return Vec32(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw); -} - -// Any type x2 -template > -HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { - return InterleaveUpper(d, lo, hi); -} - -// ------------------------------ ConcatEven (InterleaveLower) - -// Full/half vector -template -HWY_API VFromD ConcatEven(D /* tag */, VFromD hi, VFromD lo) { - return detail::ConcatEven(lo, hi); -} - -// 8-bit x4 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec32 ConcatEven(D d, Vec32 hi, Vec32 lo) { - const Twice d2; - const Repartition dw2; - const VFromD hi2(hi.raw); - const VFromD lo2(lo.raw); - const VFromD Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2)); - // Compact into two pairs of u8, skipping the invalid x lanes. Could also use - // vcopy_lane_u16, but that's A64-only. - return Vec32(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw); -} - -// Any type x2 -template > -HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { - return InterleaveLower(d, lo, hi); -} - -// ------------------------------ DupEven (InterleaveLower) - -template -HWY_API Vec128 DupEven(Vec128 v) { -#if HWY_ARCH_ARM_A64 - return detail::InterleaveEven(v, v); -#else - return Vec128(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]); -#endif -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - return InterleaveLower(DFromV(), v, v); -} - -// ------------------------------ DupOdd (InterleaveUpper) - -template -HWY_API Vec128 DupOdd(Vec128 v) { -#if HWY_ARCH_ARM_A64 - return detail::InterleaveOdd(v, v); -#else - return Vec128(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]); -#endif -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return InterleaveUpper(DFromV(), v, v); -} - -// ------------------------------ OddEven (IfThenElse) - -template -HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { - const DFromV d; - const Repartition d8; - alignas(16) static constexpr uint8_t kBytes[16] = { - ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF, - ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF, - ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF, - ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF, - ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF, - ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF, - ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF, - ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 
0 : 0xFF, - }; - const auto vec = BitCast(d, Load(d8, kBytes)); - return IfThenElse(MaskFromVec(vec), b, a); -} - -// ------------------------------ OddEvenBlocks -template -HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { - return even; -} - -// ------------------------------ SwapAdjacentBlocks -template -HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { - return v; -} - -// ------------------------------ ReverseBlocks -// Single block: no change -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return v; -} - -// ------------------------------ ReorderDemote2To (OddEven) - -template >> -HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) { - const RebindToUnsigned du16; - return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); -} - -template -HWY_API Vec128 ReorderDemote2To(D d32, Vec128 a, - Vec128 b) { - const Vec64 a32(vqmovn_s64(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d32; - return Vec128(vqmovn_high_s64(a32.raw, b.raw)); -#else - const Vec64 b32(vqmovn_s64(b.raw)); - return Combine(d32, b32, a32); -#endif -} - -template -HWY_API VFromD ReorderDemote2To(D d32, VFromD> a, - VFromD> b) { - const Rebind dt; - return DemoteTo(d32, Combine(dt, b, a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d32, Vec128 a, - Vec128 b) { - const Vec64 a32(vqmovun_s64(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d32; - return Vec128(vqmovun_high_s64(a32.raw, b.raw)); -#else - const Vec64 b32(vqmovun_s64(b.raw)); - return Combine(d32, b32, a32); -#endif -} - -template -HWY_API VFromD ReorderDemote2To(D d32, VFromD> a, - VFromD> b) { - const Rebind dt; - return DemoteTo(d32, Combine(dt, b, a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d32, Vec128 a, - Vec128 b) { - const Vec64 a32(vqmovn_u64(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d32; - return Vec128(vqmovn_high_u64(a32.raw, b.raw)); -#else - const Vec64 b32(vqmovn_u64(b.raw)); - return Combine(d32, b32, a32); -#endif -} - -template -HWY_API VFromD ReorderDemote2To(D d32, VFromD> a, - VFromD> b) { - const Rebind dt; - return DemoteTo(d32, Combine(dt, b, a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d16, Vec128 a, - Vec128 b) { - const Vec64 a16(vqmovn_s32(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d16; - return Vec128(vqmovn_high_s32(a16.raw, b.raw)); -#else - const Vec64 b16(vqmovn_s32(b.raw)); - return Combine(d16, b16, a16); -#endif -} - -template -HWY_API Vec64 ReorderDemote2To(D /*d16*/, Vec64 a, - Vec64 b) { - const Full128 d32; - const Vec128 ab = Combine(d32, b, a); - return Vec64(vqmovn_s32(ab.raw)); -} - -template -HWY_API Vec32 ReorderDemote2To(D /*d16*/, Vec32 a, - Vec32 b) { - const Full128 d32; - const Vec64 ab(vzip1_s32(a.raw, b.raw)); - return Vec32(vqmovn_s32(Combine(d32, ab, ab).raw)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d16, Vec128 a, - Vec128 b) { - const Vec64 a16(vqmovun_s32(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d16; - return Vec128(vqmovun_high_s32(a16.raw, b.raw)); -#else - const Vec64 b16(vqmovun_s32(b.raw)); - return Combine(d16, b16, a16); -#endif -} - -template -HWY_API Vec64 ReorderDemote2To(D /*d16*/, Vec64 a, - Vec64 b) { - const Full128 d32; - const Vec128 ab = Combine(d32, b, a); - return Vec64(vqmovun_s32(ab.raw)); -} - -template -HWY_API Vec32 ReorderDemote2To(D /*d16*/, Vec32 a, - Vec32 b) { - const Full128 d32; - const Vec64 ab(vzip1_s32(a.raw, b.raw)); - return Vec32(vqmovun_s32(Combine(d32, ab, ab).raw)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d16, Vec128 a, - Vec128 b) { - const Vec64 a16(vqmovn_u32(a.raw)); -#if HWY_ARCH_ARM_A64 - 
(void)d16; - return Vec128(vqmovn_high_u32(a16.raw, b.raw)); -#else - const Vec64 b16(vqmovn_u32(b.raw)); - return Combine(d16, b16, a16); -#endif -} - -template -HWY_API Vec64 ReorderDemote2To(D /*d16*/, Vec64 a, - Vec64 b) { - const Full128 d32; - const Vec128 ab = Combine(d32, b, a); - return Vec64(vqmovn_u32(ab.raw)); -} - -template -HWY_API Vec32 ReorderDemote2To(D /*d16*/, Vec32 a, - Vec32 b) { - const Full128 d32; - const Vec64 ab(vzip1_u32(a.raw, b.raw)); - return Vec32(vqmovn_u32(Combine(d32, ab, ab).raw)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d8, Vec128 a, - Vec128 b) { - const Vec64 a8(vqmovn_s16(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d8; - return Vec128(vqmovn_high_s16(a8.raw, b.raw)); -#else - const Vec64 b8(vqmovn_s16(b.raw)); - return Combine(d8, b8, a8); -#endif -} - -template -HWY_API VFromD ReorderDemote2To(D d8, VFromD> a, - VFromD> b) { - const Rebind dt; - return DemoteTo(d8, Combine(dt, b, a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d8, Vec128 a, - Vec128 b) { - const Vec64 a8(vqmovun_s16(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d8; - return Vec128(vqmovun_high_s16(a8.raw, b.raw)); -#else - const Vec64 b8(vqmovun_s16(b.raw)); - return Combine(d8, b8, a8); -#endif -} - -template -HWY_API VFromD ReorderDemote2To(D d8, VFromD> a, - VFromD> b) { - const Rebind dt; - return DemoteTo(d8, Combine(dt, b, a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D d8, Vec128 a, - Vec128 b) { - const Vec64 a8(vqmovn_u16(a.raw)); -#if HWY_ARCH_ARM_A64 - (void)d8; - return Vec128(vqmovn_high_u16(a8.raw, b.raw)); -#else - const Vec64 b8(vqmovn_u16(b.raw)); - return Combine(d8, b8, a8); -#endif -} - -template -HWY_API VFromD ReorderDemote2To(D d8, VFromD> a, - VFromD> b) { - const Rebind dt; - return DemoteTo(d8, Combine(dt, b, a)); -} - -template ), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - return ReorderDemote2To(d, a, b); -} - -template >> -HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) { - return ReorderDemote2To(dbf16, a, b); -} - -// ================================================== CRYPTO - -// (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH). -// Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*. -#if HWY_TARGET == HWY_NEON - -#ifdef HWY_NATIVE_AES -#undef HWY_NATIVE_AES -#else -#define HWY_NATIVE_AES -#endif - -HWY_API Vec128 AESRound(Vec128 state, - Vec128 round_key) { - // NOTE: it is important that AESE and AESMC be consecutive instructions so - // they can be fused. AESE includes AddRoundKey, which is a different ordering - // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual - // round key (the compiler will hopefully optimize this for multiple rounds). - return Vec128(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^ - round_key; -} - -HWY_API Vec128 AESLastRound(Vec128 state, - Vec128 round_key) { - return Vec128(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key; -} - -HWY_API Vec128 AESInvMixColumns(Vec128 state) { - return Vec128{vaesimcq_u8(state.raw)}; -} - -HWY_API Vec128 AESRoundInv(Vec128 state, - Vec128 round_key) { - // NOTE: it is important that AESD and AESIMC be consecutive instructions so - // they can be fused. AESD includes AddRoundKey, which is a different ordering - // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual - // round key (the compiler will hopefully optimize this for multiple rounds). 
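  // Background on the two orderings (hedged summary): Arm's AESD computes
  // InvSubBytes(InvShiftRows(state ^ key)), adding the key first, whereas the
  // AES-NI-style AESRoundInv adds it last. Passing a zero key to AESD and
  // XORing round_key after AESIMC therefore yields
  // InvMixColumns(InvSubBytes(InvShiftRows(state))) ^ round_key.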
- return Vec128(vaesimcq_u8(vaesdq_u8(state.raw, vdupq_n_u8(0)))) ^ - round_key; -} - -HWY_API Vec128 AESLastRoundInv(Vec128 state, - Vec128 round_key) { - return Vec128(vaesdq_u8(state.raw, vdupq_n_u8(0))) ^ round_key; -} - -HWY_API Vec128 CLMulLower(Vec128 a, Vec128 b) { - return Vec128((uint64x2_t)vmull_p64(GetLane(a), GetLane(b))); -} - -HWY_API Vec128 CLMulUpper(Vec128 a, Vec128 b) { - return Vec128( - (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw)); -} - -#endif // HWY_TARGET == HWY_NEON - -// ================================================== MISC - -template -HWY_API VFromD PromoteTo(D df32, VFromD> v) { - const Rebind du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); -} - -// ------------------------------ Truncations - -template , typename TFrom, - HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo), - hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* = nullptr> -HWY_API Vec128 TruncateTo(DTo /* tag */, Vec128 v) { - const Repartition> d; - return Vec128{BitCast(d, v).raw}; -} - -template -HWY_API Vec16 TruncateTo(D /* tag */, Vec128 v) { - const Repartition> d; - const auto v1 = BitCast(d, v); - const auto v2 = detail::ConcatEven(v1, v1); - const auto v3 = detail::ConcatEven(v2, v2); - const auto v4 = detail::ConcatEven(v3, v3); - return LowerHalf(LowerHalf(LowerHalf(v4))); -} - -template -HWY_API Vec32 TruncateTo(D /* tag */, Vec128 v) { - const Repartition> d; - const auto v1 = BitCast(d, v); - const auto v2 = detail::ConcatEven(v1, v1); - const auto v3 = detail::ConcatEven(v2, v2); - return LowerHalf(LowerHalf(v3)); -} - -template -HWY_API Vec64 TruncateTo(D /* tag */, Vec128 v) { - const Repartition> d; - const auto v1 = BitCast(d, v); - const auto v2 = detail::ConcatEven(v1, v1); - return LowerHalf(v2); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const Repartition> d; - const auto v1 = BitCast(d, v); - const auto v2 = detail::ConcatEven(v1, v1); - const auto v3 = detail::ConcatEven(v2, v2); - return LowerHalf(LowerHalf(v3)); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const Repartition> d; - const auto v1 = BitCast(d, v); - const auto v2 = detail::ConcatEven(v1, v1); - return LowerHalf(v2); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const Repartition> d; - const auto v1 = BitCast(d, v); - const auto v2 = detail::ConcatEven(v1, v1); - return LowerHalf(v2); -} - -// ------------------------------ MulEven (ConcatEven) - -// Multiplies even lanes (0, 2 ..) and places the double-wide result into -// even and the upper half into its odd neighbor lane. 
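// Scalar model for 32-bit lanes (MulEvenScalar is a hypothetical name): each
// double-width output lane j holds the full product of the even-indexed
// inputs, wide[j] = uint64_t(a[2*j]) * b[2*j].
static inline uint64_t MulEvenScalar(uint32_t a_even, uint32_t b_even) {
  return static_cast<uint64_t>(a_even) * b_even;
}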
-HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { - const DFromV d; - int8x16_t a_packed = ConcatEven(d, a, a).raw; - int8x16_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed))); -} -HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { - const DFromV d; - uint8x16_t a_packed = ConcatEven(d, a, a).raw; - uint8x16_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed))); -} -HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { - const DFromV d; - int16x8_t a_packed = ConcatEven(d, a, a).raw; - int16x8_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed))); -} -HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { - const DFromV d; - uint16x8_t a_packed = ConcatEven(d, a, a).raw; - uint16x8_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed))); -} -HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { - const DFromV d; - int32x4_t a_packed = ConcatEven(d, a, a).raw; - int32x4_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed))); -} -HWY_API Vec128 MulEven(Vec128 a, Vec128 b) { - const DFromV d; - uint32x4_t a_packed = ConcatEven(d, a, a).raw; - uint32x4_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed))); -} - -template -HWY_API Vec128 MulEven(Vec128 a, - Vec128 b) { - const DFromV d; - int8x8_t a_packed = ConcatEven(d, a, a).raw; - int8x8_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vget_low_s16(vmull_s8(a_packed, b_packed))); -} -template -HWY_API Vec128 MulEven(Vec128 a, - Vec128 b) { - const DFromV d; - uint8x8_t a_packed = ConcatEven(d, a, a).raw; - uint8x8_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vget_low_u16(vmull_u8(a_packed, b_packed))); -} -template -HWY_API Vec128 MulEven(Vec128 a, - Vec128 b) { - const DFromV d; - int16x4_t a_packed = ConcatEven(d, a, a).raw; - int16x4_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vget_low_s32(vmull_s16(a_packed, b_packed))); -} -template -HWY_API Vec128 MulEven(Vec128 a, - Vec128 b) { - const DFromV d; - uint16x4_t a_packed = ConcatEven(d, a, a).raw; - uint16x4_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vget_low_u32(vmull_u16(a_packed, b_packed))); -} -template -HWY_API Vec128 MulEven(Vec128 a, - Vec128 b) { - const DFromV d; - int32x2_t a_packed = ConcatEven(d, a, a).raw; - int32x2_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vget_low_s64(vmull_s32(a_packed, b_packed))); -} -template -HWY_API Vec128 MulEven(Vec128 a, - Vec128 b) { - const DFromV d; - uint32x2_t a_packed = ConcatEven(d, a, a).raw; - uint32x2_t b_packed = ConcatEven(d, b, b).raw; - return Vec128( - vget_low_u64(vmull_u32(a_packed, b_packed))); -} - -HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) { - uint64_t hi; - uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi); - return Vec128(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1)); -} - -// Multiplies odd lanes (1, 3 ..) and places the double-wide result into -// even and the upper half into its odd neighbor lane. 
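// Illustrative sketch (assumed helper name, not from this header): the u64
// MulEven above (and MulOdd below) builds the full 128-bit product of one lane
// pair via a Mul128(lo, hi) helper. A reference formulation, assuming the
// GCC/Clang unsigned __int128 extension is available:
#include <stdint.h>
static inline uint64_t Mul128Ref(uint64_t a, uint64_t b, uint64_t* hi) {
  const unsigned __int128 p = static_cast<unsigned __int128>(a) * b;
  *hi = static_cast<uint64_t>(p >> 64);   // upper 64 bits of the product
  return static_cast<uint64_t>(p);        // lower 64 bits
}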
-HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { - const DFromV d; - int8x16_t a_packed = ConcatOdd(d, a, a).raw; - int8x16_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed))); -} -HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { - const DFromV d; - uint8x16_t a_packed = ConcatOdd(d, a, a).raw; - uint8x16_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed))); -} -HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { - const DFromV d; - int16x8_t a_packed = ConcatOdd(d, a, a).raw; - int16x8_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed))); -} -HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { - const DFromV d; - uint16x8_t a_packed = ConcatOdd(d, a, a).raw; - uint16x8_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed))); -} -HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { - const DFromV d; - int32x4_t a_packed = ConcatOdd(d, a, a).raw; - int32x4_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed))); -} -HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) { - const DFromV d; - uint32x4_t a_packed = ConcatOdd(d, a, a).raw; - uint32x4_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed))); -} - -template -HWY_API Vec128 MulOdd(Vec128 a, - Vec128 b) { - const DFromV d; - int8x8_t a_packed = ConcatOdd(d, a, a).raw; - int8x8_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vget_low_s16(vmull_s8(a_packed, b_packed))); -} -template -HWY_API Vec128 MulOdd(Vec128 a, - Vec128 b) { - const DFromV d; - uint8x8_t a_packed = ConcatOdd(d, a, a).raw; - uint8x8_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vget_low_u16(vmull_u8(a_packed, b_packed))); -} -template -HWY_API Vec128 MulOdd(Vec128 a, - Vec128 b) { - const DFromV d; - int16x4_t a_packed = ConcatOdd(d, a, a).raw; - int16x4_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vget_low_s32(vmull_s16(a_packed, b_packed))); -} -template -HWY_API Vec128 MulOdd(Vec128 a, - Vec128 b) { - const DFromV d; - uint16x4_t a_packed = ConcatOdd(d, a, a).raw; - uint16x4_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vget_low_u32(vmull_u16(a_packed, b_packed))); -} -template -HWY_API Vec128 MulOdd(Vec128 a, - Vec128 b) { - const DFromV d; - int32x2_t a_packed = ConcatOdd(d, a, a).raw; - int32x2_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vget_low_s64(vmull_s32(a_packed, b_packed))); -} -template -HWY_API Vec128 MulOdd(Vec128 a, - Vec128 b) { - const DFromV d; - uint32x2_t a_packed = ConcatOdd(d, a, a).raw; - uint32x2_t b_packed = ConcatOdd(d, b, b).raw; - return Vec128( - vget_low_u64(vmull_u32(a_packed, b_packed))); -} - -HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) { - uint64_t hi; - uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi); - return Vec128(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1)); -} - -// ------------------------------ TableLookupBytes (Combine, LowerHalf) - -// Both full -template -HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) { - const DFromV d; - const Repartition d8; -#if HWY_ARCH_ARM_A64 - return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw, - BitCast(d8, from).raw))); -#else - uint8x16_t table0 = BitCast(d8, bytes).raw; - uint8x8x2_t table; - table.val[0] = vget_low_u8(table0); - table.val[1] = vget_high_u8(table0); - uint8x16_t idx = 
BitCast(d8, from).raw; - uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx)); - uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx)); - return BitCast(d, Vec128(vcombine_u8(low, hi))); -#endif -} - -// Partial index vector -template -HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) { - const Full128 d_full; - const Vec64 from64(from.raw); - const auto idx_full = Combine(d_full, from64, from64); - const auto out_full = TableLookupBytes(bytes, idx_full); - return Vec128(LowerHalf(Half(), out_full).raw); -} - -// Partial table vector -template -HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) { - const Full128 d_full; - return TableLookupBytes(Combine(d_full, bytes, bytes), from); -} - -// Partial both -template -HWY_API Vec128 TableLookupBytes(Vec128 bytes, - Vec128 from) { - const DFromV d; - const Simd d_idx; - const Repartition d_idx8; - // uint8x8 - const auto bytes8 = BitCast(Repartition(), bytes); - const auto from8 = BitCast(d_idx8, from); - const VFromD v8(vtbl1_u8(bytes8.raw, from8.raw)); - return BitCast(d_idx, v8); -} - -// For all vector widths; Arm anyway zeroes if >= 0x10. -template -HWY_API VI TableLookupBytesOr0(V bytes, VI from) { - return TableLookupBytes(bytes, from); -} - -// ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes) - -#if HWY_TARGET == HWY_NEON -template -HWY_API Vec128 AESKeyGenAssist(Vec128 v) { - alignas(16) static constexpr uint8_t kRconXorMask[16] = { - 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; - alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { - 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; - const DFromV d; - const Repartition du32; - const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); - const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask)); - return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle)); -} -#endif // HWY_TARGET == HWY_NEON - -// ------------------------------ Scatter in generic_ops-inl.h -// ------------------------------ Gather in generic_ops-inl.h - -// ------------------------------ Reductions - -namespace detail { - -// N=1 for any T: no-op -template -HWY_INLINE T ReduceMin(hwy::SizeTag /* tag */, Vec128 v) { - return GetLane(v); -} -template -HWY_INLINE T ReduceMax(hwy::SizeTag /* tag */, Vec128 v) { - return GetLane(v); -} -template -HWY_INLINE T ReduceSum(hwy::SizeTag /* tag */, Vec128 v) { - return GetLane(v); -} -template -HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag /* tag */, - Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag /* tag */, - Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag /* tag */, - Vec128 v) { - return v; -} - -// full vectors -#if HWY_ARCH_ARM_A64 - -// TODO(janwas): use normal HWY_NEON_DEF, then FULL type list. -#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \ - HWY_API type##_t name(hwy::SizeTag, \ - Vec128 v) { \ - return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \ - } - -// Excludes u64/s64 (missing minv/maxv) and f16 (missing addv). 
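// Illustrative sketch (assumed names, not from this header): on AArch64 the
// reduction macros in this section expand to single across-vector intrinsics,
// e.g. for uint32x4_t:
#if defined(__aarch64__)
#include <arm_neon.h>
static inline uint32_t ReduceSumU32x4Ref(uint32x4_t v) { return vaddvq_u32(v); }
static inline uint32_t ReduceMinU32x4Ref(uint32x4_t v) { return vminvq_u32(v); }
static inline uint32_t ReduceMaxU32x4Ref(uint32x4_t v) { return vmaxvq_u32(v); }
#endif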
-#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \ - HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8) \ - HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8) \ - HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16) \ - HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16) \ - HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32) \ - HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32) \ - HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8) \ - HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8) \ - HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16) \ - HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16) \ - HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32) \ - HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32) \ - HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32) \ - HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \ - HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64) - -// Different interface than HWY_NEON_DEF_FUNCTION_FULL_UI_64. -#define HWY_NEON_DEF_REDUCTION_UI64(name, prefix) \ - HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \ - HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64) - -#if HWY_HAVE_FLOAT16 -#define HWY_NEON_DEF_REDUCTION_F16(name, prefix) \ - HWY_NEON_DEF_REDUCTION(float16, 4, name, prefix, _, f16) \ - HWY_NEON_DEF_REDUCTION(float16, 8, name, prefix##q, _, f16) -#else -#define HWY_NEON_DEF_REDUCTION_F16(name, prefix) -#endif - -HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv) -HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv) -HWY_NEON_DEF_REDUCTION_F16(ReduceMin, vminv) -HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv) - -HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv) -HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv) - -#if HWY_HAVE_FLOAT16 -HWY_API float16_t ReduceSum(hwy::SizeTag<2>, Vec64 v) { - const float16x4_t x2 = vpadd_f16(v.raw, v.raw); - return GetLane(Vec64(vpadd_f16(x2, x2))); -} -HWY_API float16_t ReduceSum(hwy::SizeTag<2> tag, Vec128 v) { - return ReduceSum(tag, LowerHalf(Vec128(vpaddq_f16(v.raw, v.raw)))); -} -#endif - -#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES -#undef HWY_NEON_DEF_REDUCTION_F16 -#undef HWY_NEON_DEF_REDUCTION_UI64 -#undef HWY_NEON_DEF_REDUCTION - -// Need some fallback implementations for [ui]64x2 and [ui]16x2. -#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2) -#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, (1 << 8) | (1 << 2)) - -// Implement Min/Max/SumOfLanes in terms of the corresponding reduction. -template -HWY_API V MinOfLanes(hwy::SizeTag tag, V v) { - return Set(DFromV(), ReduceMin(tag, v)); -} -template -HWY_API V MaxOfLanes(hwy::SizeTag tag, V v) { - return Set(DFromV(), ReduceMax(tag, v)); -} -template -HWY_API V SumOfLanes(hwy::SizeTag tag, V v) { - return Set(DFromV(), ReduceSum(tag, v)); -} - -#else - -// For arm7, we implement reductions using a series of pairwise operations. This -// produces the full vector result, so we express Reduce* in terms of *OfLanes. 
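// Illustrative sketch (assumed helper name, not from this header): without
// across-vector adds on Armv7, the pairwise-reduction macros below fold the
// vector repeatedly; a standalone sum for uint32x4_t might look like:
#include <arm_neon.h>
static inline uint32_t SumLanesU32x4PairwiseRef(uint32x4_t v) {
  // {v2+v3, v0+v1}
  uint32x2_t folded = vpadd_u32(vget_high_u32(v), vget_low_u32(v));
  // Both lanes now hold v0+v1+v2+v3.
  folded = vpadd_u32(folded, folded);
  return vget_lane_u32(folded, 0);
}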
-#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t -#define HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) Vec128 -#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \ - HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \ - hwy::SizeTag, Vec128 v) { \ - HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \ - if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \ - if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \ - return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size)(tmp); \ - } \ - HWY_API type##_t Reduce##name(hwy::SizeTag tag, \ - Vec128 v) { \ - return GetLane(name##OfLanes(tag, v)); \ - } - -// For the wide versions, the pairwise operations produce a half-length vector. -// We produce that value with a Reduce*Vector helper method, and express Reduce* -// and *OfLanes in terms of the helper. -#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \ - suffix) \ - HWY_API HWY_NEON_BUILD_TYPE_T(type, half) \ - Reduce##name##Vector(Vec128 v) { \ - HWY_NEON_BUILD_TYPE_T(type, half) tmp; \ - tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \ - vget_low_##suffix(v.raw)); \ - if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \ - if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \ - if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \ - return tmp; \ - } \ - HWY_API type##_t Reduce##name(hwy::SizeTag, \ - Vec128 v) { \ - const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \ - return HWY_NEON_EVAL(vget_lane_##suffix, tmp, 0); \ - } \ - HWY_API HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION(type, size) name##OfLanes( \ - hwy::SizeTag, Vec128 v) { \ - const HWY_NEON_BUILD_TYPE_T(type, half) tmp = Reduce##name##Vector(v); \ - return HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION( \ - type, size)(vcombine_##suffix(tmp, tmp)); \ - } - -#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \ - HWY_NEON_DEF_PAIRWISE_REDUCTION(uint32, 2, name, prefix, u32) \ - HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \ - HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \ - HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32) \ - HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \ - HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \ - HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32) \ - HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32) \ - HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \ - HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \ - HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32) \ - HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \ - HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8) \ - HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32) - -HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Sum, vpadd) -HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Min, vpmin) -HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax) - -#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS -#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION -#undef HWY_NEON_DEF_PAIRWISE_REDUCTION -#undef HWY_NEON_BUILD_RET_PAIRWISE_REDUCTION -#undef HWY_NEON_BUILD_TYPE_T - -// Need fallback min/max implementations for [ui]64x2 and [ui]16x2. 
-#define HWY_IF_SUM_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8) -#define HWY_IF_MINMAX_REDUCTION(T) HWY_IF_T_SIZE_ONE_OF(T, 1 << 2 | 1 << 8) - -#endif - -} // namespace detail - -// [ui]16/[ui]64: N=2 -- special case for pairs of very small or large lanes -template -HWY_API Vec128 SumOfLanes(D /* tag */, Vec128 v10) { - return v10 + Reverse2(Simd(), v10); -} - -template -HWY_API T ReduceSum(D d, Vec128 v10) { - return GetLane(SumOfLanes(d, v10)); -} - -template -HWY_API Vec128 MinOfLanes(D /* tag */, Vec128 v10) { - return Min(v10, Reverse2(Simd(), v10)); -} -template -HWY_API Vec128 MaxOfLanes(D /* tag */, Vec128 v10) { - return Max(v10, Reverse2(Simd(), v10)); -} - -#undef HWY_IF_SUM_REDUCTION -#undef HWY_IF_MINMAX_REDUCTION - -template -HWY_API VFromD SumOfLanes(D /* tag */, VFromD v) { - return detail::SumOfLanes(hwy::SizeTag)>(), v); -} -template -HWY_API TFromD ReduceSum(D /* tag */, VFromD v) { - return detail::ReduceSum(hwy::SizeTag)>(), v); -} -template -HWY_API VFromD MinOfLanes(D /* tag */, VFromD v) { - return detail::MinOfLanes(hwy::SizeTag)>(), v); -} -template -HWY_API VFromD MaxOfLanes(D /* tag */, VFromD v) { - return detail::MaxOfLanes(hwy::SizeTag)>(), v); -} - -// ------------------------------ LoadMaskBits (TestBit) - -namespace detail { - -// Helper function to set 64 bits and potentially return a smaller vector. The -// overload is required to call the q vs non-q intrinsics. Note that 8-bit -// LoadMaskBits only requires 16 bits, but 64 avoids casting. -template -HWY_INLINE VFromD Set64(D /* tag */, uint64_t mask_bits) { - const auto v64 = Vec64(vdup_n_u64(mask_bits)); - return VFromD(BitCast(Full64>(), v64).raw); -} -template -HWY_INLINE Vec128 Set64(Full128 d, uint64_t mask_bits) { - return BitCast(d, Vec128(vdupq_n_u64(mask_bits))); -} - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - // Easier than Set(), which would require an >8-bit type, which would not - // compile for T=uint8_t, N=1. - const auto vmask_bits = Set64(du, mask_bits); - - // Replicate bytes 8x such that each byte contains the bit that governs it. - alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1}; - const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8)); - - alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, - 1, 2, 4, 8, 16, 32, 64, 128}; - return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - const auto vmask_bits = Set(du, static_cast(mask_bits)); - return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; - const auto vmask_bits = Set(du, static_cast(mask_bits)); - return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; - return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); -} - -} // namespace detail - -// `p` points to at least 8 readable bytes, not all of which need be valid. 
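// Illustrative sketch (assumed helper name, not from this header): a scalar
// model of the bit order consumed by the LoadMaskBits overloads above - lane i
// is active iff bit (i & 7) of byte (i >> 3) is set, i.e. LSB-first per byte.
#include <stddef.h>
#include <stdint.h>
static inline void LoadMaskBitsRef(const uint8_t* bits, size_t lanes,
                                   bool* mask) {
  for (size_t i = 0; i < lanes; ++i) {
    mask[i] = ((bits[i >> 3] >> (i & 7)) & 1) != 0;
  }
}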
-template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - uint64_t mask_bits = 0; - CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits); - return detail::LoadMaskBits(d, mask_bits); -} - -// ------------------------------ Mask - -namespace detail { - -// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than -// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse. -template -HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) { - const Full128 du16; - const Vec128 vu16 = BitCast(du16, VecFromMask(d, mask)); - const Vec64 nib(vshrn_n_u16(vu16.raw, 4)); - return GetLane(BitCast(Full64(), nib)); -} - -template -HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) { - // There is no vshrn_n_u16 for uint16x4, so zero-extend. - const Twice d2; - const VFromD v128 = ZeroExtendVector(d2, VecFromMask(d, mask)); - // No need to mask, upper half is zero thanks to ZeroExtendVector. - return NibblesFromMask(d2, MaskFromVec(v128)); -} - -template -HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) { - const Mask64> mask64(mask.raw); - const uint64_t nib = NibblesFromMask(Full64>(), mask64); - // Clear nibbles from upper half of 64-bits - return nib & ((1ull << (d.MaxBytes() * 4)) - 1); -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { - alignas(16) static constexpr uint8_t kSliceLanes[16] = { - 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, - }; - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes); - -#if HWY_ARCH_ARM_A64 - // Can't vaddv - we need two separate bytes (16 bits). - const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); - const uint8x8_t x4 = vpadd_u8(x2, x2); - const uint8x8_t x8 = vpadd_u8(x4, x4); - return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF; -#else - // Don't have vpaddq, so keep doubling lane size. - const uint16x8_t x2 = vpaddlq_u8(values.raw); - const uint32x4_t x4 = vpaddlq_u16(x2); - const uint64x2_t x8 = vpaddlq_u32(x4); - return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); -#endif -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { - // Upper lanes of partial loads are undefined. OnlyActive will fix this if - // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
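// Illustrative sketch (assumed helper name, not from this header): the
// NibblesFromMask helpers earlier in this block compress a byte mask (0xFF or
// 0x00 per lane) into one nibble per lane via a narrowing shift. Standalone
// version of the full-vector case:
#include <arm_neon.h>
#include <stdint.h>
static inline uint64_t NibblesFromMaskU8x16Ref(uint8x16_t mask_ff) {
  // Viewing byte pairs as u16 and narrowing-shifting right by 4 keeps one
  // nibble per original byte lane: 16 lanes -> 64 bits.
  const uint8x8_t nib = vshrn_n_u16(vreinterpretq_u16_u8(mask_ff), 4);
  return vget_lane_u64(vreinterpret_u64_u8(nib), 0);
}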
- alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8, - 0x10, 0x20, 0x40, 0x80}; - const DFromM d; - const RebindToUnsigned du; - const Vec128 slice(Load(Full64(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; - -#if HWY_ARCH_ARM_A64 - return vaddv_u8(values.raw); -#else - const uint16x4_t x2 = vpaddl_u8(values.raw); - const uint32x2_t x4 = vpaddl_u16(x2); - const uint64x1_t x8 = vpaddl_u32(x4); - return vget_lane_u64(x8, 0); -#endif -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { - alignas(16) static constexpr uint16_t kSliceLanes[8] = { - 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; - const Full128 d; - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); -#if HWY_ARCH_ARM_A64 - return vaddvq_u16(values.raw); -#else - const uint32x4_t x2 = vpaddlq_u16(values.raw); - const uint64x2_t x4 = vpaddlq_u32(x2); - return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1); -#endif -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { - // Upper lanes of partial loads are undefined. OnlyActive will fix this if - // we load all kSliceLanes so the upper lanes do not pollute the valid bits. - alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8}; - const DFromM d; - const RebindToUnsigned du; - const Vec128 slice(Load(Full64(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; -#if HWY_ARCH_ARM_A64 - return vaddv_u16(values.raw); -#else - const uint32x2_t x2 = vpaddl_u16(values.raw); - const uint64x1_t x4 = vpaddl_u32(x2); - return vget_lane_u64(x4, 0); -#endif -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { - alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8}; - const Full128 d; - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); -#if HWY_ARCH_ARM_A64 - return vaddvq_u32(values.raw); -#else - const uint64x2_t x2 = vpaddlq_u32(values.raw); - return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1); -#endif -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { - // Upper lanes of partial loads are undefined. OnlyActive will fix this if - // we load all kSliceLanes so the upper lanes do not pollute the valid bits. - alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2}; - const DFromM d; - const RebindToUnsigned du; - const Vec128 slice(Load(Full64(), kSliceLanes).raw); - const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice; -#if HWY_ARCH_ARM_A64 - return vaddv_u32(values.raw); -#else - const uint64x1_t x2 = vpaddl_u32(values.raw); - return vget_lane_u64(x2, 0); -#endif -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { - alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2}; - const Full128 d; - const Full128 du; - const Vec128 values = - BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes); -#if HWY_ARCH_ARM_A64 - return vaddvq_u64(values.raw); -#else - return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1); -#endif -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) { - const Full64 d; - const Full64 du; - const Vec64 values = BitCast(du, VecFromMask(d, m)) & Set(du, 1); - return vget_lane_u64(values.raw, 0); -} - -// Returns the lowest N for the BitsFromMask result. 
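// Illustrative sketch (assumed helper name, not from this header): the
// BitsFromMask overloads above AND each all-ones lane with its unique bit
// value, then sum across lanes; because the bits are disjoint the sum is the
// packed mask. For uint16x8_t on AArch64:
#if defined(__aarch64__)
#include <arm_neon.h>
#include <stdint.h>
static inline uint64_t BitsFromMaskU16x8Ref(uint16x8_t mask_ff) {
  alignas(16) static const uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const uint16x8_t bits = vandq_u16(mask_ff, vld1q_u16(kBit));
  return vaddvq_u16(bits);  // at most 0xFF, fits easily
}
#endif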
-template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1)); -} - -template -HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -// Returns number of lanes whose mask is set. -// -// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op -// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also -// changes each lane to 1 (if mask set) or 0. -// NOTE: PopCount also operates on vectors, so we still have to do horizontal -// sums separately. We specialize CountTrue for full vectors (negating instead -// of PopCount because it avoids an extra shift), and use PopCount of -// NibblesFromMask for partial vectors. - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, Mask128 mask) { - const Full128 di; - const int8x16_t ones = - vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw); - -#if HWY_ARCH_ARM_A64 - return static_cast(vaddvq_s8(ones)); -#else - const int16x8_t x2 = vpaddlq_s8(ones); - const int32x4_t x4 = vpaddlq_s16(x2); - const int64x2_t x8 = vpaddlq_s32(x4); - return static_cast(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1)); -#endif -} -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, Mask128 mask) { - const Full128 di; - const int16x8_t ones = - vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw); - -#if HWY_ARCH_ARM_A64 - return static_cast(vaddvq_s16(ones)); -#else - const int32x4_t x2 = vpaddlq_s16(ones); - const int64x2_t x4 = vpaddlq_s32(x2); - return static_cast(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1)); -#endif -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, Mask128 mask) { - const Full128 di; - const int32x4_t ones = - vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw); - -#if HWY_ARCH_ARM_A64 - return static_cast(vaddvq_s32(ones)); -#else - const int64x2_t x2 = vpaddlq_s32(ones); - return static_cast(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1)); -#endif -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, Mask128 mask) { -#if HWY_ARCH_ARM_A64 - const Full128 di; - const int64x2_t ones = - vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw); - return static_cast(vaddvq_s64(ones)); -#else - const Full128 du; - const auto mask_u = VecFromMask(du, RebindMask(du, mask)); - const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); - return static_cast(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1)); -#endif -} - -} // namespace detail - -// Full -template > -HWY_API size_t CountTrue(D /* tag */, Mask128 mask) { - return detail::CountTrue(hwy::SizeTag(), mask); -} - -// Partial -template -HWY_API size_t CountTrue(D d, MFromD mask) { - constexpr int kDiv = 4 * sizeof(TFromD); - return PopCount(detail::NibblesFromMask(d, mask)) / kDiv; -} - -template -HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { - const uint64_t nib = detail::NibblesFromMask(d, mask); - constexpr size_t kDiv = 4 * sizeof(TFromD); - return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv; -} - -template -HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { - const uint64_t nib = detail::NibblesFromMask(d, mask); - if (nib == 0) return -1; - constexpr size_t kDiv = 4 * sizeof(TFromD); - return static_cast(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv); -} - -template -HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { - const uint64_t nib = detail::NibblesFromMask(d, mask); - constexpr size_t kDiv = 4 * sizeof(TFromD); - return (63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv; 
-} - -template -HWY_API intptr_t FindLastTrue(D d, MFromD mask) { - const uint64_t nib = detail::NibblesFromMask(d, mask); - if (nib == 0) return -1; - constexpr size_t kDiv = 4 * sizeof(TFromD); - return static_cast((63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / - kDiv); -} - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = detail::BitsFromMask(mask); - const size_t kNumBytes = (d.MaxLanes() + 7) / 8; - CopyBytes(&mask_bits, bits); - return kNumBytes; -} - -template -HWY_API bool AllFalse(D d, MFromD m) { - return detail::NibblesFromMask(d, m) == 0; -} - -// Full -template > -HWY_API bool AllTrue(D d, Mask128 m) { - return detail::NibblesFromMask(d, m) == ~0ull; -} -// Partial -template -HWY_API bool AllTrue(D d, MFromD m) { - return detail::NibblesFromMask(d, m) == (1ull << (d.MaxBytes() * 4)) - 1; -} - -// ------------------------------ Compress - -template -struct CompressIsPartition { - enum { value = (sizeof(T) != 1) }; -}; - -namespace detail { - -// Load 8 bytes, replicate into upper half so ZipLower can use the lower half. -template -HWY_INLINE Vec128 Load8Bytes(D /*tag*/, const uint8_t* bytes) { - return Vec128(vreinterpretq_u8_u64( - vld1q_dup_u64(reinterpret_cast(bytes)))); -} - -// Load 8 bytes and return half-reg with N <= 8 bytes. -template -HWY_INLINE VFromD Load8Bytes(D d, const uint8_t* bytes) { - return Load(d, bytes); -} - -template -HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, - uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Simd d; - const Repartition d8; - const Simd du; - - // NEON does not provide an equivalent of AVX2 permutevar, so we need byte - // indices for VTBL (one vector's worth for each of 256 combinations of - // 8 mask bits). Loading them directly would require 4 KiB. We can instead - // store lane indices and convert to byte indices (2*lane + 0..1), with the - // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane - // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. - // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles - // is likely more costly than the higher cache footprint from storing bytes. 
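// Illustrative sketch (assumed helper name, not from this header): the 8 table
// entries per mask value are byte offsets of 16-bit lanes (lane*2, doubling
// baked in). Expanding them to the 16 byte indices consumed by
// TableLookupBytes duplicates each entry and adds 1 to the second copy - the
// "pairs + 0x0100" step below.
#include <stdint.h>
static inline void LaneIdxToByteIdxRef(const uint8_t lane_byte[8],
                                       uint8_t byte_idx[16]) {
  for (int i = 0; i < 8; ++i) {
    byte_idx[2 * i + 0] = lane_byte[i];                            // low byte
    byte_idx[2 * i + 1] = static_cast<uint8_t>(lane_byte[i] + 1);  // high byte
  }
}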
- alignas(16) static constexpr uint8_t table[256 * 8] = { - // PrintCompress16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // - 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // - 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // - 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // - 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // - 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // - 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // - 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // - 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // - 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // - 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // - 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // - 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // - 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // - 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // - 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // - 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // - 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // - 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // - 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // - 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // - 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // - 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // - 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // - 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // - 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // - 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // - 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // - 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // - 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // - 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // - 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // - 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // - 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // - 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // - 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // - 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // - 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // - 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // - 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // - 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // - 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // - 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // - 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // - 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // - 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // - 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // - 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // - 4, 6, 10, 
12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // - 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // - 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // - 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // - 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // - 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // - 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // - 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // - 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // - 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // - 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // - 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // - 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // - 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // - 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // - 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // - 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // - 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // - 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // - 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // - 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // - 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // - 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // - 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // - 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // - 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // - 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // - 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // - 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // - 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // - 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // - 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // - 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // - 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // - 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // - 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // - 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // - 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // - 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // - 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // - 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // - 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // - 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // - 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // - 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // - 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // - 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // - 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // - 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // - 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // - 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // - 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // - 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // - 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // - 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 
12, 14, 4, 10, // - 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // - 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // - 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // - 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // - 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // - 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // - 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // - 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // - 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // - 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // - 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // - 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // - 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // - 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // - 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // - 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // - 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // - 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); - const Vec128 pairs = ZipLower(byte_idx, byte_idx); - return BitCast(d, pairs + Set(du, 0x0100)); -} - -template -HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<2> /*tag*/, - uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Simd d; - const Repartition d8; - const Simd du; - - // NEON does not provide an equivalent of AVX2 permutevar, so we need byte - // indices for VTBL (one vector's worth for each of 256 combinations of - // 8 mask bits). Loading them directly would require 4 KiB. We can instead - // store lane indices and convert to byte indices (2*lane + 0..1), with the - // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane - // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. - // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles - // is likely more costly than the higher cache footprint from storing bytes. 
- alignas(16) static constexpr uint8_t table[256 * 8] = { - // PrintCompressNot16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // - 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // - 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // - 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // - 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // - 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // - 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // - 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // - 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // - 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // - 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // - 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // - 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // - 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // - 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // - 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // - 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // - 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // - 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // - 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // - 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // - 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // - 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // - 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // - 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // - 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // - 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // - 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // - 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // - 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // - 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // - 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // - 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // - 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // - 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // - 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // - 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // - 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // - 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // - 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // - 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // - 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // - 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // - 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // - 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // - 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // - 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // - 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // - 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // - 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // - 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // - 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // - 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // - 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // - 0, 2, 
8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // - 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // - 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // - 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // - 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // - 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // - 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // - 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // - 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // - 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // - 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // - 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // - 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // - 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // - 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // - 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // - 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // - 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // - 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // - 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // - 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // - 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // - 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // - 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // - 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // - 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // - 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // - 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // - 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // - 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // - 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // - 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // - 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // - 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // - 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // - 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // - 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // - 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // - 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // - 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // - 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // - 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // - 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // - 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // - 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // - 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // - 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // - 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // - 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // - 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // - 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // - 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // - 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // - 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 
0, 2, 6, 8, 12, 14, // - 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // - 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // - 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // - 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // - 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // - 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // - 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // - 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // - 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // - 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // - 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // - 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // - 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); - const Vec128 pairs = ZipLower(byte_idx, byte_idx); - return BitCast(d, pairs + Set(du, 0x0100)); -} - -template -HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, - uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { - // PrintCompress32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // - 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // - 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // - 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // - 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<4> /*tag*/, - uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. 
- alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { - // PrintCompressNot32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, - 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15}; - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 - -template -HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, - uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[64] = { - // PrintCompress64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<8> /*tag*/, - uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { - // PrintCompressNot64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -#endif - -// Helper function called by both Compress and CompressStore - avoids a -// redundant BitsFromMask in the latter. -template -HWY_INLINE Vec128 Compress(Vec128 v, uint64_t mask_bits) { - const auto idx = - detail::IdxFromBits(hwy::SizeTag(), mask_bits); - using D = DFromV; - const RebindToSigned di; - return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); -} - -template -HWY_INLINE Vec128 CompressNot(Vec128 v, uint64_t mask_bits) { - const auto idx = - detail::IdxFromNotBits(hwy::SizeTag(), mask_bits); - using D = DFromV; - const RebindToSigned di; - return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); -} - -} // namespace detail - -// Single lane: no-op -template -HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
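// Illustrative sketch (assumed helper name, not from this header): a scalar
// model of what the table-driven detail::Compress above produces for 2/4-byte
// lanes. Per CompressIsPartition it is a partition: selected lanes first, in
// order, followed by the remaining lanes, also in order.
#include <stddef.h>
template <typename T, size_t N>
inline void CompressPartitionRef(const T (&in)[N], const bool (&mask)[N],
                                 T (&out)[N]) {
  size_t pos = 0;
  for (size_t i = 0; i < N; ++i) {
    if (mask[i]) out[pos++] = in[i];
  }
  for (size_t i = 0; i < N; ++i) {
    if (!mask[i]) out[pos++] = in[i];
  }
}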
- const DFromV d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskL, maskH); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -// General case, 2 or 4 byte lanes -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); -} - -// Single lane: no-op -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. - const DFromV d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskH, maskL); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -// General case, 2 or 4 byte lanes -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // For partial vectors, we cannot pull the Not() into the table because - // BitsFromMask clears the upper bits. - if (N < 16 / sizeof(T)) { - return detail::Compress(v, detail::BitsFromMask(Not(mask))); - } - return detail::CompressNot(v, detail::BitsFromMask(mask)); -} - -// ------------------------------ CompressBlocksNot -HWY_API Vec128 CompressBlocksNot(Vec128 v, - Mask128 /* m */) { - return v; -} - -// ------------------------------ CompressBits - -template -HWY_INLINE Vec128 CompressBits(Vec128 v, - const uint8_t* HWY_RESTRICT bits) { - uint64_t mask_bits = 0; - constexpr size_t kNumBytes = (N + 7) / 8; - CopyBytes(bits, &mask_bits); - if (N < 8) { - mask_bits &= (1ull << N) - 1; - } - - return detail::Compress(v, mask_bits); -} - -// ------------------------------ CompressStore -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(mask); - StoreU(detail::Compress(v, mask_bits), d, unaligned); - return PopCount(mask_bits); -} - -// ------------------------------ CompressBlendedStore -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = detail::BitsFromMask(m); - const size_t count = PopCount(mask_bits); - const MFromD store_mask = RebindMask(d, FirstN(du, count)); - const VFromD compressed = - detail::Compress(BitCast(du, v), mask_bits); - BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); - return count; -} - -// ------------------------------ CompressBitsStore - -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - uint64_t mask_bits = 0; - constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8; - CopyBytes(bits, &mask_bits); - if (d.MaxLanes() < 8) { - mask_bits &= (1ull << d.MaxLanes()) - 1; - } - - StoreU(detail::Compress(v, mask_bits), d, unaligned); - return PopCount(mask_bits); -} - -// ------------------------------ LoadInterleaved2 - -// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. 
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#else -#define HWY_NATIVE_LOAD_STORE_INTERLEAVED -#endif - -namespace detail { -#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT -#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from - -#if HWY_ARCH_ARM_A64 -#define HWY_IF_LOAD_INT(D) HWY_IF_V_SIZE_GT_D(D, 4) -#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES -#else -// Exclude 64x2 and f64x1, which are only supported on aarch64 -#define HWY_IF_LOAD_INT(D) \ - HWY_IF_V_SIZE_GT_D(D, 4), \ - hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD) < 8)>* = \ - nullptr -#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ - HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) -#endif // HWY_ARCH_ARM_A64 - -// Must return raw tuple because Tuple2 lack a ctor, and we cannot use -// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return -// void. -#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ - decltype(Tuple2().raw) -// Tuple tag arg allows overloading (cannot just overload on return type) -#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ - const type##_t *from, Tuple2 -HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT) -#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT -#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT - -#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ - decltype(Tuple3().raw) -#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ - const type##_t *from, Tuple3 -HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT) -#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT -#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT - -#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ - decltype(Tuple4().raw) -#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ - const type##_t *from, Tuple4 -HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT) -#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT -#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT - -#undef HWY_NEON_DEF_FUNCTION_LOAD_INT -#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT -#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT -} // namespace detail - -template > -HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1) { - auto raw = - detail::LoadInterleaved2(unaligned, detail::Tuple2()); - v0 = VFromD(raw.val[0]); - v1 = VFromD(raw.val[1]); -} - -// <= 32 bits: avoid loading more than N bytes by copying to buffer -template > -HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1) { - // The smallest vector registers are 64-bits and we want space for two. 
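  // Zero-initialize the buffer and copy only the valid 2*N elements so that
  // vld2 never reads past the caller's allocation; the unused tail lanes
  // stay zero.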
- alignas(16) T buf[2 * 8 / sizeof(T)] = {}; - CopyBytes(unaligned, buf); - auto raw = detail::LoadInterleaved2(buf, detail::Tuple2()); - v0 = VFromD(raw.val[0]); - v1 = VFromD(raw.val[1]); -} - -#if HWY_ARCH_ARM_V7 -// 64x2: split into two 64x1 -template , HWY_IF_T_SIZE(T, 8)> -HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128& v0, - Vec128& v1) { - const Half dh; - VFromD v00, v10, v01, v11; - LoadInterleaved2(dh, unaligned, v00, v10); - LoadInterleaved2(dh, unaligned + 2, v01, v11); - v0 = Combine(d, v01, v00); - v1 = Combine(d, v11, v10); -} -#endif // HWY_ARCH_ARM_V7 - -// ------------------------------ LoadInterleaved3 - -template > -HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - auto raw = - detail::LoadInterleaved3(unaligned, detail::Tuple3()); - v0 = VFromD(raw.val[0]); - v1 = VFromD(raw.val[1]); - v2 = VFromD(raw.val[2]); -} - -// <= 32 bits: avoid writing more than N bytes by copying to buffer -template > -HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - // The smallest vector registers are 64-bits and we want space for three. - alignas(16) T buf[3 * 8 / sizeof(T)] = {}; - CopyBytes(unaligned, buf); - auto raw = detail::LoadInterleaved3(buf, detail::Tuple3()); - v0 = VFromD(raw.val[0]); - v1 = VFromD(raw.val[1]); - v2 = VFromD(raw.val[2]); -} - -#if HWY_ARCH_ARM_V7 -// 64x2: split into two 64x1 -template , HWY_IF_T_SIZE(T, 8)> -HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, - Vec128& v0, Vec128& v1, Vec128& v2) { - const Half dh; - VFromD v00, v10, v20, v01, v11, v21; - LoadInterleaved3(dh, unaligned, v00, v10, v20); - LoadInterleaved3(dh, unaligned + 3, v01, v11, v21); - v0 = Combine(d, v01, v00); - v1 = Combine(d, v11, v10); - v2 = Combine(d, v21, v20); -} -#endif // HWY_ARCH_ARM_V7 - -// ------------------------------ LoadInterleaved4 - -template > -HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - auto raw = - detail::LoadInterleaved4(unaligned, detail::Tuple4()); - v0 = VFromD(raw.val[0]); - v1 = VFromD(raw.val[1]); - v2 = VFromD(raw.val[2]); - v3 = VFromD(raw.val[3]); -} - -// <= 32 bits: avoid writing more than N bytes by copying to buffer -template > -HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - alignas(16) T buf[4 * 8 / sizeof(T)] = {}; - CopyBytes(unaligned, buf); - auto raw = detail::LoadInterleaved4(buf, detail::Tuple4()); - v0 = VFromD(raw.val[0]); - v1 = VFromD(raw.val[1]); - v2 = VFromD(raw.val[2]); - v3 = VFromD(raw.val[3]); -} - -#if HWY_ARCH_ARM_V7 -// 64x2: split into two 64x1 -template , HWY_IF_T_SIZE(T, 8)> -HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, - Vec128& v0, Vec128& v1, Vec128& v2, - Vec128& v3) { - const Half dh; - VFromD v00, v10, v20, v30, v01, v11, v21, v31; - LoadInterleaved4(dh, unaligned, v00, v10, v20, v30); - LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31); - v0 = Combine(d, v01, v00); - v1 = Combine(d, v11, v10); - v2 = Combine(d, v21, v20); - v3 = Combine(d, v31, v30); -} -#endif // HWY_ARCH_ARM_V7 - -#undef HWY_IF_LOAD_INT - -// ------------------------------ StoreInterleaved2 - -namespace detail { -#define HWY_NEON_BUILD_TPL_HWY_STORE_INT -#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void -#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw - -#if HWY_ARCH_ARM_A64 -#define 
HWY_IF_STORE_INT(D) HWY_IF_V_SIZE_GT_D(D, 4) -#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES -#else -// Exclude 64x2 and f64x1, which are only supported on aarch64 -#define HWY_IF_STORE_INT(D) \ - HWY_IF_V_SIZE_GT_D(D, 4), \ - hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD) < 8)>* = \ - nullptr -#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \ - HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ - HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) -#endif // HWY_ARCH_ARM_A64 - -#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ - Tuple2 tup, type##_t *to -HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT) -#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT - -#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ - Tuple3 tup, type##_t *to -HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT) -#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT - -#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ - Tuple4 tup, type##_t *to -HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT) -#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT - -#undef HWY_NEON_DEF_FUNCTION_STORE_INT -#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT -#undef HWY_NEON_BUILD_RET_HWY_STORE_INT -#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT -} // namespace detail - -template > -HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, - T* HWY_RESTRICT unaligned) { - detail::Tuple2 tup = {{{v0.raw, v1.raw}}}; - detail::StoreInterleaved2(tup, unaligned); -} - -// <= 32 bits: avoid writing more than N bytes by copying to buffer -template > -HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, - T* HWY_RESTRICT unaligned) { - alignas(16) T buf[2 * 8 / sizeof(T)]; - detail::Tuple2 tup = {{{v0.raw, v1.raw}}}; - detail::StoreInterleaved2(tup, buf); - CopyBytes(buf, unaligned); -} - -#if HWY_ARCH_ARM_V7 -// 64x2: split into two 64x1 -template , HWY_IF_T_SIZE(T, 8)> -HWY_API void StoreInterleaved2(Vec128 v0, Vec128 v1, D d, - T* HWY_RESTRICT unaligned) { - const Half dh; - StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned); - StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2); -} -#endif // HWY_ARCH_ARM_V7 - -// ------------------------------ StoreInterleaved3 - -template > -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - T* HWY_RESTRICT unaligned) { - detail::Tuple3 tup = {{{v0.raw, v1.raw, v2.raw}}}; - detail::StoreInterleaved3(tup, unaligned); -} - -// <= 32 bits: avoid writing more than N bytes by copying to buffer -template > -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - T* HWY_RESTRICT unaligned) { - alignas(16) T buf[3 * 8 / sizeof(T)]; - detail::Tuple3 tup = {{{v0.raw, v1.raw, v2.raw}}}; - detail::StoreInterleaved3(tup, buf); - CopyBytes(buf, unaligned); -} - -#if HWY_ARCH_ARM_V7 -// 64x2: split into two 64x1 -template , HWY_IF_T_SIZE(T, 8)> -HWY_API void StoreInterleaved3(Vec128 v0, Vec128 v1, Vec128 v2, D d, - T* HWY_RESTRICT unaligned) { - const Half dh; - StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh, - unaligned); - StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh, - unaligned + 3); -} -#endif // HWY_ARCH_ARM_V7 - -// ------------------------------ 
StoreInterleaved4 - -template > -HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, - VFromD v3, D d, T* HWY_RESTRICT unaligned) { - detail::Tuple4 tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; - detail::StoreInterleaved4(tup, unaligned); -} - -// <= 32 bits: avoid writing more than N bytes by copying to buffer -template > -HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, - VFromD v3, D d, T* HWY_RESTRICT unaligned) { - alignas(16) T buf[4 * 8 / sizeof(T)]; - detail::Tuple4 tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; - detail::StoreInterleaved4(tup, buf); - CopyBytes(buf, unaligned); -} - -#if HWY_ARCH_ARM_V7 -// 64x2: split into two 64x1 -template , HWY_IF_T_SIZE(T, 8)> -HWY_API void StoreInterleaved4(Vec128 v0, Vec128 v1, Vec128 v2, - Vec128 v3, D d, T* HWY_RESTRICT unaligned) { - const Half dh; - StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), - LowerHalf(dh, v3), dh, unaligned); - StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), - UpperHalf(dh, v3), dh, unaligned + 4); -} -#endif // HWY_ARCH_ARM_V7 - -#undef HWY_IF_STORE_INT - -// ------------------------------ Additional mask logical operations -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const FixedTag d; - const auto vmask = VecFromMask(d, mask); - return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Simd d; - const auto vmask = VecFromMask(d, mask); - const auto neg_vmask = - ResizeBitCast(d, Neg(ResizeBitCast(Full64(), vmask))); - return MaskFromVec(Or(vmask, neg_vmask)); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Full128 d; - const Repartition di64; - - auto vmask = BitCast(di64, VecFromMask(d, mask)); - vmask = Or(vmask, Neg(vmask)); - - // Copy the sign bit of the first int64_t lane to the second int64_t lane - const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask)); - return MaskFromVec(BitCast(d, Or(vmask, vmask2))); -} - -template -HWY_API Mask128 SetBeforeFirst(Mask128 mask) { - return Not(SetAtOrAfterFirst(mask)); -} - -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const FixedTag d; - const RebindToSigned di; - - const auto vmask = BitCast(di, VecFromMask(d, mask)); - const auto zero = Zero(di); - const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); - return MaskFromVec(BitCast(d, And(vmask, vmask2))); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Simd d; - const RebindToSigned di; - - const auto vmask = ResizeBitCast(Full64(), VecFromMask(d, mask)); - const auto only_first_vmask = - BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); - return MaskFromVec(only_first_vmask); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Full128 d; - const RebindToSigned di; - const Repartition di64; - - const auto zero = Zero(di64); - const auto vmask = BitCast(di64, VecFromMask(d, mask)); - const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); - const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); - return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); -} - -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { - const FixedTag d; - const RebindToSigned di; - using TI = MakeSigned; - - return RebindMask(d, 
MaskFromVec(Set(di, TI(-1)))); -} -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { - const Simd d; - return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); -} - -// ------------------------------ Lt128 - -template -HWY_INLINE MFromD Lt128(D d, VFromD a, VFromD b) { - static_assert(IsSame, uint64_t>(), "T must be u64"); - // Truth table of Eq and Lt for Hi and Lo u64. - // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) - // =H =L cH cL | out = cH | (=H & cL) - // 0 0 0 0 | 0 - // 0 0 0 1 | 0 - // 0 0 1 0 | 1 - // 0 0 1 1 | 1 - // 0 1 0 0 | 0 - // 0 1 0 1 | 0 - // 0 1 1 0 | 1 - // 1 0 0 0 | 0 - // 1 0 0 1 | 1 - // 1 1 0 0 | 0 - const MFromD eqHL = Eq(a, b); - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - // We need to bring cL to the upper lane/bit corresponding to cH. Comparing - // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the - // comparison result leftwards requires only 4. IfThenElse compiles to the - // same code as OrAnd(). - const VFromD ltLx = DupEven(ltHL); - const VFromD outHx = IfThenElse(eqHL, ltLx, ltHL); - return MaskFromVec(DupOdd(outHx)); -} - -template -HWY_INLINE MFromD Lt128Upper(D d, VFromD a, VFromD b) { - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); -} - -// ------------------------------ Eq128 - -template -HWY_INLINE MFromD Eq128(D d, VFromD a, VFromD b) { - static_assert(IsSame, uint64_t>(), "T must be u64"); - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); -} - -template -HWY_INLINE MFromD Eq128Upper(D d, VFromD a, VFromD b) { - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); -} - -// ------------------------------ Ne128 - -template -HWY_INLINE MFromD Ne128(D d, VFromD a, VFromD b) { - static_assert(IsSame, uint64_t>(), "T must be u64"); - const VFromD neHL = VecFromMask(d, Ne(a, b)); - return MaskFromVec(Or(Reverse2(d, neHL), neHL)); -} - -template -HWY_INLINE MFromD Ne128Upper(D d, VFromD a, VFromD b) { - const VFromD neHL = VecFromMask(d, Ne(a, b)); - return MaskFromVec(InterleaveUpper(d, neHL, neHL)); -} - -// ------------------------------ Min128, Max128 (Lt128) - -// Without a native OddEven, it seems infeasible to go faster than Lt128. 
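// For reference, a scalar sketch (illustration only, not part of the API) of
// the comparison implemented above: each pair of u64 lanes {lo, hi} is treated
// as one 128-bit number, and "less than" is exactly the cH | (=H & cL) column
// of the truth table above.
static inline bool Lt128Scalar(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo,
                               uint64_t b_hi) {
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}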
-template -HWY_INLINE VFromD Min128(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128(d, a, b), a, b); -} - -template -HWY_INLINE VFromD Max128(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128(d, b, a), a, b); -} - -template -HWY_INLINE VFromD Min128Upper(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128Upper(d, a, b), a, b); -} - -template -HWY_INLINE VFromD Max128Upper(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128Upper(d, b, a), a, b); -} - -// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex - -#ifdef HWY_NATIVE_LEADING_ZERO_COUNT -#undef HWY_NATIVE_LEADING_ZERO_COUNT -#else -#define HWY_NATIVE_LEADING_ZERO_COUNT -#endif - -HWY_NEON_DEF_FUNCTION_INT_8_16_32(LeadingZeroCount, vclz, _, 1) -HWY_NEON_DEF_FUNCTION_UINT_8_16_32(LeadingZeroCount, vclz, _, 1) - -template )> -HWY_API V LeadingZeroCount(V v) { - const DFromV d; - const RebindToUnsigned du; - const Repartition du32; - - const auto v_k32 = BitCast(du32, Set(du, 32)); - const auto v_u32_lzcnt = LeadingZeroCount(BitCast(du32, v)) + v_k32; - const auto v_u32_lo_lzcnt = - And(v_u32_lzcnt, BitCast(du32, Set(du, 0xFFFFFFFFu))); - const auto v_u32_hi_lzcnt = - BitCast(du32, ShiftRight<32>(BitCast(du, v_u32_lzcnt))); - - return BitCast( - d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt)); -} - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - using T = TFromD; - return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); -} - -template -HWY_API V TrailingZeroCount(V v) { - return LeadingZeroCount(ReverseBits(v)); -} - -template -HWY_API V TrailingZeroCount(V v) { - const DFromV d; - const Repartition du8; - return LeadingZeroCount( - ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))))); -} - -namespace detail { // for code folding -#if HWY_ARCH_ARM_V7 -#undef vuzp1_s8 -#undef vuzp1_u8 -#undef vuzp1_s16 -#undef vuzp1_u16 -#undef vuzp1_s32 -#undef vuzp1_u32 -#undef vuzp1_f32 -#undef vuzp1q_s8 -#undef vuzp1q_u8 -#undef vuzp1q_s16 -#undef vuzp1q_u16 -#undef vuzp1q_s32 -#undef vuzp1q_u32 -#undef vuzp1q_f32 -#undef vuzp2_s8 -#undef vuzp2_u8 -#undef vuzp2_s16 -#undef vuzp2_u16 -#undef vuzp2_s32 -#undef vuzp2_u32 -#undef vuzp2_f32 -#undef vuzp2q_s8 -#undef vuzp2q_u8 -#undef vuzp2q_s16 -#undef vuzp2q_u16 -#undef vuzp2q_s32 -#undef vuzp2q_u32 -#undef vuzp2q_f32 -#undef vzip1_s8 -#undef vzip1_u8 -#undef vzip1_s16 -#undef vzip1_u16 -#undef vzip1_s32 -#undef vzip1_u32 -#undef vzip1_f32 -#undef vzip1q_s8 -#undef vzip1q_u8 -#undef vzip1q_s16 -#undef vzip1q_u16 -#undef vzip1q_s32 -#undef vzip1q_u32 -#undef vzip1q_f32 -#undef vzip2_s8 -#undef vzip2_u8 -#undef vzip2_s16 -#undef vzip2_u16 -#undef vzip2_s32 -#undef vzip2_u32 -#undef vzip2_f32 -#undef vzip2q_s8 -#undef vzip2q_u8 -#undef vzip2q_s16 -#undef vzip2q_u16 -#undef vzip2q_s32 -#undef vzip2q_u32 -#undef vzip2q_f32 -#endif - -#undef HWY_NEON_BUILD_ARG_1 -#undef HWY_NEON_BUILD_ARG_2 -#undef HWY_NEON_BUILD_ARG_3 -#undef HWY_NEON_BUILD_PARAM_1 -#undef HWY_NEON_BUILD_PARAM_2 -#undef HWY_NEON_BUILD_PARAM_3 -#undef HWY_NEON_BUILD_RET_1 -#undef HWY_NEON_BUILD_RET_2 -#undef HWY_NEON_BUILD_RET_3 -#undef HWY_NEON_BUILD_TPL_1 -#undef HWY_NEON_BUILD_TPL_2 -#undef HWY_NEON_BUILD_TPL_3 -#undef HWY_NEON_DEF_FUNCTION -#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS -#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES -#undef HWY_NEON_DEF_FUNCTION_BFLOAT_16 -#undef HWY_NEON_DEF_FUNCTION_FLOAT_16 -#undef HWY_NEON_DEF_FUNCTION_FLOAT_16_32 -#undef HWY_NEON_DEF_FUNCTION_FLOAT_32 -#undef HWY_NEON_DEF_FUNCTION_FLOAT_64 -#undef 
HWY_NEON_DEF_FUNCTION_FULL_UI -#undef HWY_NEON_DEF_FUNCTION_FULL_UI_64 -#undef HWY_NEON_DEF_FUNCTION_FULL_UIF_64 -#undef HWY_NEON_DEF_FUNCTION_INT_16 -#undef HWY_NEON_DEF_FUNCTION_INT_32 -#undef HWY_NEON_DEF_FUNCTION_INT_64 -#undef HWY_NEON_DEF_FUNCTION_INT_8 -#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32 -#undef HWY_NEON_DEF_FUNCTION_INTS -#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS -#undef HWY_NEON_DEF_FUNCTION_UI_8_16_32 -#undef HWY_NEON_DEF_FUNCTION_UIF_64 -#undef HWY_NEON_DEF_FUNCTION_UIF_8_16_32 -#undef HWY_NEON_DEF_FUNCTION_UINT_16 -#undef HWY_NEON_DEF_FUNCTION_UINT_32 -#undef HWY_NEON_DEF_FUNCTION_UINT_64 -#undef HWY_NEON_DEF_FUNCTION_UINT_8 -#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 -#undef HWY_NEON_DEF_FUNCTION_UINTS -#undef HWY_NEON_EVAL - -} // namespace detail - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/arm_sve-inl.h b/deps/highway/include/hwy/ops/arm_sve-inl.h deleted file mode 100644 index 944c5df7..00000000 --- a/deps/highway/include/hwy/ops/arm_sve-inl.h +++ /dev/null @@ -1,5050 +0,0 @@ -// Copyright 2021 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Arm SVE[2] vectors (length not known at compile time). -// External include guard in highway.h - see comment there. - -#include - -#include "hwy/ops/shared-inl.h" - -// Arm C215 declares that SVE vector lengths will always be a power of two. -// We default to relying on this, which makes some operations more efficient. -// You can still opt into fixups by setting this to 0 (unsupported). -#ifndef HWY_SVE_IS_POW2 -#define HWY_SVE_IS_POW2 1 -#endif - -#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128 -#define HWY_SVE_HAVE_2 1 -#else -#define HWY_SVE_HAVE_2 0 -#endif - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -template -struct DFromV_t {}; // specialized in macros -template -using DFromV = typename DFromV_t>::type; - -template -using TFromV = TFromD>; - -// ================================================== MACROS - -// Generate specializations and function definitions using X macros. Although -// harder to read and debug, writing everything manually is too bulky. 
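// Simplified illustration of the X-macro idiom used throughout this file
// (MY_FOREACH_U is hypothetical; HWY_SPECIALIZE is defined below): each
// FOREACH macro invokes a caller-supplied X_MACRO once per (type, width)
// combination, so a single definition expands into overloads/specializations
// for every lane type.
//
//   #define MY_FOREACH_U(X_MACRO, NAME, OP) \
//     X_MACRO(uint, u, 32, 16, NAME, OP)    \
//     X_MACRO(uint, u, 64, 32, NAME, OP)
//
//   MY_FOREACH_U(HWY_SPECIALIZE, _, _)  // specializes DFromV_t for
//                                       // svuint32_t and svuint64_t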
- -namespace detail { // for code folding - -// Args: BASE, CHAR, BITS, HALF, NAME, OP - -// Unsigned: -#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP) -#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP) -#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \ - X_MACRO(uint, u, 32, 16, NAME, OP) -#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \ - X_MACRO(uint, u, 64, 32, NAME, OP) - -// Signed: -#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP) -#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP) -#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP) -#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP) - -// Float: -#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \ - X_MACRO(float, f, 16, 16, NAME, OP) -#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \ - X_MACRO(float, f, 32, 16, NAME, OP) -#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \ - X_MACRO(float, f, 64, 32, NAME, OP) - -#if HWY_SVE_HAVE_BFLOAT16 -#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) \ - X_MACRO(bfloat, bf, 16, 16, NAME, OP) -#else -#define HWY_SVE_FOREACH_BF16(X_MACRO, NAME, OP) -#endif - -// For all element sizes: -#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) - -#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) - -// HWY_SVE_FOREACH_F does not include HWY_SVE_FOREACH_BF16 because SVE lacks -// bf16 overloads for some intrinsics (especially less-common arithmetic). 
-#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) - -// Commonly used type categories for a given element size: -#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) - -#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) - -#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) - -#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) - -#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) - -// Commonly used type categories: -#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) - -#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) - -#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \ - HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) - -// Assemble types for use in x-macros -#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t -#define HWY_SVE_D(BASE, BITS, N, POW2) Simd -#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t -#define HWY_SVE_TUPLE(BASE, BITS, MUL) sv##BASE##BITS##x##MUL##_t - -} // namespace detail - -#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \ - template <> \ - struct DFromV_t { \ - using type = ScalableTag; \ - }; - -HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _) -HWY_SVE_FOREACH_BF16(HWY_SPECIALIZE, _, _) -#undef HWY_SPECIALIZE - -// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX -// instructions, and we anyway only use it when the predicate is ptrue. - -// vector = f(vector), e.g. Not -#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ - } -#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_##CHAR##BITS(v); \ - } - -// vector = f(vector, scalar), e.g. detail::AddN -#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \ - } -#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS(a, b); \ - } - -// vector = f(vector, vector), e.g. 
Add -#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \ - } -#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS(a, b); \ - } - -#define HWY_SVE_RETV_ARGVVV(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b, \ - HWY_SVE_V(BASE, BITS) c) { \ - return sv##OP##_##CHAR##BITS(a, b, c); \ - } - -// ------------------------------ Lanes - -namespace detail { - -// Returns actual lanes of a hardware vector without rounding to a power of two. -template -HWY_INLINE size_t AllHardwareLanes() { - return svcntb_pat(SV_ALL); -} -template -HWY_INLINE size_t AllHardwareLanes() { - return svcnth_pat(SV_ALL); -} -template -HWY_INLINE size_t AllHardwareLanes() { - return svcntw_pat(SV_ALL); -} -template -HWY_INLINE size_t AllHardwareLanes() { - return svcntd_pat(SV_ALL); -} - -// All-true mask from a macro - -#if HWY_SVE_IS_POW2 -#define HWY_SVE_ALL_PTRUE(BITS) svptrue_b##BITS() -#define HWY_SVE_PTRUE(BITS) svptrue_b##BITS() -#else -#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL) -#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2) -#endif // HWY_SVE_IS_POW2 - -} // namespace detail - -#if HWY_HAVE_SCALABLE - -// Returns actual number of lanes after capping by N and shifting. May return 0 -// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8). -template -HWY_API size_t Lanes(Simd d) { - const size_t actual = detail::AllHardwareLanes(); - constexpr size_t kMaxLanes = MaxLanes(d); - constexpr int kClampedPow2 = HWY_MIN(kPow2, 0); - // Common case of full vectors: avoid any extra instructions. - if (detail::IsFull(d)) return actual; - return HWY_MIN(detail::ScaleByPower(actual, kClampedPow2), kMaxLanes); -} - -#endif // HWY_HAVE_SCALABLE - -// ================================================== MASK INIT - -// One mask bit per byte; only the one belonging to the lowest byte is valid. - -// ------------------------------ FirstN -#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) { \ - const size_t limit = detail::IsFull(d) ? 
count : HWY_MIN(Lanes(d), count); \ - return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast(limit)); \ - } -HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt) -HWY_SVE_FOREACH_BF16(HWY_SVE_FIRSTN, FirstN, whilelt) - -#undef HWY_SVE_FIRSTN - -template -using MFromD = decltype(FirstN(D(), 0)); - -#if !HWY_HAVE_FLOAT16 -template -MFromD> FirstN(D /* tag */, size_t count) { - return FirstN(RebindToUnsigned(), count); -} -#endif // !HWY_HAVE_FLOAT16 - -#if !HWY_SVE_HAVE_BFLOAT16 -template -MFromD> FirstN(D /* tag */, size_t count) { - return FirstN(RebindToUnsigned(), count); -} -#endif // !HWY_SVE_HAVE_BFLOAT16 - -namespace detail { - -#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \ - return HWY_SVE_PTRUE(BITS); \ - } \ - template \ - HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \ - return HWY_SVE_ALL_PTRUE(BITS); \ - } - -HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) // return all-true -HWY_SVE_FOREACH_BF16(HWY_SVE_WRAP_PTRUE, PTrue, ptrue) -#undef HWY_SVE_WRAP_PTRUE - -HWY_API svbool_t PFalse() { return svpfalse_b(); } - -// Returns all-true if d is HWY_FULL or FirstN(N) after capping N. -// -// This is used in functions that load/store memory; other functions (e.g. -// arithmetic) can ignore d and use PTrue instead. -template -svbool_t MakeMask(D d) { - return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d)); -} - -} // namespace detail - -// ================================================== INIT - -// ------------------------------ Set -// vector = f(d, scalar), e.g. Set -#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ - HWY_SVE_T(BASE, BITS) arg) { \ - return sv##OP##_##CHAR##BITS(arg); \ - } - -HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n) -HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, dup_n) -#if !HWY_SVE_HAVE_BFLOAT16 -// Required for Zero and VFromD -template -svuint16_t Set(Simd d, bfloat16_t arg) { - return Set(RebindToUnsigned(), arg.bits); -} -#endif // HWY_SVE_HAVE_BFLOAT16 -#undef HWY_SVE_SET - -template -using VFromD = decltype(Set(D(), TFromD())); - -using VBF16 = VFromD>; - -// ------------------------------ Zero - -template -VFromD Zero(D d) { - // Cast to support bfloat16_t. 
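  // Building the zero in the unsigned domain and bitcasting back works for
  // every lane type, including bfloat16_t when no native Set overload exists
  // (see the u16 fallback above).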
- const RebindToUnsigned du; - return BitCast(d, Set(du, 0)); -} - -// ------------------------------ Undefined - -#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \ - return sv##OP##_##CHAR##BITS(); \ - } - -HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef) - -// ------------------------------ BitCast - -namespace detail { - -// u8: no change -#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \ - return v; \ - } \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte( \ - HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \ - return v; \ - } - -// All other types -#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_u8_##CHAR##BITS(v); \ - } \ - template \ - HWY_INLINE HWY_SVE_V(BASE, BITS) \ - BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \ - return sv##OP##_##CHAR##BITS##_u8(v); \ - } - -HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _) -HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret) -HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret) -HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret) -HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret) -HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret) -HWY_SVE_FOREACH_BF16(HWY_SVE_CAST, _, reinterpret) - -#undef HWY_SVE_CAST_NOP -#undef HWY_SVE_CAST - -#if !HWY_SVE_HAVE_BFLOAT16 -template -HWY_INLINE VBF16 BitCastFromByte(Simd /* d */, - svuint8_t v) { - return BitCastFromByte(Simd(), v); -} -#endif // !HWY_SVE_HAVE_BFLOAT16 - -} // namespace detail - -template -HWY_API VFromD BitCast(D d, FromV v) { - return detail::BitCastFromByte(d, detail::BitCastToByte(v)); -} - -// ------------------------------ Tuple - -// tuples = f(d, v..), e.g. 
Create2 -#define HWY_SVE_CREATE(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_TUPLE(BASE, BITS, 2) \ - NAME##2(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ - HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1) { \ - return sv##OP##2_##CHAR##BITS(v0, v1); \ - } \ - template \ - HWY_API HWY_SVE_TUPLE(BASE, BITS, 3) NAME##3( \ - HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v0, \ - HWY_SVE_V(BASE, BITS) v1, HWY_SVE_V(BASE, BITS) v2) { \ - return sv##OP##3_##CHAR##BITS(v0, v1, v2); \ - } \ - template \ - HWY_API HWY_SVE_TUPLE(BASE, BITS, 4) \ - NAME##4(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ - HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \ - HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3) { \ - return sv##OP##4_##CHAR##BITS(v0, v1, v2, v3); \ - } - -HWY_SVE_FOREACH(HWY_SVE_CREATE, Create, create) -HWY_SVE_FOREACH_BF16(HWY_SVE_CREATE, Create, create) -#undef HWY_SVE_CREATE - -template -using Vec2 = decltype(Create2(D(), Zero(D()), Zero(D()))); -template -using Vec3 = decltype(Create3(D(), Zero(D()), Zero(D()), Zero(D()))); -template -using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D()))); - -#define HWY_SVE_GET(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) NAME##2(HWY_SVE_TUPLE(BASE, BITS, 2) tuple) { \ - return sv##OP##2_##CHAR##BITS(tuple, kIndex); \ - } \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) NAME##3(HWY_SVE_TUPLE(BASE, BITS, 3) tuple) { \ - return sv##OP##3_##CHAR##BITS(tuple, kIndex); \ - } \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) NAME##4(HWY_SVE_TUPLE(BASE, BITS, 4) tuple) { \ - return sv##OP##4_##CHAR##BITS(tuple, kIndex); \ - } - -HWY_SVE_FOREACH(HWY_SVE_GET, Get, get) -HWY_SVE_FOREACH_BF16(HWY_SVE_GET, Get, get) -#undef HWY_SVE_GET - -#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_TUPLE(BASE, BITS, 2) \ - NAME##2(HWY_SVE_TUPLE(BASE, BITS, 2) tuple, HWY_SVE_V(BASE, BITS) vec) { \ - return sv##OP##2_##CHAR##BITS(tuple, kIndex, vec); \ - } \ - template \ - HWY_API HWY_SVE_TUPLE(BASE, BITS, 3) \ - NAME##3(HWY_SVE_TUPLE(BASE, BITS, 3) tuple, HWY_SVE_V(BASE, BITS) vec) { \ - return sv##OP##3_##CHAR##BITS(tuple, kIndex, vec); \ - } \ - template \ - HWY_API HWY_SVE_TUPLE(BASE, BITS, 4) \ - NAME##4(HWY_SVE_TUPLE(BASE, BITS, 4) tuple, HWY_SVE_V(BASE, BITS) vec) { \ - return sv##OP##4_##CHAR##BITS(tuple, kIndex, vec); \ - } - -HWY_SVE_FOREACH(HWY_SVE_SET, Set, set) -HWY_SVE_FOREACH_BF16(HWY_SVE_SET, Set, set) -#undef HWY_SVE_SET - -// ------------------------------ ResizeBitCast - -// Same as BitCast on SVE -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - return BitCast(d, v); -} - -// ================================================== LOGICAL - -// detail::*N() functions accept a scalar argument to avoid extra Set(). 
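// For example (sketch; AddN is the internal helper generated below): both of
// the following compute v + 1, but the *N form maps to the scalar-operand
// intrinsic and skips materializing a vector of ones.
//
//   const ScalableTag<uint32_t> d;
//   const svuint32_t a = Add(v, Set(d, 1u));   // Set + vector add
//   const svuint32_t b = detail::AddN(v, 1u);  // single svadd_n_u32_x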
- -// ------------------------------ Not -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not ) // NOLINT - -// ------------------------------ And - -namespace detail { -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n) -} // namespace detail - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and) - -template -HWY_API V And(const V a, const V b) { - const DFromV df; - const RebindToUnsigned du; - return BitCast(df, And(BitCast(du, a), BitCast(du, b))); -} - -// ------------------------------ Or - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr) - -template -HWY_API V Or(const V a, const V b) { - const DFromV df; - const RebindToUnsigned du; - return BitCast(df, Or(BitCast(du, a), BitCast(du, b))); -} - -// ------------------------------ Xor - -namespace detail { -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n) -} // namespace detail - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor) - -template -HWY_API V Xor(const V a, const V b) { - const DFromV df; - const RebindToUnsigned du; - return BitCast(df, Xor(BitCast(du, a), BitCast(du, b))); -} - -// ------------------------------ AndNot - -namespace detail { -#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n) -#undef HWY_SVE_RETV_ARGPVN_SWAP -} // namespace detail - -#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a); \ - } -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic) -#undef HWY_SVE_RETV_ARGPVV_SWAP - -template -HWY_API V AndNot(const V a, const V b) { - const DFromV df; - const RebindToUnsigned du; - return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b))); -} - -// ------------------------------ Xor3 - -#if HWY_SVE_HAVE_2 - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVVV, Xor3, eor3) - -template -HWY_API V Xor3(const V x1, const V x2, const V x3) { - const DFromV df; - const RebindToUnsigned du; - return BitCast(df, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3))); -} - -#else -template -HWY_API V Xor3(V x1, V x2, V x3) { - return Xor(x1, Xor(x2, x3)); -} -#endif - -// ------------------------------ Or3 -template -HWY_API V Or3(V o1, V o2, V o3) { - return Or(o1, Or(o2, o3)); -} - -// ------------------------------ OrAnd -template -HWY_API V OrAnd(const V o, const V a1, const V a2) { - return Or(o, And(a1, a2)); -} - -// ------------------------------ PopulationCount - -#ifdef HWY_NATIVE_POPCNT -#undef HWY_NATIVE_POPCNT -#else -#define HWY_NATIVE_POPCNT -#endif - -// Need to return original type instead of unsigned. 
-#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - return BitCast(DFromV(), \ - sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \ - } -HWY_SVE_FOREACH_UI(HWY_SVE_POPCNT, PopulationCount, cnt) -#undef HWY_SVE_POPCNT - -// ================================================== SIGN - -// ------------------------------ Neg -HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg) - -HWY_API VBF16 Neg(VBF16 v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask()))); -} - -// ------------------------------ Abs -HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs) - -// ================================================== ARITHMETIC - -// Per-target flags to prevent generic_ops-inl.h defining Add etc. -#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS -#undef HWY_NATIVE_OPERATOR_REPLACEMENTS -#else -#define HWY_NATIVE_OPERATOR_REPLACEMENTS -#endif - -// ------------------------------ Add - -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n) -} // namespace detail - -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add) - -// ------------------------------ Sub - -namespace detail { -// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg. -#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS##_z(pg, a, b); \ - } - -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n) -#undef HWY_SVE_RETV_ARGPVN_MASK -} // namespace detail - -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub) - -// ------------------------------ SumsOf8 -HWY_API svuint64_t SumsOf8(const svuint8_t v) { - const ScalableTag du32; - const ScalableTag du64; - const svbool_t pg = detail::PTrue(du64); - - const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1); - // Compute pairwise sum of u32 and extend to u64. - // TODO(janwas): on SVE2, we can instead use svaddp. 
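  // After svdot with weight 1, each u32 lane holds the sum of 4 consecutive
  // bytes. Viewed as u64, the shifted-down upper half (hi) and the
  // zero-extended lower half (lo) are the two 4-byte sums whose total is the
  // sum of 8 consecutive bytes.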
- const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32); - // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended) - const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4)); - return Add(hi, lo); -} - -// ------------------------------ SaturatedAdd - -#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB -#undef HWY_NATIVE_I32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I32_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB -#undef HWY_NATIVE_U32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U32_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB -#undef HWY_NATIVE_I64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I64_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB -#undef HWY_NATIVE_U64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U64_SATURATED_ADDSUB -#endif - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd) - -// ------------------------------ SaturatedSub - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub) - -// ------------------------------ AbsDiff -#ifdef HWY_NATIVE_INTEGER_ABS_DIFF -#undef HWY_NATIVE_INTEGER_ABS_DIFF -#else -#define HWY_NATIVE_INTEGER_ABS_DIFF -#endif - -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, AbsDiff, abd) - -// ------------------------------ ShiftLeft[Same] - -#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits); \ - } \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n) - -// ------------------------------ ShiftRight[Same] - -HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n) -HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n) - -#undef HWY_SVE_SHIFT_N - -// ------------------------------ RotateRight - -// TODO(janwas): svxar on SVE2 -template -HWY_API V RotateRight(const V v) { - constexpr size_t kSizeInBits = sizeof(TFromV) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -// ------------------------------ Shl/r - -#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \ - const RebindToUnsigned> du; \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, \ - BitCast(du, bits)); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl) - -HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr) -HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr) - -#undef HWY_SVE_SHIFT - -// ------------------------------ Min/Max - -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min) -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max) -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm) -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm) - -namespace detail { -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n) -HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n) -} // namespace detail - -// ------------------------------ Mul - -// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. 
-#ifdef HWY_NATIVE_MUL_8 -#undef HWY_NATIVE_MUL_8 -#else -#define HWY_NATIVE_MUL_8 -#endif -#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Mul, mul) - -// ------------------------------ MulHigh -HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh) -// Not part of API, used internally: -HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGPVV, MulHigh, mulh) -HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh) -HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh) - -// ------------------------------ MulFixedPoint15 -HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) { -#if HWY_SVE_HAVE_2 - return svqrdmulh_s16(a, b); -#else - const DFromV d; - const RebindToUnsigned du; - - const svuint16_t lo = BitCast(du, Mul(a, b)); - const svint16_t hi = MulHigh(a, b); - // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must - // carry that into the result. Instead isolate the top two bits because only - // they can influence the result. - const svuint16_t lo_top2 = ShiftRight<14>(lo); - // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0. - const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1)); - return Add(Add(hi, hi), BitCast(d, rounding)); -#endif -} - -// ------------------------------ Div -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div) - -// ------------------------------ ApproximateReciprocal -#ifdef HWY_NATIVE_F64_APPROX_RECIP -#undef HWY_NATIVE_F64_APPROX_RECIP -#else -#define HWY_NATIVE_F64_APPROX_RECIP -#endif - -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe) - -// ------------------------------ Sqrt -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt) - -// ------------------------------ ApproximateReciprocalSqrt -#ifdef HWY_NATIVE_F64_APPROX_RSQRT -#undef HWY_NATIVE_F64_APPROX_RSQRT -#else -#define HWY_NATIVE_F64_APPROX_RSQRT -#endif - -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte) - -// ------------------------------ MulAdd - -// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd. -#ifdef HWY_NATIVE_INT_FMA -#undef HWY_NATIVE_INT_FMA -#else -#define HWY_NATIVE_INT_FMA -#endif - -#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x, \ - HWY_SVE_V(BASE, BITS) add) { \ - return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \ - } - -HWY_SVE_FOREACH(HWY_SVE_FMA, MulAdd, mad) - -// ------------------------------ NegMulAdd -HWY_SVE_FOREACH(HWY_SVE_FMA, NegMulAdd, msb) - -// ------------------------------ MulSub -HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb) - -// ------------------------------ NegMulSub -HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad) - -#undef HWY_SVE_FMA - -// ------------------------------ Round etc. - -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn) -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm) -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp) -HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz) - -// ================================================== MASK - -// ------------------------------ RebindMask -template -HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) { - return mask; -} - -// ------------------------------ Mask logical - -HWY_API svbool_t Not(svbool_t m) { - // We don't know the lane type, so assume 8-bit. For larger types, this will - // de-canonicalize the predicate, i.e. 
set bits to 1 even though they do not - // correspond to the lowest byte in the lane. Arm says such bits are ignored. - return svnot_b_z(HWY_SVE_PTRUE(8), m); -} -HWY_API svbool_t And(svbool_t a, svbool_t b) { - return svand_b_z(b, b, a); // same order as AndNot for consistency -} -HWY_API svbool_t AndNot(svbool_t a, svbool_t b) { - return svbic_b_z(b, b, a); // reversed order like NEON -} -HWY_API svbool_t Or(svbool_t a, svbool_t b) { - return svsel_b(a, a, b); // a ? true : b -} -HWY_API svbool_t Xor(svbool_t a, svbool_t b) { - return svsel_b(a, svnand_b_z(a, a, b), b); // a ? !(a & b) : b. -} - -HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) { - return svnor_b_z(HWY_SVE_PTRUE(8), a, b); // !a && !b, undefined if a && b. -} - -// ------------------------------ CountTrue - -#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \ - return sv##OP##_b##BITS(detail::MakeMask(d), m); \ - } - -HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp) -#undef HWY_SVE_COUNT_TRUE - -// For 16-bit Compress: full vector, not limited to SV_POW2. -namespace detail { - -#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \ - return sv##OP##_b##BITS(svptrue_b##BITS(), m); \ - } - -HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp) -#undef HWY_SVE_COUNT_TRUE_FULL - -} // namespace detail - -// ------------------------------ AllFalse -template -HWY_API bool AllFalse(D d, svbool_t m) { - return !svptest_any(detail::MakeMask(d), m); -} - -// ------------------------------ AllTrue -template -HWY_API bool AllTrue(D d, svbool_t m) { - return CountTrue(d, m) == Lanes(d); -} - -// ------------------------------ FindFirstTrue -template -HWY_API intptr_t FindFirstTrue(D d, svbool_t m) { - return AllFalse(d, m) ? intptr_t{-1} - : static_cast( - CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m))); -} - -// ------------------------------ FindKnownFirstTrue -template -HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) { - return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)); -} - -// ------------------------------ IfThenElse -#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \ - return sv##OP##_##CHAR##BITS(m, yes, no); \ - } - -HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel) -#undef HWY_SVE_IF_THEN_ELSE - -// ------------------------------ IfThenElseZero -template -HWY_API V IfThenElseZero(const svbool_t mask, const V yes) { - return IfThenElse(mask, yes, Zero(DFromV())); -} - -// ------------------------------ IfThenZeroElse -template -HWY_API V IfThenZeroElse(const svbool_t mask, const V no) { - return IfThenElse(mask, Zero(DFromV()), no); -} - -// ------------------------------ Additional mask logical operations -HWY_API svbool_t SetBeforeFirst(svbool_t m) { - // We don't know the lane type, so assume 8-bit. For larger types, this will - // de-canonicalize the predicate, i.e. set bits to 1 even though they do not - // correspond to the lowest byte in the lane. Arm says such bits are ignored. - return svbrkb_b_z(HWY_SVE_PTRUE(8), m); -} - -HWY_API svbool_t SetAtOrBeforeFirst(svbool_t m) { - // We don't know the lane type, so assume 8-bit. For larger types, this will - // de-canonicalize the predicate, i.e. 
set bits to 1 even though they do not - // correspond to the lowest byte in the lane. Arm says such bits are ignored. - return svbrka_b_z(HWY_SVE_PTRUE(8), m); -} - -HWY_API svbool_t SetOnlyFirst(svbool_t m) { return svbrka_b_z(m, m); } - -HWY_API svbool_t SetAtOrAfterFirst(svbool_t m) { - return Not(SetBeforeFirst(m)); -} - -// ================================================== COMPARE - -// mask = f(vector, vector) -#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \ - } -#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \ - return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b); \ - } - -// ------------------------------ Eq -HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq) -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n) -} // namespace detail - -// ------------------------------ Ne -HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne) -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n) -} // namespace detail - -// ------------------------------ Lt -HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt) -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n) -} // namespace detail - -// ------------------------------ Le -HWY_SVE_FOREACH(HWY_SVE_COMPARE, Le, cmple) -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LeN, cmple_n) -} // namespace detail - -// ------------------------------ Gt/Ge (swapped order) -template -HWY_API svbool_t Gt(const V a, const V b) { - return Lt(b, a); -} -template -HWY_API svbool_t Ge(const V a, const V b) { - return Le(b, a); -} -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, GeN, cmpge_n) -HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, GtN, cmpgt_n) -} // namespace detail - -#undef HWY_SVE_COMPARE -#undef HWY_SVE_COMPARE_N - -// ------------------------------ TestBit -template -HWY_API svbool_t TestBit(const V a, const V bit) { - return detail::NeN(And(a, bit), 0); -} - -// ------------------------------ MaskFromVec (Ne) -template -HWY_API svbool_t MaskFromVec(const V v) { - return detail::NeN(v, static_cast>(0)); -} - -// ------------------------------ VecFromMask -template -HWY_API VFromD VecFromMask(const D d, svbool_t mask) { - const RebindToSigned di; - // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which - // requires an extra instruction plus M0 pipeline. 
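  // Set(di, -1) broadcasts an all-ones immediate; IfThenElseZero then clears
  // inactive lanes, yielding the usual all-ones / all-zeros representation.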
- return BitCast(d, IfThenElseZero(mask, Set(di, -1))); -} - -// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse) - -#if HWY_SVE_HAVE_2 - -#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \ - HWY_SVE_V(BASE, BITS) no) { \ - return sv##OP##_##CHAR##BITS(yes, no, mask); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl) -#undef HWY_SVE_IF_VEC - -template -HWY_API V IfVecThenElse(const V mask, const V yes, const V no) { - const DFromV d; - const RebindToUnsigned du; - return BitCast( - d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no))); -} - -#else - -template -HWY_API V IfVecThenElse(const V mask, const V yes, const V no) { - return Or(And(mask, yes), AndNot(mask, no)); -} - -#endif // HWY_SVE_HAVE_2 - -// ------------------------------ BitwiseIfThenElse - -#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#else -#define HWY_NATIVE_BITWISE_IF_THEN_ELSE -#endif - -template -HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { - return IfVecThenElse(mask, yes, no); -} - -// ------------------------------ CopySign (BitwiseIfThenElse) -template -HWY_API V CopySign(const V magn, const V sign) { - const DFromV d; - return BitwiseIfThenElse(SignBit(d), sign, magn); -} - -// ------------------------------ CopySignToAbs -template -HWY_API V CopySignToAbs(const V abs, const V sign) { -#if HWY_SVE_HAVE_2 // CopySign is more efficient than OrAnd - return CopySign(abs, sign); -#else - const DFromV d; - return OrAnd(abs, SignBit(d), sign); -#endif -} - -// ------------------------------ Floating-point classification (Ne) - -template -HWY_API svbool_t IsNaN(const V v) { - return Ne(v, v); // could also use cmpuo -} - -template -HWY_API svbool_t IsInf(const V v) { - using T = TFromV; - const DFromV d; - const RebindToSigned di; - const VFromD vi = BitCast(di, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2())); -} - -// Returns whether normal/subnormal/zero. -template -HWY_API svbool_t IsFinite(const V v) { - using T = TFromV; - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; // cheaper than unsigned comparison - const VFromD vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, then right so we can compare with the - // max exponent (cannot compare with MaxExponentTimes2 directly because it is - // negative and non-negative floats would be greater). 
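  // Add(vu, vu) is a logical shift left by one, which discards the sign bit;
  // shifting right by MantissaBits + 1 then leaves just the exponent field,
  // which is below MaxExponentField for every finite value (the all-ones
  // exponent encodes inf/NaN).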
- const VFromD exp = - BitCast(di, ShiftRight() + 1>(Add(vu, vu))); - return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField())); -} - -// ================================================== MEMORY - -// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream - -#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ - return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p); \ - } - -#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ - return sv##OP##_##CHAR##BITS(m, p); \ - } - -#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ - /* All-true predicate to load all 128 bits. */ \ - return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p); \ - } - -#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \ - HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ - sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v); \ - } - -#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \ - HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ - HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \ - sv##OP##_##CHAR##BITS(m, p, v); \ - } - -HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1) -HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1) -HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1) -HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1) -HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1) - -HWY_SVE_FOREACH_BF16(HWY_SVE_LOAD, Load, ld1) -HWY_SVE_FOREACH_BF16(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1) -HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Store, st1) -HWY_SVE_FOREACH_BF16(HWY_SVE_STORE, Stream, stnt1) -HWY_SVE_FOREACH_BF16(HWY_SVE_BLENDED_STORE, BlendedStore, st1) - -#if HWY_TARGET != HWY_SVE2_128 -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDupFull128, ld1rq) -} // namespace detail -#endif // HWY_TARGET != HWY_SVE2_128 - -#undef HWY_SVE_LOAD -#undef HWY_SVE_MASKED_LOAD -#undef HWY_SVE_LOAD_DUP128 -#undef HWY_SVE_STORE -#undef HWY_SVE_BLENDED_STORE - -#if !HWY_SVE_HAVE_BFLOAT16 - -template -HWY_API VBF16 Load(Simd d, - const bfloat16_t* HWY_RESTRICT p) { - return Load(RebindToUnsigned(), - reinterpret_cast(p)); -} - -#endif // !HWY_SVE_HAVE_BFLOAT16 - -#if HWY_TARGET == HWY_SVE2_128 -// On the HWY_SVE2_128 target, LoadDup128 is the same as Load since vectors -// cannot exceed 16 bytes on the HWY_SVE2_128 target. -template -HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); -} -#else -// If D().MaxBytes() <= 16 is true, simply do a Load operation. 
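A possible caller-side use of the Load/Store ops and their masked variants defined above: copying an arbitrary-length array without a scalar tail, by building a FirstN predicate for the remainder. The include path, the hn alias and the Copy name are assumptions for illustration:

#include <cstddef>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void Copy(const float* HWY_RESTRICT from, float* HWY_RESTRICT to, size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t i = 0;
  for (; i + N <= n; i += N) {
    hn::StoreU(hn::LoadU(d, from + i), d, to + i);
  }
  if (i < n) {
    const auto m = hn::FirstN(d, n - i);            // tail predicate
    const auto v = hn::MaskedLoad(m, d, from + i);  // inactive lanes are zero
    hn::BlendedStore(v, m, d, to + i);              // writes only active lanes
  }
}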
-template -HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); -} - -// If D().MaxBytes() > 16 is true, need to load the vector using ld1rq -template , bfloat16_t>()>* = nullptr> -HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { - return detail::LoadDupFull128(d, p); -} - -#if !HWY_SVE_HAVE_BFLOAT16 - -template -HWY_API VBF16 LoadDup128(D d, const bfloat16_t* HWY_RESTRICT p) { - return detail::LoadDupFull128( - RebindToUnsigned(), - reinterpret_cast(p)); -} -#endif // !HWY_SVE_HAVE_BFLOAT16 - -#endif // HWY_TARGET != HWY_SVE2_128 - -#if !HWY_SVE_HAVE_BFLOAT16 - -template -HWY_API void Store(VBF16 v, Simd d, - bfloat16_t* HWY_RESTRICT p) { - Store(v, RebindToUnsigned(), - reinterpret_cast(p)); -} - -#endif - -// ------------------------------ Load/StoreU - -// SVE only requires lane alignment, not natural alignment of the entire -// vector. -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); -} - -template -HWY_API void StoreU(const V v, D d, TFromD* HWY_RESTRICT p) { - Store(v, d, p); -} - -// ------------------------------ MaskedLoadOr - -// SVE MaskedLoad hard-codes zero, so this requires an extra blend. -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - return IfThenElse(m, MaskedLoad(m, d, p), v); -} - -// ------------------------------ ScatterOffset/Index - -#ifdef HWY_NATIVE_SCATTER -#undef HWY_NATIVE_SCATTER -#else -#define HWY_NATIVE_SCATTER -#endif - -#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, \ - HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \ - HWY_SVE_V(int, BITS) offset) { \ - sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \ - v); \ - } - -#define HWY_SVE_MASKED_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m, \ - HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \ - HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \ - HWY_SVE_V(int, BITS) index) { \ - sv##OP##_s##BITS##index_##CHAR##BITS(m, base, index, v); \ - } - -HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter) -HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_SCATTER_INDEX, MaskedScatterIndex, - st1_scatter) -#undef HWY_SVE_SCATTER_OFFSET -#undef HWY_SVE_MASKED_SCATTER_INDEX - -template -HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT p, - VFromD> indices) { - MaskedScatterIndex(v, detail::MakeMask(d), d, p, indices); -} - -// ------------------------------ GatherOffset/Index - -#ifdef HWY_NATIVE_GATHER -#undef HWY_NATIVE_GATHER -#else -#define HWY_NATIVE_GATHER -#endif - -#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \ - HWY_SVE_V(int, BITS) offset) { \ - return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \ - offset); \ - } -#define HWY_SVE_MASKED_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, \ - HWY_SVE_V(int, BITS) index) { \ - return sv##OP##_s##BITS##index_##CHAR##BITS(m, base, index); \ - } - -HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather) -HWY_SVE_FOREACH_UIF3264(HWY_SVE_MASKED_GATHER_INDEX, 
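ScatterIndex above stores lane i of v to base[index[i]], with indices measured in elements (ScatterOffset takes byte offsets instead). A hedged sketch of a permuting store; PermutingStore is an illustrative name and the public-API usage is assumed:

#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Writes each lane of one int32 vector to out[idx[lane]].
void PermutingStore(const int32_t* vals, const int32_t* idx, int32_t* out) {
  const hn::ScalableTag<int32_t> d;  // index vector shares the lane type here
  const auto v = hn::LoadU(d, vals);
  const auto indices = hn::LoadU(d, idx);
  hn::ScatterIndex(v, d, out, indices);
}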
MaskedGatherIndex, - ld1_gather) -#undef HWY_SVE_GATHER_OFFSET -#undef HWY_SVE_MASKED_GATHER_INDEX - -template -HWY_API VFromD GatherIndex(D d, const TFromD* HWY_RESTRICT p, - VFromD> indices) { - return MaskedGatherIndex(detail::MakeMask(d), d, p, indices); -} - -// ------------------------------ LoadInterleaved2 - -// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. -#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#else -#define HWY_NATIVE_LOAD_STORE_INTERLEAVED -#endif - -#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \ - HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \ - const HWY_SVE_TUPLE(BASE, BITS, 2) tuple = \ - sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \ - v0 = svget2(tuple, 0); \ - v1 = svget2(tuple, 1); \ - } -HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2) - -#undef HWY_SVE_LOAD2 - -// ------------------------------ LoadInterleaved3 - -#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \ - HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \ - HWY_SVE_V(BASE, BITS) & v2) { \ - const HWY_SVE_TUPLE(BASE, BITS, 3) tuple = \ - sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \ - v0 = svget3(tuple, 0); \ - v1 = svget3(tuple, 1); \ - v2 = svget3(tuple, 2); \ - } -HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3) - -#undef HWY_SVE_LOAD3 - -// ------------------------------ LoadInterleaved4 - -#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned, \ - HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \ - HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \ - const HWY_SVE_TUPLE(BASE, BITS, 4) tuple = \ - sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned); \ - v0 = svget4(tuple, 0); \ - v1 = svget4(tuple, 1); \ - v2 = svget4(tuple, 2); \ - v3 = svget4(tuple, 3); \ - } -HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4) - -#undef HWY_SVE_LOAD4 - -// ------------------------------ StoreInterleaved2 - -#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \ - HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \ - sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, Create2(d, v0, v1)); \ - } -HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2) - -#undef HWY_SVE_STORE2 - -// ------------------------------ StoreInterleaved3 - -#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \ - HWY_SVE_V(BASE, BITS) v2, \ - HWY_SVE_D(BASE, BITS, N, kPow2) d, \ - HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \ - sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \ - Create3(d, v0, v1, v2)); \ - } -HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3) - -#undef HWY_SVE_STORE3 - -// ------------------------------ StoreInterleaved4 - -#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \ - HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \ - HWY_SVE_D(BASE, BITS, N, 
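The ld3/st3-backed ops above (de)interleave structure-of-arrays data in a single instruction each. A hedged sketch that swaps the R and B channels of packed RGB pixels; it assumes Highway's public API and that num_pixels is a multiple of Lanes(d):

#include <cstddef>
#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void SwapRB(const uint8_t* HWY_RESTRICT rgb, uint8_t* HWY_RESTRICT out,
            size_t num_pixels) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    hn::Vec<decltype(d)> r, g, b;
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);   // deinterleave
    hn::StoreInterleaved3(b, g, r, d, out + 3 * i);  // re-interleave, swapped
  }
}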
kPow2) d, \ - HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) { \ - sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, \ - Create4(d, v0, v1, v2, v3)); \ - } -HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4) - -#undef HWY_SVE_STORE4 - -// ================================================== CONVERT - -// ------------------------------ PromoteTo - -// Same sign -#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) NAME( \ - HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \ - return sv##OP##_##CHAR##BITS(v); \ - } - -HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo) -HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo) -HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo) - -// 2x -template -HWY_API svuint32_t PromoteTo(Simd dto, svuint8_t vfrom) { - const RepartitionToWide> d2; - return PromoteTo(dto, PromoteTo(d2, vfrom)); -} -template -HWY_API svint32_t PromoteTo(Simd dto, svint8_t vfrom) { - const RepartitionToWide> d2; - return PromoteTo(dto, PromoteTo(d2, vfrom)); -} -template -HWY_API svuint64_t PromoteTo(Simd dto, svuint16_t vfrom) { - const RepartitionToWide> d2; - return PromoteTo(dto, PromoteTo(d2, vfrom)); -} -template -HWY_API svint64_t PromoteTo(Simd dto, svint16_t vfrom) { - const RepartitionToWide> d2; - return PromoteTo(dto, PromoteTo(d2, vfrom)); -} - -// 3x -template -HWY_API svuint64_t PromoteTo(Simd dto, svuint8_t vfrom) { - const RepartitionToNarrow d4; - const RepartitionToNarrow d2; - return PromoteTo(dto, PromoteTo(d4, PromoteTo(d2, vfrom))); -} -template -HWY_API svint64_t PromoteTo(Simd dto, svint8_t vfrom) { - const RepartitionToNarrow d4; - const RepartitionToNarrow d2; - return PromoteTo(dto, PromoteTo(d4, PromoteTo(d2, vfrom))); -} - -// Sign change -template ), sizeof(TFromV))> -HWY_API VFromD PromoteTo(D di, V v) { - const RebindToUnsigned du; - return BitCast(di, PromoteTo(du, v)); -} - -// ------------------------------ PromoteTo F - -// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. -#ifdef HWY_NATIVE_F16C -#undef HWY_NATIVE_F16C -#else -#define HWY_NATIVE_F16C -#endif - -// Unlike Highway's ZipLower, this returns the same type. -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1) -} // namespace detail - -template -HWY_API svfloat32_t PromoteTo(Simd /* d */, - const svfloat16_t v) { - // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so - // first replicate each lane once. - const svfloat16_t vv = detail::ZipLowerSame(v, v); - return svcvt_f32_f16_x(detail::PTrue(Simd()), vv); -} - -template -HWY_API svfloat64_t PromoteTo(Simd /* d */, - const svfloat32_t v) { - const svfloat32_t vv = detail::ZipLowerSame(v, v); - return svcvt_f64_f32_x(detail::PTrue(Simd()), vv); -} - -template -HWY_API svfloat64_t PromoteTo(Simd /* d */, - const svint32_t v) { - const svint32_t vv = detail::ZipLowerSame(v, v); - return svcvt_f64_s32_x(detail::PTrue(Simd()), vv); -} - -// For 16-bit Compress -namespace detail { -HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi) -#undef HWY_SVE_PROMOTE_TO - -template -HWY_API svfloat32_t PromoteUpperTo(Simd df, svfloat16_t v) { - const RebindToUnsigned du; - const RepartitionToNarrow dn; - return BitCast(df, PromoteUpperTo(du, BitCast(dn, v))); -} - -} // namespace detail - -// ------------------------------ DemoteTo U - -namespace detail { - -// Saturates unsigned vectors to half/quarter-width TN. 
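PromoteTo widens the lower lanes, and the 2x/3x overloads above chain unpklo so that e.g. u8 can be widened directly to u32. A hedged usage sketch that accumulates bytes in 32-bit lanes; it assumes the total fits in 32 bits and Highway's public API, and SumBytes is an illustrative name:

#include <cstddef>
#include <cstdint>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

uint32_t SumBytes(const uint8_t* bytes, size_t n) {
  const hn::ScalableTag<uint32_t> d32;
  const hn::Rebind<uint8_t, decltype(d32)> d8;  // same lane count, u8 lanes
  const size_t N = hn::Lanes(d32);
  auto sum = hn::Zero(d32);
  size_t i = 0;
  for (; i + N <= n; i += N) {
    sum = hn::Add(sum, hn::PromoteTo(d32, hn::LoadU(d8, bytes + i)));
  }
  uint32_t total = hn::ReduceSum(d32, sum);
  for (; i < n; ++i) total += bytes[i];  // scalar tail
  return total;
}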
-template -VU SaturateU(VU v) { - return detail::MinN(v, static_cast>(LimitsMax())); -} - -// Saturates unsigned vectors to half/quarter-width TN. -template -VI SaturateI(VI v) { - return detail::MinN(detail::MaxN(v, LimitsMin()), LimitsMax()); -} - -} // namespace detail - -template -HWY_API svuint8_t DemoteTo(Simd dn, const svint16_t v) { -#if HWY_SVE_HAVE_2 - const svuint8_t vn = BitCast(dn, svqxtunb_s16(v)); -#else - const DFromV di; - const RebindToUnsigned du; - using TN = TFromD; - // First clamp negative numbers to zero and cast to unsigned. - const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0)); - // Saturate to unsigned-max and halve the width. - const svuint8_t vn = BitCast(dn, detail::SaturateU(clamped)); -#endif - return svuzp1_u8(vn, vn); -} - -template -HWY_API svuint16_t DemoteTo(Simd dn, const svint32_t v) { -#if HWY_SVE_HAVE_2 - const svuint16_t vn = BitCast(dn, svqxtunb_s32(v)); -#else - const DFromV di; - const RebindToUnsigned du; - using TN = TFromD; - // First clamp negative numbers to zero and cast to unsigned. - const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0)); - // Saturate to unsigned-max and halve the width. - const svuint16_t vn = BitCast(dn, detail::SaturateU(clamped)); -#endif - return svuzp1_u16(vn, vn); -} - -template -HWY_API svuint8_t DemoteTo(Simd dn, const svint32_t v) { - const DFromV di; - const RebindToUnsigned du; - const RepartitionToNarrow d2; -#if HWY_SVE_HAVE_2 - const svuint16_t cast16 = BitCast(d2, svqxtnb_u16(svqxtunb_s32(v))); -#else - using TN = TFromD; - // First clamp negative numbers to zero and cast to unsigned. - const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0)); - // Saturate to unsigned-max and quarter the width. - const svuint16_t cast16 = BitCast(d2, detail::SaturateU(clamped)); -#endif - const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16)); - return svuzp1_u8(x2, x2); -} - -HWY_API svuint8_t U8FromU32(const svuint32_t v) { - const DFromV du32; - const RepartitionToNarrow du16; - const RepartitionToNarrow du8; - - const svuint16_t cast16 = BitCast(du16, v); - const svuint16_t x2 = svuzp1_u16(cast16, cast16); - const svuint8_t cast8 = BitCast(du8, x2); - return svuzp1_u8(cast8, cast8); -} - -template -HWY_API svuint8_t DemoteTo(Simd dn, const svuint16_t v) { -#if HWY_SVE_HAVE_2 - const svuint8_t vn = BitCast(dn, svqxtnb_u16(v)); -#else - using TN = TFromD; - const svuint8_t vn = BitCast(dn, detail::SaturateU(v)); -#endif - return svuzp1_u8(vn, vn); -} - -template -HWY_API svuint16_t DemoteTo(Simd dn, const svuint32_t v) { -#if HWY_SVE_HAVE_2 - const svuint16_t vn = BitCast(dn, svqxtnb_u32(v)); -#else - using TN = TFromD; - const svuint16_t vn = BitCast(dn, detail::SaturateU(v)); -#endif - return svuzp1_u16(vn, vn); -} - -template -HWY_API svuint8_t DemoteTo(Simd dn, const svuint32_t v) { - using TN = TFromD; - return U8FromU32(detail::SaturateU(v)); -} - -// ------------------------------ Truncations - -template -HWY_API svuint8_t TruncateTo(Simd /* tag */, - const svuint64_t v) { - const DFromV d; - const svuint8_t v1 = BitCast(d, v); - const svuint8_t v2 = svuzp1_u8(v1, v1); - const svuint8_t v3 = svuzp1_u8(v2, v2); - return svuzp1_u8(v3, v3); -} - -template -HWY_API svuint16_t TruncateTo(Simd /* tag */, - const svuint64_t v) { - const DFromV d; - const svuint16_t v1 = BitCast(d, v); - const svuint16_t v2 = svuzp1_u16(v1, v1); - return svuzp1_u16(v2, v2); -} - -template -HWY_API svuint32_t TruncateTo(Simd /* tag */, - const svuint64_t v) { - const DFromV d; - const svuint32_t v1 = BitCast(d, v); - 
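On targets without SVE2's saturating-narrow instructions, the DemoteTo overloads above clamp negatives to zero, saturate to the narrow type's maximum, and then drop the upper half of each lane. A scalar model of the i16 to u8 path, illustrative only:

#include <algorithm>
#include <cstdint>

uint8_t DemoteI16ToU8(int16_t x) {
  const int16_t clamped = std::max<int16_t>(x, 0);               // MaxN(v, 0)
  return static_cast<uint8_t>(std::min<int16_t>(clamped, 255));  // SaturateU
}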
return svuzp1_u32(v1, v1); -} - -template -HWY_API svuint8_t TruncateTo(Simd /* tag */, - const svuint32_t v) { - const DFromV d; - const svuint8_t v1 = BitCast(d, v); - const svuint8_t v2 = svuzp1_u8(v1, v1); - return svuzp1_u8(v2, v2); -} - -template -HWY_API svuint16_t TruncateTo(Simd /* tag */, - const svuint32_t v) { - const DFromV d; - const svuint16_t v1 = BitCast(d, v); - return svuzp1_u16(v1, v1); -} - -template -HWY_API svuint8_t TruncateTo(Simd /* tag */, - const svuint16_t v) { - const DFromV d; - const svuint8_t v1 = BitCast(d, v); - return svuzp1_u8(v1, v1); -} - -// ------------------------------ DemoteTo I - -template -HWY_API svint8_t DemoteTo(Simd dn, const svint16_t v) { -#if HWY_SVE_HAVE_2 - const svint8_t vn = BitCast(dn, svqxtnb_s16(v)); -#else - using TN = TFromD; - const svint8_t vn = BitCast(dn, detail::SaturateI(v)); -#endif - return svuzp1_s8(vn, vn); -} - -template -HWY_API svint16_t DemoteTo(Simd dn, const svint32_t v) { -#if HWY_SVE_HAVE_2 - const svint16_t vn = BitCast(dn, svqxtnb_s32(v)); -#else - using TN = TFromD; - const svint16_t vn = BitCast(dn, detail::SaturateI(v)); -#endif - return svuzp1_s16(vn, vn); -} - -template -HWY_API svint8_t DemoteTo(Simd dn, const svint32_t v) { - const RepartitionToWide d2; -#if HWY_SVE_HAVE_2 - const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v))); -#else - using TN = TFromD; - const svint16_t cast16 = BitCast(d2, detail::SaturateI(v)); -#endif - const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16)); - return BitCast(dn, svuzp1_s8(v2, v2)); -} - -// ------------------------------ I64/U64 DemoteTo - -template -HWY_API svint32_t DemoteTo(Simd dn, const svint64_t v) { - const Rebind du64; - const RebindToUnsigned dn_u; -#if HWY_SVE_HAVE_2 - const svuint64_t vn = BitCast(du64, svqxtnb_s64(v)); -#else - using TN = TFromD; - const svuint64_t vn = BitCast(du64, detail::SaturateI(v)); -#endif - return BitCast(dn, TruncateTo(dn_u, vn)); -} - -template -HWY_API svint16_t DemoteTo(Simd dn, const svint64_t v) { - const Rebind du64; - const RebindToUnsigned dn_u; -#if HWY_SVE_HAVE_2 - const svuint64_t vn = BitCast(du64, svqxtnb_s32(svqxtnb_s64(v))); -#else - using TN = TFromD; - const svuint64_t vn = BitCast(du64, detail::SaturateI(v)); -#endif - return BitCast(dn, TruncateTo(dn_u, vn)); -} - -template -HWY_API svint8_t DemoteTo(Simd dn, const svint64_t v) { - const Rebind du64; - const RebindToUnsigned dn_u; - using TN = TFromD; - const svuint64_t vn = BitCast(du64, detail::SaturateI(v)); - return BitCast(dn, TruncateTo(dn_u, vn)); -} - -template -HWY_API svuint32_t DemoteTo(Simd dn, const svint64_t v) { - const Rebind du64; -#if HWY_SVE_HAVE_2 - const svuint64_t vn = BitCast(du64, svqxtunb_s64(v)); -#else - using TN = TFromD; - // First clamp negative numbers to zero and cast to unsigned. - const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0)); - // Saturate to unsigned-max - const svuint64_t vn = detail::SaturateU(clamped); -#endif - return TruncateTo(dn, vn); -} - -template -HWY_API svuint16_t DemoteTo(Simd dn, const svint64_t v) { - const Rebind du64; -#if HWY_SVE_HAVE_2 - const svuint64_t vn = BitCast(du64, svqxtnb_u32(svqxtunb_s64(v))); -#else - using TN = TFromD; - // First clamp negative numbers to zero and cast to unsigned. 
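TruncateTo above simply discards the upper bytes of each lane via uzp1, whereas DemoteTo saturates first. A scalar contrast: for 0x1234 the former yields 0x34, the latter 0xFF:

#include <cstdint>

uint8_t TruncateU16ToU8(uint16_t x) {
  return static_cast<uint8_t>(x);  // modular: keep only the low byte
}

uint8_t DemoteU16ToU8(uint16_t x) {
  return static_cast<uint8_t>(x > 0xFF ? 0xFF : x);  // saturating
}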
- const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0)); - // Saturate to unsigned-max - const svuint64_t vn = detail::SaturateU(clamped); -#endif - return TruncateTo(dn, vn); -} - -template -HWY_API svuint8_t DemoteTo(Simd dn, const svint64_t v) { - const Rebind du64; - using TN = TFromD; - // First clamp negative numbers to zero and cast to unsigned. - const svuint64_t clamped = BitCast(du64, detail::MaxN(v, 0)); - // Saturate to unsigned-max - const svuint64_t vn = detail::SaturateU(clamped); - return TruncateTo(dn, vn); -} - -template -HWY_API svuint32_t DemoteTo(Simd dn, const svuint64_t v) { - const Rebind du64; -#if HWY_SVE_HAVE_2 - const svuint64_t vn = BitCast(du64, svqxtnb_u64(v)); -#else - using TN = TFromD; - const svuint64_t vn = BitCast(du64, detail::SaturateU(v)); -#endif - return TruncateTo(dn, vn); -} - -template -HWY_API svuint16_t DemoteTo(Simd dn, const svuint64_t v) { - const Rebind du64; -#if HWY_SVE_HAVE_2 - const svuint64_t vn = BitCast(du64, svqxtnb_u32(svqxtnb_u64(v))); -#else - using TN = TFromD; - const svuint64_t vn = BitCast(du64, detail::SaturateU(v)); -#endif - return TruncateTo(dn, vn); -} - -template -HWY_API svuint8_t DemoteTo(Simd dn, const svuint64_t v) { - const Rebind du64; - using TN = TFromD; - const svuint64_t vn = BitCast(du64, detail::SaturateU(v)); - return TruncateTo(dn, vn); -} - -// ------------------------------ ConcatEven/ConcatOdd - -// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the -// full vector length, not rounded down to a power of two as we require). -namespace detail { - -#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_INLINE HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ - return sv##OP##_##CHAR##BITS(lo, hi); \ - } -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1) -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2) -#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q) -HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q) -#endif -#undef HWY_SVE_CONCAT_EVERY_SECOND - -// Used to slide up / shift whole register left; mask indicates which range -// to take from lo, and the rest is filled from hi starting at its lowest. -#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME( \ - HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \ - return sv##OP##_##CHAR##BITS(mask, lo, hi); \ - } -HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice) -#undef HWY_SVE_SPLICE - -} // namespace detail - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { -#if HWY_SVE_IS_POW2 - if (detail::IsFull(d)) return detail::ConcatOddFull(hi, lo); -#endif - const VFromD hi_odd = detail::ConcatOddFull(hi, hi); - const VFromD lo_odd = detail::ConcatOddFull(lo, lo); - return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { -#if HWY_SVE_IS_POW2 - if (detail::IsFull(d)) return detail::ConcatEvenFull(hi, lo); -#endif - const VFromD hi_odd = detail::ConcatEvenFull(hi, hi); - const VFromD lo_odd = detail::ConcatEvenFull(lo, lo); - return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2)); -} - -// ------------------------------ DemoteTo F - -// We already toggled HWY_NATIVE_F16C above. 
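ConcatEven/ConcatOdd pack the even- or odd-indexed lanes of two vectors, with lo supplying the lower half of the result; the non-power-of-two fixup above splices because uzp1/uzp2 operate on the full hardware vector. A scalar model of the intended semantics:

#include <cstddef>

// Even lanes of lo form the lower half, even lanes of hi the upper half.
void ConcatEvenScalar(const int* hi, const int* lo, int* out, size_t lanes) {
  for (size_t i = 0; i < lanes / 2; ++i) {
    out[i] = lo[2 * i];
    out[lanes / 2 + i] = hi[2 * i];
  }
}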
- -template -HWY_API svfloat16_t DemoteTo(Simd d, const svfloat32_t v) { - const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v); - return detail::ConcatEvenFull(in_even, - in_even); // lower half -} - -template -HWY_API VBF16 DemoteTo(Simd dbf16, svfloat32_t v) { - const svuint16_t in_even = BitCast(ScalableTag(), v); - return BitCast(dbf16, detail::ConcatOddFull(in_even, in_even)); // lower half -} - -template -HWY_API svfloat32_t DemoteTo(Simd d, const svfloat64_t v) { - const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v); - return detail::ConcatEvenFull(in_even, - in_even); // lower half -} - -template -HWY_API svint32_t DemoteTo(Simd d, const svfloat64_t v) { - const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v); - return detail::ConcatEvenFull(in_even, - in_even); // lower half -} - -// ------------------------------ ConvertTo F - -#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP) \ - /* signed integers */ \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \ - return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ - } \ - /* unsigned integers */ \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \ - return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ - } \ - /* Truncates (rounds toward zero). */ \ - template \ - HWY_API HWY_SVE_V(int, BITS) \ - NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v); \ - } - -// API only requires f32 but we provide f64 for use by Iota. -HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt) -#undef HWY_SVE_CONVERT - -// ------------------------------ NearestInt (Round, ConvertTo) -template >> -HWY_API VFromD NearestInt(VF v) { - // No single instruction, round then truncate. - return ConvertTo(DI(), Round(v)); -} - -// ------------------------------ Iota (Add, ConvertTo) - -#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \ - HWY_SVE_T(BASE, BITS) first) { \ - return sv##OP##_##CHAR##BITS(first, 1); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index) -#undef HWY_SVE_IOTA - -template -HWY_API VFromD Iota(const D d, TFromD first) { - const RebindToSigned di; - return detail::AddN(ConvertTo(d, Iota(di, 0)), first); -} - -// ------------------------------ InterleaveLower - -template -HWY_API V InterleaveLower(D d, const V a, const V b) { - static_assert(IsSame, TFromV>(), "D/V mismatch"); -#if HWY_TARGET == HWY_SVE2_128 - (void)d; - return detail::ZipLowerSame(a, b); -#else - // Move lower halves of blocks to lower half of vector. - const Repartition d64; - const auto a64 = BitCast(d64, a); - const auto b64 = BitCast(d64, b); - const auto a_blocks = detail::ConcatEvenFull(a64, a64); // lower half - const auto b_blocks = detail::ConcatEvenFull(b64, b64); - return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks)); -#endif -} - -template -HWY_API V InterleaveLower(const V a, const V b) { - return InterleaveLower(DFromV(), a, b); -} - -// ------------------------------ InterleaveUpper - -// Only use zip2 if vector are a powers of two, otherwise getting the actual -// "upper half" requires MaskUpperHalf. -#if HWY_TARGET == HWY_SVE2_128 -namespace detail { -// Unlike Highway's ZipUpper, this returns the same type. 
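InterleaveLower follows Highway's per-128-bit-block semantics, which is why the general path above first gathers each block's lower half before zipping. A scalar model for a single block of N lanes:

#include <cstddef>

// out = a0, b0, a1, b1, ..., a(N/2-1), b(N/2-1) within one block.
void InterleaveLowerBlock(const int* a, const int* b, int* out,
                          size_t lanes_per_block) {
  for (size_t i = 0; i < lanes_per_block / 2; ++i) {
    out[2 * i + 0] = a[i];
    out[2 * i + 1] = b[i];
  }
}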
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2) -} // namespace detail -#endif - -// Full vector: guaranteed to have at least one block -template , - hwy::EnableIf* = nullptr> -HWY_API V InterleaveUpper(D d, const V a, const V b) { -#if HWY_TARGET == HWY_SVE2_128 - (void)d; - return detail::ZipUpperSame(a, b); -#else - // Move upper halves of blocks to lower half of vector. - const Repartition d64; - const auto a64 = BitCast(d64, a); - const auto b64 = BitCast(d64, b); - const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half - const auto b_blocks = detail::ConcatOddFull(b64, b64); - return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks)); -#endif -} - -// Capped/fraction: need runtime check -template , - hwy::EnableIf* = nullptr> -HWY_API V InterleaveUpper(D d, const V a, const V b) { - // Less than one block: treat as capped - if (Lanes(d) * sizeof(TFromD) < 16) { - const Half d2; - return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b)); - } - return InterleaveUpper(DFromV(), a, b); -} - -// ------------------------------ Per4LaneBlockShuffle - -namespace detail { - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, - hwy::SizeTag /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - const DFromV d; - const RebindToUnsigned du; - const RepartitionToWide dw; - - const auto evens = BitCast(dw, ConcatEvenFull(v, v)); - return BitCast(d, ZipLowerSame(evens, evens)); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, - hwy::SizeTag /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - const DFromV d; - const RebindToUnsigned du; - const RepartitionToWide dw; - - const auto odds = BitCast(dw, ConcatOddFull(v, v)); - return BitCast(d, ZipLowerSame(odds, odds)); -} - -} // namespace detail - -// ================================================== COMBINE - -namespace detail { - -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE -template -svbool_t MaskLowerHalf(D d) { - switch (Lanes(d)) { - case 32: - return svptrue_pat_b8(SV_VL16); - case 16: - return svptrue_pat_b8(SV_VL8); - case 8: - return svptrue_pat_b8(SV_VL4); - case 4: - return svptrue_pat_b8(SV_VL2); - default: - return svptrue_pat_b8(SV_VL1); - } -} -template -svbool_t MaskLowerHalf(D d) { - switch (Lanes(d)) { - case 16: - return svptrue_pat_b16(SV_VL8); - case 8: - return svptrue_pat_b16(SV_VL4); - case 4: - return svptrue_pat_b16(SV_VL2); - default: - return svptrue_pat_b16(SV_VL1); - } -} -template -svbool_t MaskLowerHalf(D d) { - switch (Lanes(d)) { - case 8: - return svptrue_pat_b32(SV_VL4); - case 4: - return svptrue_pat_b32(SV_VL2); - default: - return svptrue_pat_b32(SV_VL1); - } -} -template -svbool_t MaskLowerHalf(D d) { - switch (Lanes(d)) { - case 4: - return svptrue_pat_b64(SV_VL2); - default: - return svptrue_pat_b64(SV_VL1); - } -} -#endif -#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE -template -svbool_t MaskLowerHalf(D d) { - switch (Lanes(d)) { - case 16: - return svptrue_pat_b8(SV_VL8); - case 8: - return svptrue_pat_b8(SV_VL4); - case 4: - return svptrue_pat_b8(SV_VL2); - case 2: - case 1: - default: - return svptrue_pat_b8(SV_VL1); - } -} -template -svbool_t MaskLowerHalf(D d) { - switch (Lanes(d)) { - case 8: - return svptrue_pat_b16(SV_VL4); - case 4: - return svptrue_pat_b16(SV_VL2); - case 2: - case 1: - default: - return svptrue_pat_b16(SV_VL1); - } -} -template -svbool_t MaskLowerHalf(D d) { - return svptrue_pat_b32(Lanes(d) == 4 ? 
SV_VL2 : SV_VL1); -} -template -svbool_t MaskLowerHalf(D /*d*/) { - return svptrue_pat_b64(SV_VL1); -} -#endif // HWY_TARGET == HWY_SVE2_128 -#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128 -template -svbool_t MaskLowerHalf(D d) { - return FirstN(d, Lanes(d) / 2); -} -#endif - -template -svbool_t MaskUpperHalf(D d) { - // TODO(janwas): WHILEGE on SVE2 - if (HWY_SVE_IS_POW2 && IsFull(d)) { - return Not(MaskLowerHalf(d)); - } - - // For Splice to work as intended, make sure bits above Lanes(d) are zero. - return AndNot(MaskLowerHalf(d), detail::MakeMask(d)); -} - -// Right-shift vector pair by constexpr; can be used to slide down (=N) or up -// (=Lanes()-N). -#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \ - return sv##OP##_##CHAR##BITS(lo, hi, kIndex); \ - } -HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext) -#undef HWY_SVE_EXT - -} // namespace detail - -// ------------------------------ ConcatUpperLower -template -HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) { - return IfThenElse(detail::MaskLowerHalf(d), lo, hi); -} - -// ------------------------------ ConcatLowerLower -template -HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) { - if (detail::IsFull(d)) { -#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256 - return detail::ConcatEvenBlocks(hi, lo); -#endif -#if HWY_TARGET == HWY_SVE2_128 - const Repartition du64; - const auto lo64 = BitCast(du64, lo); - return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi))); -#endif - } - return detail::Splice(hi, lo, detail::MaskLowerHalf(d)); -} - -// ------------------------------ ConcatLowerUpper -template -HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) { -#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes - if (detail::IsFull(d)) { - return detail::Ext(hi, lo); - } -#endif - return detail::Splice(hi, lo, detail::MaskUpperHalf(d)); -} - -// ------------------------------ ConcatUpperUpper -template -HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) { - if (detail::IsFull(d)) { -#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256 - return detail::ConcatOddBlocks(hi, lo); -#endif -#if HWY_TARGET == HWY_SVE2_128 - const Repartition du64; - const auto lo64 = BitCast(du64, lo); - return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi))); -#endif - } - const svbool_t mask_upper = detail::MaskUpperHalf(d); - const V lo_upper = detail::Splice(lo, lo, mask_upper); - return IfThenElse(mask_upper, hi, lo_upper); -} - -// ------------------------------ Combine -template -HWY_API VFromD Combine(const D d, const V2 hi, const V2 lo) { - return ConcatLowerLower(d, hi, lo); -} - -// ------------------------------ ZeroExtendVector -template -HWY_API V ZeroExtendVector(const D d, const V lo) { - return Combine(d, Zero(Half()), lo); -} - -// ------------------------------ Lower/UpperHalf - -template -HWY_API V LowerHalf(D2 /* tag */, const V v) { - return v; -} - -template -HWY_API V LowerHalf(const V v) { - return v; -} - -template -HWY_API V UpperHalf(const DH dh, const V v) { - const Twice d; - // Cast so that we support bfloat16_t. 
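Combine places the half-width vector lo in the lower half of the result and hi in the upper half; ZeroExtendVector is the special case hi = 0. A scalar model:

#include <cstddef>

void CombineScalar(const float* hi, const float* lo, float* out, size_t lanes) {
  for (size_t i = 0; i < lanes / 2; ++i) {
    out[i] = lo[i];              // lower half from lo
    out[lanes / 2 + i] = hi[i];  // upper half from hi
  }
}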
- const RebindToUnsigned du; - const VFromD vu = BitCast(du, v); -#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes - return BitCast(d, detail::Ext(vu, vu)); -#else - const MFromD mask = detail::MaskUpperHalf(du); - return BitCast(d, detail::Splice(vu, vu, mask)); -#endif -} - -// ================================================== REDUCE - -// These return T, whereas the Highway op returns a broadcasted vector. -namespace detail { -#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ - /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \ - using T = HWY_SVE_T(BASE, BITS); \ - using TU = MakeUnsigned; \ - constexpr uint64_t kMask = LimitsMax(); \ - return static_cast(static_cast( \ - static_cast(sv##OP##_##CHAR##BITS(pg, v)) & kMask)); \ - } - -#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_##CHAR##BITS(pg, v); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv) -HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv) - -HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv) -HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv) -// NaN if all are -HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv) -HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv) - -#undef HWY_SVE_REDUCE -#undef HWY_SVE_REDUCE_ADD -} // namespace detail - -template -V SumOfLanes(D d, V v) { - return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v)); -} - -template -TFromV ReduceSum(D d, V v) { - return detail::SumOfLanesM(detail::MakeMask(d), v); -} - -template -V MinOfLanes(D d, V v) { - return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v)); -} - -template -V MaxOfLanes(D d, V v) { - return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v)); -} - -// ================================================== SWIZZLE - -// ------------------------------ GetLane - -namespace detail { -#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_INLINE HWY_SVE_T(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ - return sv##OP##_##CHAR##BITS(mask, v); \ - } - -HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta) -HWY_SVE_FOREACH(HWY_SVE_GET_LANE, ExtractLastMatchingLaneM, lastb) -#undef HWY_SVE_GET_LANE -} // namespace detail - -template -HWY_API TFromV GetLane(V v) { - return detail::GetLaneM(v, detail::PFalse()); -} - -// ------------------------------ ExtractLane -template -HWY_API TFromV ExtractLane(V v, size_t i) { - return detail::GetLaneM(v, FirstN(DFromV(), i)); -} - -// ------------------------------ InsertLane (IfThenElse) -template -HWY_API V InsertLane(const V v, size_t i, TFromV t) { - const DFromV d; - const auto is_i = detail::EqN(Iota(d, 0), static_cast>(i)); - return IfThenElse(RebindMask(d, is_i), Set(d, t), v); -} - -// ------------------------------ DupEven - -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1) -} // namespace detail - -template -HWY_API V DupEven(const V v) { - return detail::InterleaveEven(v, v); -} - -// ------------------------------ DupOdd - -namespace detail { -HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2) -} // namespace detail - -template -HWY_API V DupOdd(const V v) { - return detail::InterleaveOdd(v, v); -} - -// ------------------------------ OddEven - -#if HWY_SVE_HAVE_2 - -#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API 
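ReduceSum above returns a scalar, which makes it a natural final step for accumulation loops. A hedged dot-product sketch assuming Highway's public API and that n is a multiple of Lanes(d); Dot is an illustrative name:

#include <cstddef>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

float Dot(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b, size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  auto acc = hn::Zero(d);
  for (size_t i = 0; i < n; i += N) {
    acc = hn::MulAdd(hn::LoadU(d, a + i), hn::LoadU(d, b + i), acc);
  }
  return hn::ReduceSum(d, acc);  // horizontal add of the accumulator lanes
}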
HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \ - return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n) -#undef HWY_SVE_ODD_EVEN - -template -HWY_API V OddEven(const V odd, const V even) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even))); -} - -#else - -template -HWY_API V OddEven(const V odd, const V even) { - const auto odd_in_even = detail::Ext<1>(odd, odd); - return detail::InterleaveEven(even, odd_in_even); -} - -#endif // HWY_TARGET - -// ------------------------------ OddEvenBlocks -template -HWY_API V OddEvenBlocks(const V odd, const V even) { - const DFromV d; -#if HWY_TARGET == HWY_SVE_256 - return ConcatUpperLower(d, odd, even); -#elif HWY_TARGET == HWY_SVE2_128 - (void)odd; - (void)d; - return even; -#else - const RebindToUnsigned du; - using TU = TFromD; - constexpr size_t kShift = CeilLog2(16 / sizeof(TU)); - const auto idx_block = ShiftRight(Iota(du, 0)); - const auto lsb = detail::AndN(idx_block, static_cast(1)); - const svbool_t is_even = detail::EqN(lsb, static_cast(0)); - return IfThenElse(is_even, even, odd); -#endif -} - -// ------------------------------ TableLookupLanes - -template -HWY_API VFromD> IndicesFromVec(D d, VI vec) { - using TI = TFromV; - static_assert(sizeof(TFromD) == sizeof(TI), "Index/lane size mismatch"); - const RebindToUnsigned du; - const auto indices = BitCast(du, vec); -#if HWY_IS_DEBUG_BUILD - using TU = MakeUnsigned; - const size_t twice_max_lanes = Lanes(d) * 2; - HWY_DASSERT(AllTrue( - du, Eq(indices, - detail::AndN(indices, static_cast(twice_max_lanes - 1))))); -#else - (void)d; -#endif - return indices; -} - -template -HWY_API VFromD> SetTableIndices(D d, const TI* idx) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); - return IndicesFromVec(d, LoadU(Rebind(), idx)); -} - -#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \ - return sv##OP##_##CHAR##BITS(v, idx); \ - } - -HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl) -#undef HWY_SVE_TABLE - -#if HWY_SVE_HAVE_2 -namespace detail { -#define HWY_SVE_TABLE2(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_TUPLE(BASE, BITS, 2) tuple, HWY_SVE_V(uint, BITS) idx) { \ - return sv##OP##_##CHAR##BITS(tuple, idx); \ - } - -HWY_SVE_FOREACH(HWY_SVE_TABLE2, NativeTwoTableLookupLanes, tbl2) -#undef HWY_SVE_TABLE -} // namespace detail -#endif // HWY_SVE_HAVE_2 - -template -HWY_API VFromD TwoTablesLookupLanes(D d, VFromD a, VFromD b, - VFromD> idx) { - // SVE2 has an instruction for this, but it only works for full 2^n vectors. 
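TableLookupLanes gathers result[i] = v[idx[i]] across the whole vector, and SetTableIndices builds the index vector from memory (with a debug-only bounds check). A hedged sketch that reverses one vector via runtime indices, mainly to show the index plumbing (the dedicated Reverse op below does this directly); it assumes Highway's public API:

#include <cstdint>
#include <vector>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void ReverseOneVector(const int32_t* in, int32_t* out) {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  std::vector<int32_t> idx(N);
  for (size_t i = 0; i < N; ++i) idx[i] = static_cast<int32_t>(N - 1 - i);
  const auto indices = hn::SetTableIndices(d, idx.data());
  hn::StoreU(hn::TableLookupLanes(hn::LoadU(d, in), indices), d, out);
}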
-#if HWY_SVE_HAVE_2 && HWY_SVE_IS_POW2 - if (detail::IsFull(d)) { - return detail::NativeTwoTableLookupLanes(Create2(d, a, b), idx); - } -#endif - const RebindToUnsigned du; - using TU = TFromD; - - const size_t num_of_lanes = Lanes(d); - const auto idx_mod = detail::AndN(idx, static_cast(num_of_lanes - 1)); - const auto sel_a_mask = Eq(idx, idx_mod); - - const auto a_lookup_result = TableLookupLanes(a, idx_mod); - const auto b_lookup_result = TableLookupLanes(b, idx_mod); - return IfThenElse(sel_a_mask, a_lookup_result, b_lookup_result); -} - -template -HWY_API V TwoTablesLookupLanes(V a, V b, - VFromD>> idx) { - const DFromV d; - return TwoTablesLookupLanes(d, a, b, idx); -} - -// ------------------------------ SwapAdjacentBlocks (TableLookupLanes) - -namespace detail { - -template -constexpr size_t LanesPerBlock(Simd d) { - // We might have a capped vector smaller than a block, so honor that. - return HWY_MIN(16 / sizeof(T), MaxLanes(d)); -} - -} // namespace detail - -template -HWY_API V SwapAdjacentBlocks(const V v) { - const DFromV d; -#if HWY_TARGET == HWY_SVE_256 - return ConcatLowerUpper(d, v, v); -#elif HWY_TARGET == HWY_SVE2_128 - (void)d; - return v; -#else - const RebindToUnsigned du; - constexpr auto kLanesPerBlock = - static_cast>(detail::LanesPerBlock(d)); - const VFromD idx = detail::XorN(Iota(du, 0), kLanesPerBlock); - return TableLookupLanes(v, idx); -#endif -} - -// ------------------------------ Reverse - -namespace detail { - -#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_##CHAR##BITS(v); \ - } - -HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev) -#undef HWY_SVE_REVERSE - -} // namespace detail - -template -HWY_API V Reverse(D d, V v) { - using T = TFromD; - const auto reversed = detail::ReverseFull(v); - if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed; - // Shift right to remove extra (non-pow2 and remainder) lanes. - // TODO(janwas): on SVE2, use WHILEGE. - // Avoids FirstN truncating to the return vector size. Must also avoid Not - // because that is limited to SV_POW2. - const ScalableTag dfull; - const svbool_t all_true = detail::AllPTrue(dfull); - const size_t all_lanes = detail::AllHardwareLanes(); - const size_t want_lanes = Lanes(d); - HWY_DASSERT(want_lanes <= all_lanes); - const svbool_t mask = - svnot_b_z(all_true, FirstN(dfull, all_lanes - want_lanes)); - return detail::Splice(reversed, reversed, mask); -} - -// ------------------------------ Reverse2 - -// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
-#ifdef HWY_NATIVE_REVERSE2_8 -#undef HWY_NATIVE_REVERSE2_8 -#else -#define HWY_NATIVE_REVERSE2_8 -#endif - -template -HWY_API VFromD Reverse2(D d, const VFromD v) { - const RebindToUnsigned du; - const RepartitionToWide dw; - return BitCast(d, svrevb_u16_x(detail::PTrue(d), BitCast(dw, v))); -} - -template -HWY_API VFromD Reverse2(D d, const VFromD v) { - const RebindToUnsigned du; - const RepartitionToWide dw; - return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v))); -} - -template -HWY_API VFromD Reverse2(D d, const VFromD v) { - const RebindToUnsigned du; - const RepartitionToWide dw; - return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v))); -} - -template -HWY_API VFromD Reverse2(D d, const VFromD v) { // 3210 -#if HWY_TARGET == HWY_SVE2_128 - if (detail::IsFull(d)) { - return detail::Ext<1>(v, v); - } -#endif - (void)d; - const auto odd_in_even = detail::Ext<1>(v, v); // x321 - return detail::InterleaveEven(odd_in_even, v); // 2301 -} - -// ------------------------------ Reverse4 (TableLookupLanes) - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - const RebindToUnsigned du; - const RepartitionToWide> du32; - return BitCast(d, svrevb_u32_x(detail::PTrue(d), BitCast(du32, v))); -} - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - const RebindToUnsigned du; - const RepartitionToWide> du64; - return BitCast(d, svrevh_u64_x(detail::PTrue(d), BitCast(du64, v))); -} - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - if (HWY_TARGET == HWY_SVE2_128 && detail::IsFull(d)) { - return detail::ReverseFull(v); - } - // TODO(janwas): is this approach faster than Shuffle0123? - const RebindToUnsigned du; - const auto idx = detail::XorN(Iota(du, 0), 3); - return TableLookupLanes(v, idx); -} - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - if (HWY_TARGET == HWY_SVE_256 && detail::IsFull(d)) { - return detail::ReverseFull(v); - } - // TODO(janwas): is this approach faster than Shuffle0123? 
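The generic Reverse4/Reverse8 paths above build their permutation by XOR-ing the lane index with 3 or 7, which reverses lanes within each aligned group. A scalar model of that index trick:

#include <cstddef>

// out[i] = in[i ^ 3] reverses every aligned group of 4 lanes (use ^ 7 for 8).
void Reverse4Scalar(const int* in, int* out, size_t lanes) {
  for (size_t i = 0; i < lanes; ++i) out[i] = in[i ^ 3];
}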
- const RebindToUnsigned du; - const auto idx = detail::XorN(Iota(du, 0), 3); - return TableLookupLanes(v, idx); -} - -// ------------------------------ Reverse8 (TableLookupLanes) - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - const Repartition du64; - return BitCast(d, svrevb_u64_x(detail::PTrue(d), BitCast(du64, v))); -} - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - const RebindToUnsigned du; - const auto idx = detail::XorN(Iota(du, 0), 7); - return TableLookupLanes(v, idx); -} - -// ------------------------------- ReverseBits - -#ifdef HWY_NATIVE_REVERSE_BITS_UI8 -#undef HWY_NATIVE_REVERSE_BITS_UI8 -#else -#define HWY_NATIVE_REVERSE_BITS_UI8 -#endif - -#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#else -#define HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#endif - -#define HWY_SVE_REVERSE_BITS(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - const DFromV d; \ - return sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_REVERSE_BITS, ReverseBits, rbit) -#undef HWY_SVE_REVERSE_BITS - -// ------------------------------ SlideUpLanes - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { - return detail::Splice(v, Zero(d), FirstN(d, amt)); -} - -// ------------------------------ Slide1Up - -#ifdef HWY_NATIVE_SLIDE1_UP_DOWN -#undef HWY_NATIVE_SLIDE1_UP_DOWN -#else -#define HWY_NATIVE_SLIDE1_UP_DOWN -#endif - -template -HWY_API VFromD Slide1Up(D d, VFromD v) { - return SlideUpLanes(d, v, 1); -} - -// ------------------------------ SlideDownLanes (TableLookupLanes) - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { - const RebindToUnsigned du; - using TU = TFromD; - const auto idx = Iota(du, static_cast(amt)); - return IfThenElseZero(FirstN(d, Lanes(d) - amt), TableLookupLanes(v, idx)); -} - -// ------------------------------ Slide1Down - -template -HWY_API VFromD Slide1Down(D d, VFromD v) { - return SlideDownLanes(d, v, 1); -} - -// ------------------------------ Block insert/extract/broadcast ops -#if HWY_TARGET != HWY_SVE2_128 - -#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT -#undef HWY_NATIVE_BLK_INSERT_EXTRACT -#else -#define HWY_NATIVE_BLK_INSERT_EXTRACT -#endif - -template -HWY_API V InsertBlock(V v, V blk_to_insert) { - const DFromV d; - static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), - "Invalid block index"); - -#if HWY_TARGET == HWY_SVE_256 - return (kBlockIdx == 0) ? 
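SlideUpLanes/SlideDownLanes shift a whole vector by a runtime lane count and fill the vacated lanes with zero (Splice with a FirstN mask going up, a table lookup plus IfThenElseZero going down). A scalar model of the single-lane helpers:

#include <cstddef>

void Slide1UpScalar(const int* in, int* out, size_t lanes) {
  out[0] = 0;  // vacated bottom lane becomes zero
  for (size_t i = 1; i < lanes; ++i) out[i] = in[i - 1];
}

void Slide1DownScalar(const int* in, int* out, size_t lanes) {
  for (size_t i = 0; i + 1 < lanes; ++i) out[i] = in[i + 1];
  out[lanes - 1] = 0;  // vacated top lane becomes zero
}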
ConcatUpperLower(d, v, blk_to_insert) - : ConcatLowerLower(d, blk_to_insert, v); -#else - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d); - - constexpr size_t kBlockOffset = - static_cast(kBlockIdx) * kLanesPerBlock; - const auto splice_mask = FirstN(d, kBlockOffset); - const auto sel_lo_mask = FirstN(d, kBlockOffset + kLanesPerBlock); - - const auto splice_result = detail::Splice(blk_to_insert, v, splice_mask); - return IfThenElse(sel_lo_mask, splice_result, v); -#endif -} - -template -HWY_API V ExtractBlock(V v) { - const DFromV d; - static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), - "Invalid block index"); - - if (kBlockIdx == 0) return v; - -#if HWY_TARGET == HWY_SVE_256 - return UpperHalf(Half(), v); -#else - const RebindToUnsigned du; - using TU = TFromD; - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d); - constexpr size_t kBlockOffset = - static_cast(kBlockIdx) * kLanesPerBlock; - const auto splice_mask = - RebindMask(d, detail::LtN(Iota(du, static_cast(0u - kBlockOffset)), - static_cast(kLanesPerBlock))); - return detail::Splice(v, v, splice_mask); -#endif -} - -template -HWY_API V BroadcastBlock(V v) { - const DFromV d; - static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), - "Invalid block index"); - -#if HWY_TARGET == HWY_SVE_256 - return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v) - : ConcatUpperUpper(d, v, v); -#else - const RebindToUnsigned du; - using TU = TFromD; - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d); - constexpr size_t kBlockOffset = - static_cast(kBlockIdx) * kLanesPerBlock; - - const auto idx = detail::AddN( - detail::AndN(Iota(du, TU{0}), static_cast(kLanesPerBlock - 1)), - static_cast(kBlockOffset)); - return TableLookupLanes(v, idx); -#endif -} - -#endif // HWY_TARGET != HWY_SVE2_128 - -// ------------------------------ Compress (PromoteTo) - -template -struct CompressIsPartition { -#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 - // Optimization for 64-bit lanes (could also be applied to 32-bit, but that - // requires a larger table). - enum { value = (sizeof(T) == 8) }; -#else - enum { value = 0 }; -#endif // HWY_TARGET == HWY_SVE_256 -}; - -#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \ - return sv##OP##_##CHAR##BITS(mask, v); \ - } - -#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 -HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact) -HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact) -#else -HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact) -#endif -#undef HWY_SVE_COMPRESS - -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE -template -HWY_API V Compress(V v, svbool_t mask) { - const DFromV d; - const RebindToUnsigned du64; - - // Convert mask into bitfield via horizontal sum (faster than ORV) of masked - // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for - // SetTableIndices. - const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); - const size_t offset = detail::SumOfLanesM(mask, bits); - - // See CompressIsPartition. 
- alignas(16) static constexpr uint64_t table[4 * 16] = { - // PrintCompress64x4Tables - 0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2, - 1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2, - 0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3}; - return TableLookupLanes(v, SetTableIndices(d, table + offset)); -} - -#endif // HWY_TARGET == HWY_SVE_256 -#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE -template -HWY_API V Compress(V v, svbool_t mask) { - // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10 - // swaps upper/lower (the lower half is set to the upper half, and the - // remaining upper half is filled from the lower half of the second v), and - // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10 - // unchanged and map everything else to 00. - const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane - return detail::Splice(v, v, AndNot(maskLL, mask)); -} - -#endif // HWY_TARGET == HWY_SVE2_128 - -template -HWY_API V Compress(V v, svbool_t mask16) { - static_assert(!IsSame(), "Must use overload"); - const DFromV d16; - - // Promote vector and mask to 32-bit - const RepartitionToWide dw; - const auto v32L = PromoteTo(dw, v); - const auto v32H = detail::PromoteUpperTo(dw, v); - const svbool_t mask32L = svunpklo_b(mask16); - const svbool_t mask32H = svunpkhi_b(mask16); - - const auto compressedL = Compress(v32L, mask32L); - const auto compressedH = Compress(v32H, mask32H); - - // Demote to 16-bit (already in range) - separately so we can splice - const V evenL = BitCast(d16, compressedL); - const V evenH = BitCast(d16, compressedH); - const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half - const V v16H = detail::ConcatEvenFull(evenH, evenH); - - // We need to combine two vectors of non-constexpr length, so the only option - // is Splice, which requires us to synthesize a mask. NOTE: this function uses - // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt. - const size_t countL = detail::CountTrueFull(dw, mask32L); - const auto compressed_maskL = FirstN(d16, countL); - return detail::Splice(v16H, v16L, compressed_maskL); -} - -// Must treat float16_t as integers so we can ConcatEven. -HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) { - const DFromV df; - const RebindToSigned di; - return BitCast(df, Compress(BitCast(di, v), mask16)); -} - -// ------------------------------ CompressNot - -// 2 or 4 bytes -template -HWY_API V CompressNot(V v, const svbool_t mask) { - return Compress(v, Not(mask)); -} - -template -HWY_API V CompressNot(V v, svbool_t mask) { -#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE - // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10 - // swaps upper/lower (the lower half is set to the upper half, and the - // remaining upper half is filled from the lower half of the second v), and - // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map - // 01 to 10, and everything else to 00. - const svbool_t maskLL = svzip1_b64(mask, mask); // broadcast lower lane - return detail::Splice(v, v, AndNot(mask, maskLL)); -#endif -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE - const DFromV d; - const RebindToUnsigned du64; - - // Convert mask into bitfield via horizontal sum (faster than ORV) of masked - // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for - // SetTableIndices. 
- const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); - const size_t offset = detail::SumOfLanesM(mask, bits); - - // See CompressIsPartition. - alignas(16) static constexpr uint64_t table[4 * 16] = { - // PrintCompressNot64x4Tables - 0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3, - 0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3, - 2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; - return TableLookupLanes(v, SetTableIndices(d, table + offset)); -#endif // HWY_TARGET == HWY_SVE_256 - - return Compress(v, Not(mask)); -} - -// ------------------------------ CompressBlocksNot -HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) { -#if HWY_TARGET == HWY_SVE2_128 - (void)mask; - return v; -#endif -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE - uint64_t bits = 0; // predicate reg is 32-bit - CopyBytes<4>(&mask, &bits); // not same size - 64-bit more efficient - // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx. - const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u); - // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1]. - alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1, - 0, 1, 2, 3, 0, 1, 2, 3}; - const ScalableTag d; - return TableLookupLanes(v, SetTableIndices(d, table + offset)); -#endif - - return CompressNot(v, mask); -} - -// ------------------------------ CompressStore -template -HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d, - TFromD* HWY_RESTRICT unaligned) { - StoreU(Compress(v, mask), d, unaligned); - return CountTrue(d, mask); -} - -// ------------------------------ CompressBlendedStore -template -HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d, - TFromD* HWY_RESTRICT unaligned) { - const size_t count = CountTrue(d, mask); - const svbool_t store_mask = FirstN(d, count); - BlendedStore(Compress(v, mask), store_mask, d, unaligned); - return count; -} - -// ================================================== MASK (2) - -// ------------------------------ FindKnownLastTrue -template -HWY_API size_t FindKnownLastTrue(D d, svbool_t m) { - const RebindToUnsigned du; - return static_cast(detail::ExtractLastMatchingLaneM( - Iota(du, 0), And(m, detail::MakeMask(d)))); -} - -// ------------------------------ FindLastTrue -template -HWY_API intptr_t FindLastTrue(D d, svbool_t m) { - return AllFalse(d, m) ? intptr_t{-1} - : static_cast(FindKnownLastTrue(d, m)); -} - -// ================================================== BLOCKWISE - -// ------------------------------ CombineShiftRightBytes - -// Prevent accidentally using these for 128-bit vectors - should not be -// necessary. -#if HWY_TARGET != HWY_SVE2_128 -namespace detail { - -// For x86-compatible behaviour mandated by Highway API: TableLookupBytes -// offsets are implicitly relative to the start of their 128-bit block. 
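Compress moves the lanes selected by the mask to the front, which together with CompressBlendedStore above yields stream compaction. A hedged filtering sketch assuming Highway's public API; KeepPositive is an illustrative name:

#include <cstddef>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies only the positive elements of `in` to `out`; returns the new count.
size_t KeepPositive(const float* in, float* out, size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t written = 0, i = 0;
  for (; i + N <= n; i += N) {
    const auto v = hn::LoadU(d, in + i);
    const auto m = hn::Gt(v, hn::Zero(d));
    // CompressBlendedStore writes exactly CountTrue(m) lanes, no padding.
    written += hn::CompressBlendedStore(v, m, d, out + written);
  }
  for (; i < n; ++i) {
    if (in[i] > 0.0f) out[written++] = in[i];
  }
  return written;
}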
-template -HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) { - using T = MakeUnsigned>; - return detail::AndNotN(static_cast(LanesPerBlock(d) - 1), iota0); -} - -template -svbool_t FirstNPerBlock(D d) { - const RebindToUnsigned du; - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); - const svuint8_t idx_mod = - svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, - 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock, - 6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock, - 9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock, - 12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock, - 15 % kLanesPerBlock); - return detail::LtN(BitCast(du, idx_mod), kLanes); -} -template -svbool_t FirstNPerBlock(D d) { - const RebindToUnsigned du; - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); - const svuint16_t idx_mod = - svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, - 3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock, - 6 % kLanesPerBlock, 7 % kLanesPerBlock); - return detail::LtN(BitCast(du, idx_mod), kLanes); -} -template -svbool_t FirstNPerBlock(D d) { - const RebindToUnsigned du; - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); - const svuint32_t idx_mod = - svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock, - 3 % kLanesPerBlock); - return detail::LtN(BitCast(du, idx_mod), kLanes); -} -template -svbool_t FirstNPerBlock(D d) { - const RebindToUnsigned du; - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); - const svuint64_t idx_mod = - svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock); - return detail::LtN(BitCast(du, idx_mod), kLanes); -} - -} // namespace detail -#endif // HWY_TARGET != HWY_SVE2_128 - -template > -HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) { - const Repartition d8; - const auto hi8 = BitCast(d8, hi); - const auto lo8 = BitCast(d8, lo); -#if HWY_TARGET == HWY_SVE2_128 - return BitCast(d, detail::Ext(hi8, lo8)); -#else - const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes)); - const auto lo_down = detail::Ext(lo8, lo8); - const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8); - return BitCast(d, IfThenElse(is_lo, lo_down, hi_up)); -#endif -} - -// ------------------------------ Shuffle2301 -template -HWY_API V Shuffle2301(const V v) { - const DFromV d; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - return Reverse2(d, v); -} - -// ------------------------------ Shuffle2103 -template -HWY_API V Shuffle2103(const V v) { - const DFromV d; - const Repartition d8; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - const svuint8_t v8 = BitCast(d8, v); - return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8)); -} - -// ------------------------------ Shuffle0321 -template -HWY_API V Shuffle0321(const V v) { - const DFromV d; - const Repartition d8; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - const svuint8_t v8 = BitCast(d8, v); - return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8)); -} - -// ------------------------------ Shuffle1032 -template -HWY_API V Shuffle1032(const V v) { - const DFromV d; - const Repartition d8; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - const svuint8_t v8 = BitCast(d8, v); - return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8)); -} - -// ------------------------------ Shuffle01 -template -HWY_API V Shuffle01(const V v) { - const DFromV d; - const 
Repartition d8; - static_assert(sizeof(TFromD) == 8, "Defined for 64-bit types"); - const svuint8_t v8 = BitCast(d8, v); - return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8)); -} - -// ------------------------------ Shuffle0123 -template -HWY_API V Shuffle0123(const V v) { - return Shuffle2301(Shuffle1032(v)); -} - -// ------------------------------ ReverseBlocks (Reverse, Shuffle01) -template > -HWY_API V ReverseBlocks(D d, V v) { -#if HWY_TARGET == HWY_SVE_256 - if (detail::IsFull(d)) { - return SwapAdjacentBlocks(v); - } else if (detail::IsFull(Twice())) { - return v; - } -#elif HWY_TARGET == HWY_SVE2_128 - (void)d; - return v; -#endif - const Repartition du64; - return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v)))); -} - -// ------------------------------ TableLookupBytes - -template -HWY_API VI TableLookupBytes(const V v, const VI idx) { - const DFromV d; - const Repartition du8; -#if HWY_TARGET == HWY_SVE2_128 - return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx))); -#else - const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0)); - const auto idx8 = Add(BitCast(du8, idx), offsets128); - return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8)); -#endif -} - -template -HWY_API VI TableLookupBytesOr0(const V v, const VI idx) { - const DFromV d; - // Mask size must match vector type, so cast everything to this type. - const Repartition di8; - - auto idx8 = BitCast(di8, idx); - const auto msb = detail::LtN(idx8, 0); - - const auto lookup = TableLookupBytes(BitCast(di8, v), idx8); - return BitCast(d, IfThenZeroElse(msb, lookup)); -} - -// ------------------------------ Broadcast - -#ifdef HWY_NATIVE_BROADCASTLANE -#undef HWY_NATIVE_BROADCASTLANE -#else -#define HWY_NATIVE_BROADCASTLANE -#endif - -namespace detail { -#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - return sv##OP##_##CHAR##BITS(v, kLane); \ - } - -HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane) -#undef HWY_SVE_BROADCAST -} // namespace detail - -template -HWY_API V Broadcast(const V v) { - const DFromV d; - const RebindToUnsigned du; - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du); - static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane"); -#if HWY_TARGET == HWY_SVE2_128 - return detail::BroadcastLane(v); -#else - auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0)); - if (kLane != 0) { - idx = detail::AddN(idx, kLane); - } - return TableLookupLanes(v, idx); -#endif -} - -template -HWY_API V BroadcastLane(const V v) { - static_assert(0 <= kLane && kLane < HWY_MAX_LANES_V(V), "Invalid lane"); - return detail::BroadcastLane(v); -} - -// ------------------------------ ShiftLeftLanes - -template > -HWY_API V ShiftLeftLanes(D d, const V v) { - const auto zero = Zero(d); - const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes)); -#if HWY_TARGET == HWY_SVE2_128 - return shifted; -#else - // Match x86 semantics by zeroing lower lanes in 128-bit blocks - return IfThenElse(detail::FirstNPerBlock(d), zero, shifted); -#endif -} - -template -HWY_API V ShiftLeftLanes(const V v) { - return ShiftLeftLanes(DFromV(), v); -} - -// ------------------------------ ShiftRightLanes -template > -HWY_API V ShiftRightLanes(D d, V v) { - // For capped/fractional vectors, clear upper lanes so we shift in zeros. 
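For reference, the x86-style block semantics that FirstNPerBlock enforces can be written out in scalar form. The sketch below is illustrative (the helper name and fixed array size are assumptions, not part of the deleted file): lanes move only within their own 128-bit block and zeros are shifted in.

#include <array>
#include <cstddef>

// kN is assumed to be a multiple of the lanes per 128-bit block.
template <typename T, size_t kN>
std::array<T, kN> ShiftLeftLanesPerBlock(const std::array<T, kN>& v, size_t k_lanes) {
  constexpr size_t kLanesPerBlock = 16 / sizeof(T);
  std::array<T, kN> out{};
  for (size_t i = 0; i < kN; ++i) {
    const size_t within = i % kLanesPerBlock;
    // The lowest k_lanes of each block become zero; other lanes come from the same block.
    out[i] = (within < k_lanes) ? T{0} : v[i - k_lanes];
  }
  return out;
}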
- if (!detail::IsFull(d)) { - v = IfThenElseZero(detail::MakeMask(d), v); - } - -#if HWY_TARGET == HWY_SVE2_128 - return detail::Ext(Zero(d), v); -#else - const auto shifted = detail::Ext(v, v); - // Match x86 semantics by zeroing upper lanes in 128-bit blocks - constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d); - const svbool_t mask = detail::FirstNPerBlock(d); - return IfThenElseZero(mask, shifted); -#endif -} - -// ------------------------------ ShiftLeftBytes - -template > -HWY_API V ShiftLeftBytes(const D d, const V v) { - const Repartition d8; - return BitCast(d, ShiftLeftLanes(BitCast(d8, v))); -} - -template -HWY_API V ShiftLeftBytes(const V v) { - return ShiftLeftBytes(DFromV(), v); -} - -// ------------------------------ ShiftRightBytes -template > -HWY_API V ShiftRightBytes(const D d, const V v) { - const Repartition d8; - return BitCast(d, ShiftRightLanes(d8, BitCast(d8, v))); -} - -// ------------------------------ ZipLower - -template >> -HWY_API VFromD ZipLower(DW dw, V a, V b) { - const RepartitionToNarrow dn; - static_assert(IsSame, TFromV>(), "D/V mismatch"); - return BitCast(dw, InterleaveLower(dn, a, b)); -} -template , class DW = RepartitionToWide> -HWY_API VFromD ZipLower(const V a, const V b) { - return BitCast(DW(), InterleaveLower(D(), a, b)); -} - -// ------------------------------ ZipUpper -template >> -HWY_API VFromD ZipUpper(DW dw, V a, V b) { - const RepartitionToNarrow dn; - static_assert(IsSame, TFromV>(), "D/V mismatch"); - return BitCast(dw, InterleaveUpper(dn, a, b)); -} - -// ================================================== Ops with dependencies - -// ------------------------------ PromoteTo bfloat16 (ZipLower) -template -HWY_API svfloat32_t PromoteTo(Simd df32, VBF16 v) { - const ScalableTag du16; - return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), BitCast(du16, v))); -} - -// ------------------------------ ReorderDemote2To (OddEven) - -template -HWY_API VBF16 ReorderDemote2To(Simd dbf16, svfloat32_t a, - svfloat32_t b) { - const RebindToUnsigned du16; - const Repartition du32; - const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b)); - return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); -} - -template -HWY_API svint16_t ReorderDemote2To(Simd d16, svint32_t a, - svint32_t b) { -#if HWY_SVE_HAVE_2 - (void)d16; - const svint16_t a_in_even = svqxtnb_s32(a); - return svqxtnt_s32(a_in_even, b); -#else - const svint16_t a16 = BitCast(d16, detail::SaturateI(a)); - const svint16_t b16 = BitCast(d16, detail::SaturateI(b)); - return detail::InterleaveEven(a16, b16); -#endif -} - -template -HWY_API svuint16_t ReorderDemote2To(Simd d16, svint32_t a, - svint32_t b) { -#if HWY_SVE_HAVE_2 - (void)d16; - const svuint16_t a_in_even = svqxtunb_s32(a); - return svqxtunt_s32(a_in_even, b); -#else - const Repartition du32; - const svuint32_t clamped_a = BitCast(du32, detail::MaxN(a, 0)); - const svuint32_t clamped_b = BitCast(du32, detail::MaxN(b, 0)); - const svuint16_t a16 = BitCast(d16, detail::SaturateU(clamped_a)); - const svuint16_t b16 = BitCast(d16, detail::SaturateU(clamped_b)); - return detail::InterleaveEven(a16, b16); -#endif -} - -template -HWY_API svuint16_t ReorderDemote2To(Simd d16, svuint32_t a, - svuint32_t b) { -#if HWY_SVE_HAVE_2 - (void)d16; - const svuint16_t a_in_even = svqxtnb_u32(a); - return svqxtnt_u32(a_in_even, b); -#else - const svuint16_t a16 = BitCast(d16, detail::SaturateU(a)); - const svuint16_t b16 = BitCast(d16, detail::SaturateU(b)); - return detail::InterleaveEven(a16, b16); -#endif -} - 
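The non-SVE2 ReorderDemote2To path above is easier to follow with a scalar model. The sketch below is illustrative and not part of the deleted file: both inputs are saturated to the narrow range and then interleaved per lane (a0 b0 a1 b1 ...), which is the "reordered" output that ReorderWidenMulAccumulate and RearrangeToOddPlusEven accept; OrderedDemote2To is the variant that concatenates instead.

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

// Saturate a 32-bit value to the int16_t range (what the SaturateI step does per lane).
inline int16_t SatI16(int32_t x) {
  return static_cast<int16_t>(std::min<int32_t>(32767, std::max<int32_t>(-32768, x)));
}

// Four lanes per input, little-endian lane order: narrowed a lands in even
// output lanes and narrowed b in odd ones, i.e. a0 b0 a1 b1 a2 b2 a3 b3.
inline std::array<int16_t, 8> ReorderDemote2Scalar(const std::array<int32_t, 4>& a,
                                                   const std::array<int32_t, 4>& b) {
  std::array<int16_t, 8> out{};
  for (size_t i = 0; i < 4; ++i) {
    out[2 * i + 0] = SatI16(a[i]);
    out[2 * i + 1] = SatI16(b[i]);
  }
  return out;
}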
-template -HWY_API svint8_t ReorderDemote2To(Simd d8, svint16_t a, - svint16_t b) { -#if HWY_SVE_HAVE_2 - (void)d8; - const svint8_t a_in_even = svqxtnb_s16(a); - return svqxtnt_s16(a_in_even, b); -#else - const svint8_t a8 = BitCast(d8, detail::SaturateI(a)); - const svint8_t b8 = BitCast(d8, detail::SaturateI(b)); - return detail::InterleaveEven(a8, b8); -#endif -} - -template -HWY_API svuint8_t ReorderDemote2To(Simd d8, svint16_t a, - svint16_t b) { -#if HWY_SVE_HAVE_2 - (void)d8; - const svuint8_t a_in_even = svqxtunb_s16(a); - return svqxtunt_s16(a_in_even, b); -#else - const Repartition du16; - const svuint16_t clamped_a = BitCast(du16, detail::MaxN(a, 0)); - const svuint16_t clamped_b = BitCast(du16, detail::MaxN(b, 0)); - const svuint8_t a8 = BitCast(d8, detail::SaturateU(clamped_a)); - const svuint8_t b8 = BitCast(d8, detail::SaturateU(clamped_b)); - return detail::InterleaveEven(a8, b8); -#endif -} - -template -HWY_API svuint8_t ReorderDemote2To(Simd d8, svuint16_t a, - svuint16_t b) { -#if HWY_SVE_HAVE_2 - (void)d8; - const svuint8_t a_in_even = svqxtnb_u16(a); - return svqxtnt_u16(a_in_even, b); -#else - const svuint8_t a8 = BitCast(d8, detail::SaturateU(a)); - const svuint8_t b8 = BitCast(d8, detail::SaturateU(b)); - return detail::InterleaveEven(a8, b8); -#endif -} - -template -HWY_API svint32_t ReorderDemote2To(Simd d32, svint64_t a, - svint64_t b) { -#if HWY_SVE_HAVE_2 - (void)d32; - const svint32_t a_in_even = svqxtnb_s64(a); - return svqxtnt_s64(a_in_even, b); -#else - const svint32_t a32 = BitCast(d32, detail::SaturateI(a)); - const svint32_t b32 = BitCast(d32, detail::SaturateI(b)); - return detail::InterleaveEven(a32, b32); -#endif -} - -template -HWY_API svuint32_t ReorderDemote2To(Simd d32, svint64_t a, - svint64_t b) { -#if HWY_SVE_HAVE_2 - (void)d32; - const svuint32_t a_in_even = svqxtunb_s64(a); - return svqxtunt_s64(a_in_even, b); -#else - const Repartition du64; - const svuint64_t clamped_a = BitCast(du64, detail::MaxN(a, 0)); - const svuint64_t clamped_b = BitCast(du64, detail::MaxN(b, 0)); - const svuint32_t a32 = BitCast(d32, detail::SaturateU(clamped_a)); - const svuint32_t b32 = BitCast(d32, detail::SaturateU(clamped_b)); - return detail::InterleaveEven(a32, b32); -#endif -} - -template -HWY_API svuint32_t ReorderDemote2To(Simd d32, svuint64_t a, - svuint64_t b) { -#if HWY_SVE_HAVE_2 - (void)d32; - const svuint32_t a_in_even = svqxtnb_u64(a); - return svqxtnt_u64(a_in_even, b); -#else - const svuint32_t a32 = BitCast(d32, detail::SaturateU(a)); - const svuint32_t b32 = BitCast(d32, detail::SaturateU(b)); - return detail::InterleaveEven(a32, b32); -#endif -} - -template ), - HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2)> -HWY_API VFromD OrderedDemote2To(D dn, V a, V b) { - const Half dnh; - const auto demoted_a = DemoteTo(dnh, a); - const auto demoted_b = DemoteTo(dnh, b); - return Combine(dn, demoted_b, demoted_a); -} - -template -HWY_API VBF16 OrderedDemote2To(D dn, svfloat32_t a, svfloat32_t b) { - const Half dnh; - const RebindToUnsigned dn_u; - const RebindToUnsigned dnh_u; - const auto demoted_a = DemoteTo(dnh, a); - const auto demoted_b = DemoteTo(dnh, b); - return BitCast( - dn, Combine(dn_u, BitCast(dnh_u, demoted_b), BitCast(dnh_u, demoted_a))); -} - -// ------------------------------ ZeroIfNegative (Lt, IfThenElse) -template -HWY_API V ZeroIfNegative(const V v) { - return IfThenZeroElse(detail::LtN(v, 0), v); -} - -// ------------------------------ BroadcastSignBit (ShiftRight) -template -HWY_API V 
BroadcastSignBit(const V v) { - return ShiftRight) * 8 - 1>(v); -} - -// ------------------------------ IfNegativeThenElse (BroadcastSignBit) -template -HWY_API V IfNegativeThenElse(V v, V yes, V no) { - static_assert(IsSigned>(), "Only works for signed/float"); - const DFromV d; - const RebindToSigned di; - - const svbool_t m = detail::LtN(BitCast(di, v), 0); - return IfThenElse(m, yes, no); -} - -// ------------------------------ AverageRound (ShiftRight) - -#if HWY_SVE_HAVE_2 -HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd) -HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd) -#else -template -V AverageRound(const V a, const V b) { - return ShiftRight<1>(detail::AddN(Add(a, b), 1)); -} -#endif // HWY_SVE_HAVE_2 - -// ------------------------------ LoadMaskBits (TestBit) - -// `p` points to at least 8 readable bytes, not all of which need be valid. -template -HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - // TODO(janwas): with SVE2.1, load to vector, then PMOV - const RebindToUnsigned du; - const svuint8_t iota = Iota(du, 0); - - // Load correct number of bytes (bits/8) with 7 zeros after each. - const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits)); - // Replicate bytes 8x such that each byte contains the bit that governs it. - const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota)); - - const svuint8_t bit = - svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); - return TestBit(rep8, bit); -} - -template -HWY_INLINE svbool_t LoadMaskBits(D /* tag */, - const uint8_t* HWY_RESTRICT bits) { - const RebindToUnsigned du; - const Repartition du8; - - // There may be up to 128 bits; avoid reading past the end. - const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits); - - // Replicate bytes 16x such that each lane contains the bit that governs it. - const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0))); - - const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128); - return TestBit(BitCast(du, rep16), bit); -} - -template -HWY_INLINE svbool_t LoadMaskBits(D /* tag */, - const uint8_t* HWY_RESTRICT bits) { - const RebindToUnsigned du; - const Repartition du8; - - // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable, - // so we can skip computing the actual length (Lanes(du)+7)/8. - const svuint8_t bytes = svld1(FirstN(du8, 8), bits); - - // Replicate bytes 32x such that each lane contains the bit that governs it. - const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0))); - - // 1, 2, 4, 8, 16, 32, 64, 128, 1, 2 .. - const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7)); - - return TestBit(BitCast(du, rep32), bit); -} - -template -HWY_INLINE svbool_t LoadMaskBits(D /* tag */, - const uint8_t* HWY_RESTRICT bits) { - const RebindToUnsigned du; - - // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane. - // The "at least 8 byte" guarantee in quick_reference ensures this is safe. - uint32_t mask_bits; - CopyBytes<4>(bits, &mask_bits); // copy from bytes - const auto vbits = Set(du, mask_bits); - - // 2 ^ {0,1, .., 31}, will not have more lanes than that. - const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0)); - - return TestBit(vbits, bit); -} - -// ------------------------------ StoreMaskBits - -namespace detail { - -// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes. 
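The LoadMaskBits overloads above all compute the same per-lane predicate, which a scalar model makes explicit. Illustrative sketch only, not part of the deleted file:

#include <cstddef>
#include <cstdint>

// Lane i is governed by bit (i % 8) of byte (i / 8). The vector code replicates
// each byte across the lanes it governs and then tests the per-lane bit
// 1 << (i % 8), which is what the dupq'd 1,2,4,...,128 pattern provides.
inline bool MaskBitForLane(const uint8_t* bits, size_t lane) {
  const uint8_t byte = bits[lane / 8];
  const uint8_t bit = static_cast<uint8_t>(uint8_t{1} << (lane % 8));
  return (byte & bit) != 0;  // TestBit
}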
-template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - return svdup_n_u8_z(m, 1); -} -template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - const ScalableTag d8; - const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1)); - return detail::ConcatEvenFull(b16, b16); // lower half -} -template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - return U8FromU32(svdup_n_u32_z(m, 1)); -} -template -HWY_INLINE svuint8_t BoolFromMask(svbool_t m) { - const ScalableTag d32; - const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1)); - return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half -} - -// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane. -HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) { - const ScalableTag d8; - const ScalableTag d16; - const ScalableTag d32; - const ScalableTag d64; - // TODO(janwas): could use SVE2 BDEP, but it's optional. - x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x)))); - x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x)))); - x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x)))); - return BitCast(d64, x); -} - -} // namespace detail - -// `p` points to at least 8 writable bytes. -// TODO(janwas): specialize for HWY_SVE_256 -// TODO(janwas): with SVE2.1, use PMOV to store to vector, then StoreU -template -HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) { - svuint64_t bits_in_u64 = - detail::BitsFromBool(detail::BoolFromMask>(m)); - - const size_t num_bits = Lanes(d); - const size_t num_bytes = (num_bits + 8 - 1) / 8; // Round up, see below - - // Truncate each u64 to 8 bits and store to u8. - svst1b_u64(FirstN(ScalableTag(), num_bytes), bits, bits_in_u64); - - // Non-full byte, need to clear the undefined upper bits. Can happen for - // capped/fractional vectors or large T and small hardware vectors. - if (num_bits < 8) { - const int mask = static_cast((1ull << num_bits) - 1); - bits[0] = static_cast(bits[0] & mask); - } - // Else: we wrote full bytes because num_bits is a power of two >= 8. 
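The BitsFromBool shift-and-OR cascade can be checked with one scalar 64-bit lane. Illustrative sketch, not part of the deleted file:

#include <cstdint>

// The input holds eight bytes that are each 0 or 1 (little-endian); three
// shift+OR steps move the i-th byte's bit to bit position i of the low byte.
inline uint8_t PackBoolBytes(uint64_t bool_bytes) {
  uint64_t x = bool_bytes;
  x |= x >> 7;   // b0, b1 now at bits 0..1 (plus harmless copies higher up)
  x |= x >> 14;  // b0..b3 at bits 0..3
  x |= x >> 28;  // b0..b7 at bits 0..7
  return static_cast<uint8_t>(x);  // truncation, as svst1b does per 64-bit lane
}
// Example: bytes {1,0,1,1,0,0,0,1} (lowest byte first) pack to 0b10001101.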
- - return num_bytes; -} - -// ------------------------------ CompressBits (LoadMaskBits) -template -HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { - return Compress(v, LoadMaskBits(DFromV(), bits)); -} - -// ------------------------------ CompressBitsStore (LoadMaskBits) -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); -} - -// ------------------------------ Expand (StoreMaskBits) - -#ifdef HWY_NATIVE_EXPAND -#undef HWY_NATIVE_EXPAND -#else -#define HWY_NATIVE_EXPAND -#endif - -namespace detail { - -HWY_INLINE svuint8_t IndicesForExpandFromBits(uint64_t mask_bits) { - const CappedTag du8; - alignas(16) static constexpr uint8_t table[8 * 256] = { - // PrintExpand8x8Tables - 128, 128, 128, 128, 128, 128, 128, 128, // - 0, 128, 128, 128, 128, 128, 128, 128, // - 128, 0, 128, 128, 128, 128, 128, 128, // - 0, 1, 128, 128, 128, 128, 128, 128, // - 128, 128, 0, 128, 128, 128, 128, 128, // - 0, 128, 1, 128, 128, 128, 128, 128, // - 128, 0, 1, 128, 128, 128, 128, 128, // - 0, 1, 2, 128, 128, 128, 128, 128, // - 128, 128, 128, 0, 128, 128, 128, 128, // - 0, 128, 128, 1, 128, 128, 128, 128, // - 128, 0, 128, 1, 128, 128, 128, 128, // - 0, 1, 128, 2, 128, 128, 128, 128, // - 128, 128, 0, 1, 128, 128, 128, 128, // - 0, 128, 1, 2, 128, 128, 128, 128, // - 128, 0, 1, 2, 128, 128, 128, 128, // - 0, 1, 2, 3, 128, 128, 128, 128, // - 128, 128, 128, 128, 0, 128, 128, 128, // - 0, 128, 128, 128, 1, 128, 128, 128, // - 128, 0, 128, 128, 1, 128, 128, 128, // - 0, 1, 128, 128, 2, 128, 128, 128, // - 128, 128, 0, 128, 1, 128, 128, 128, // - 0, 128, 1, 128, 2, 128, 128, 128, // - 128, 0, 1, 128, 2, 128, 128, 128, // - 0, 1, 2, 128, 3, 128, 128, 128, // - 128, 128, 128, 0, 1, 128, 128, 128, // - 0, 128, 128, 1, 2, 128, 128, 128, // - 128, 0, 128, 1, 2, 128, 128, 128, // - 0, 1, 128, 2, 3, 128, 128, 128, // - 128, 128, 0, 1, 2, 128, 128, 128, // - 0, 128, 1, 2, 3, 128, 128, 128, // - 128, 0, 1, 2, 3, 128, 128, 128, // - 0, 1, 2, 3, 4, 128, 128, 128, // - 128, 128, 128, 128, 128, 0, 128, 128, // - 0, 128, 128, 128, 128, 1, 128, 128, // - 128, 0, 128, 128, 128, 1, 128, 128, // - 0, 1, 128, 128, 128, 2, 128, 128, // - 128, 128, 0, 128, 128, 1, 128, 128, // - 0, 128, 1, 128, 128, 2, 128, 128, // - 128, 0, 1, 128, 128, 2, 128, 128, // - 0, 1, 2, 128, 128, 3, 128, 128, // - 128, 128, 128, 0, 128, 1, 128, 128, // - 0, 128, 128, 1, 128, 2, 128, 128, // - 128, 0, 128, 1, 128, 2, 128, 128, // - 0, 1, 128, 2, 128, 3, 128, 128, // - 128, 128, 0, 1, 128, 2, 128, 128, // - 0, 128, 1, 2, 128, 3, 128, 128, // - 128, 0, 1, 2, 128, 3, 128, 128, // - 0, 1, 2, 3, 128, 4, 128, 128, // - 128, 128, 128, 128, 0, 1, 128, 128, // - 0, 128, 128, 128, 1, 2, 128, 128, // - 128, 0, 128, 128, 1, 2, 128, 128, // - 0, 1, 128, 128, 2, 3, 128, 128, // - 128, 128, 0, 128, 1, 2, 128, 128, // - 0, 128, 1, 128, 2, 3, 128, 128, // - 128, 0, 1, 128, 2, 3, 128, 128, // - 0, 1, 2, 128, 3, 4, 128, 128, // - 128, 128, 128, 0, 1, 2, 128, 128, // - 0, 128, 128, 1, 2, 3, 128, 128, // - 128, 0, 128, 1, 2, 3, 128, 128, // - 0, 1, 128, 2, 3, 4, 128, 128, // - 128, 128, 0, 1, 2, 3, 128, 128, // - 0, 128, 1, 2, 3, 4, 128, 128, // - 128, 0, 1, 2, 3, 4, 128, 128, // - 0, 1, 2, 3, 4, 5, 128, 128, // - 128, 128, 128, 128, 128, 128, 0, 128, // - 0, 128, 128, 128, 128, 128, 1, 128, // - 128, 0, 128, 128, 128, 128, 1, 128, // - 0, 1, 128, 128, 128, 128, 2, 128, // - 128, 128, 0, 128, 128, 128, 1, 128, // - 0, 
128, 1, 128, 128, 128, 2, 128, // - 128, 0, 1, 128, 128, 128, 2, 128, // - 0, 1, 2, 128, 128, 128, 3, 128, // - 128, 128, 128, 0, 128, 128, 1, 128, // - 0, 128, 128, 1, 128, 128, 2, 128, // - 128, 0, 128, 1, 128, 128, 2, 128, // - 0, 1, 128, 2, 128, 128, 3, 128, // - 128, 128, 0, 1, 128, 128, 2, 128, // - 0, 128, 1, 2, 128, 128, 3, 128, // - 128, 0, 1, 2, 128, 128, 3, 128, // - 0, 1, 2, 3, 128, 128, 4, 128, // - 128, 128, 128, 128, 0, 128, 1, 128, // - 0, 128, 128, 128, 1, 128, 2, 128, // - 128, 0, 128, 128, 1, 128, 2, 128, // - 0, 1, 128, 128, 2, 128, 3, 128, // - 128, 128, 0, 128, 1, 128, 2, 128, // - 0, 128, 1, 128, 2, 128, 3, 128, // - 128, 0, 1, 128, 2, 128, 3, 128, // - 0, 1, 2, 128, 3, 128, 4, 128, // - 128, 128, 128, 0, 1, 128, 2, 128, // - 0, 128, 128, 1, 2, 128, 3, 128, // - 128, 0, 128, 1, 2, 128, 3, 128, // - 0, 1, 128, 2, 3, 128, 4, 128, // - 128, 128, 0, 1, 2, 128, 3, 128, // - 0, 128, 1, 2, 3, 128, 4, 128, // - 128, 0, 1, 2, 3, 128, 4, 128, // - 0, 1, 2, 3, 4, 128, 5, 128, // - 128, 128, 128, 128, 128, 0, 1, 128, // - 0, 128, 128, 128, 128, 1, 2, 128, // - 128, 0, 128, 128, 128, 1, 2, 128, // - 0, 1, 128, 128, 128, 2, 3, 128, // - 128, 128, 0, 128, 128, 1, 2, 128, // - 0, 128, 1, 128, 128, 2, 3, 128, // - 128, 0, 1, 128, 128, 2, 3, 128, // - 0, 1, 2, 128, 128, 3, 4, 128, // - 128, 128, 128, 0, 128, 1, 2, 128, // - 0, 128, 128, 1, 128, 2, 3, 128, // - 128, 0, 128, 1, 128, 2, 3, 128, // - 0, 1, 128, 2, 128, 3, 4, 128, // - 128, 128, 0, 1, 128, 2, 3, 128, // - 0, 128, 1, 2, 128, 3, 4, 128, // - 128, 0, 1, 2, 128, 3, 4, 128, // - 0, 1, 2, 3, 128, 4, 5, 128, // - 128, 128, 128, 128, 0, 1, 2, 128, // - 0, 128, 128, 128, 1, 2, 3, 128, // - 128, 0, 128, 128, 1, 2, 3, 128, // - 0, 1, 128, 128, 2, 3, 4, 128, // - 128, 128, 0, 128, 1, 2, 3, 128, // - 0, 128, 1, 128, 2, 3, 4, 128, // - 128, 0, 1, 128, 2, 3, 4, 128, // - 0, 1, 2, 128, 3, 4, 5, 128, // - 128, 128, 128, 0, 1, 2, 3, 128, // - 0, 128, 128, 1, 2, 3, 4, 128, // - 128, 0, 128, 1, 2, 3, 4, 128, // - 0, 1, 128, 2, 3, 4, 5, 128, // - 128, 128, 0, 1, 2, 3, 4, 128, // - 0, 128, 1, 2, 3, 4, 5, 128, // - 128, 0, 1, 2, 3, 4, 5, 128, // - 0, 1, 2, 3, 4, 5, 6, 128, // - 128, 128, 128, 128, 128, 128, 128, 0, // - 0, 128, 128, 128, 128, 128, 128, 1, // - 128, 0, 128, 128, 128, 128, 128, 1, // - 0, 1, 128, 128, 128, 128, 128, 2, // - 128, 128, 0, 128, 128, 128, 128, 1, // - 0, 128, 1, 128, 128, 128, 128, 2, // - 128, 0, 1, 128, 128, 128, 128, 2, // - 0, 1, 2, 128, 128, 128, 128, 3, // - 128, 128, 128, 0, 128, 128, 128, 1, // - 0, 128, 128, 1, 128, 128, 128, 2, // - 128, 0, 128, 1, 128, 128, 128, 2, // - 0, 1, 128, 2, 128, 128, 128, 3, // - 128, 128, 0, 1, 128, 128, 128, 2, // - 0, 128, 1, 2, 128, 128, 128, 3, // - 128, 0, 1, 2, 128, 128, 128, 3, // - 0, 1, 2, 3, 128, 128, 128, 4, // - 128, 128, 128, 128, 0, 128, 128, 1, // - 0, 128, 128, 128, 1, 128, 128, 2, // - 128, 0, 128, 128, 1, 128, 128, 2, // - 0, 1, 128, 128, 2, 128, 128, 3, // - 128, 128, 0, 128, 1, 128, 128, 2, // - 0, 128, 1, 128, 2, 128, 128, 3, // - 128, 0, 1, 128, 2, 128, 128, 3, // - 0, 1, 2, 128, 3, 128, 128, 4, // - 128, 128, 128, 0, 1, 128, 128, 2, // - 0, 128, 128, 1, 2, 128, 128, 3, // - 128, 0, 128, 1, 2, 128, 128, 3, // - 0, 1, 128, 2, 3, 128, 128, 4, // - 128, 128, 0, 1, 2, 128, 128, 3, // - 0, 128, 1, 2, 3, 128, 128, 4, // - 128, 0, 1, 2, 3, 128, 128, 4, // - 0, 1, 2, 3, 4, 128, 128, 5, // - 128, 128, 128, 128, 128, 0, 128, 1, // - 0, 128, 128, 128, 128, 1, 128, 2, // - 128, 0, 128, 128, 128, 1, 128, 2, // - 0, 1, 128, 128, 128, 2, 128, 3, // - 128, 128, 0, 128, 128, 
1, 128, 2, // - 0, 128, 1, 128, 128, 2, 128, 3, // - 128, 0, 1, 128, 128, 2, 128, 3, // - 0, 1, 2, 128, 128, 3, 128, 4, // - 128, 128, 128, 0, 128, 1, 128, 2, // - 0, 128, 128, 1, 128, 2, 128, 3, // - 128, 0, 128, 1, 128, 2, 128, 3, // - 0, 1, 128, 2, 128, 3, 128, 4, // - 128, 128, 0, 1, 128, 2, 128, 3, // - 0, 128, 1, 2, 128, 3, 128, 4, // - 128, 0, 1, 2, 128, 3, 128, 4, // - 0, 1, 2, 3, 128, 4, 128, 5, // - 128, 128, 128, 128, 0, 1, 128, 2, // - 0, 128, 128, 128, 1, 2, 128, 3, // - 128, 0, 128, 128, 1, 2, 128, 3, // - 0, 1, 128, 128, 2, 3, 128, 4, // - 128, 128, 0, 128, 1, 2, 128, 3, // - 0, 128, 1, 128, 2, 3, 128, 4, // - 128, 0, 1, 128, 2, 3, 128, 4, // - 0, 1, 2, 128, 3, 4, 128, 5, // - 128, 128, 128, 0, 1, 2, 128, 3, // - 0, 128, 128, 1, 2, 3, 128, 4, // - 128, 0, 128, 1, 2, 3, 128, 4, // - 0, 1, 128, 2, 3, 4, 128, 5, // - 128, 128, 0, 1, 2, 3, 128, 4, // - 0, 128, 1, 2, 3, 4, 128, 5, // - 128, 0, 1, 2, 3, 4, 128, 5, // - 0, 1, 2, 3, 4, 5, 128, 6, // - 128, 128, 128, 128, 128, 128, 0, 1, // - 0, 128, 128, 128, 128, 128, 1, 2, // - 128, 0, 128, 128, 128, 128, 1, 2, // - 0, 1, 128, 128, 128, 128, 2, 3, // - 128, 128, 0, 128, 128, 128, 1, 2, // - 0, 128, 1, 128, 128, 128, 2, 3, // - 128, 0, 1, 128, 128, 128, 2, 3, // - 0, 1, 2, 128, 128, 128, 3, 4, // - 128, 128, 128, 0, 128, 128, 1, 2, // - 0, 128, 128, 1, 128, 128, 2, 3, // - 128, 0, 128, 1, 128, 128, 2, 3, // - 0, 1, 128, 2, 128, 128, 3, 4, // - 128, 128, 0, 1, 128, 128, 2, 3, // - 0, 128, 1, 2, 128, 128, 3, 4, // - 128, 0, 1, 2, 128, 128, 3, 4, // - 0, 1, 2, 3, 128, 128, 4, 5, // - 128, 128, 128, 128, 0, 128, 1, 2, // - 0, 128, 128, 128, 1, 128, 2, 3, // - 128, 0, 128, 128, 1, 128, 2, 3, // - 0, 1, 128, 128, 2, 128, 3, 4, // - 128, 128, 0, 128, 1, 128, 2, 3, // - 0, 128, 1, 128, 2, 128, 3, 4, // - 128, 0, 1, 128, 2, 128, 3, 4, // - 0, 1, 2, 128, 3, 128, 4, 5, // - 128, 128, 128, 0, 1, 128, 2, 3, // - 0, 128, 128, 1, 2, 128, 3, 4, // - 128, 0, 128, 1, 2, 128, 3, 4, // - 0, 1, 128, 2, 3, 128, 4, 5, // - 128, 128, 0, 1, 2, 128, 3, 4, // - 0, 128, 1, 2, 3, 128, 4, 5, // - 128, 0, 1, 2, 3, 128, 4, 5, // - 0, 1, 2, 3, 4, 128, 5, 6, // - 128, 128, 128, 128, 128, 0, 1, 2, // - 0, 128, 128, 128, 128, 1, 2, 3, // - 128, 0, 128, 128, 128, 1, 2, 3, // - 0, 1, 128, 128, 128, 2, 3, 4, // - 128, 128, 0, 128, 128, 1, 2, 3, // - 0, 128, 1, 128, 128, 2, 3, 4, // - 128, 0, 1, 128, 128, 2, 3, 4, // - 0, 1, 2, 128, 128, 3, 4, 5, // - 128, 128, 128, 0, 128, 1, 2, 3, // - 0, 128, 128, 1, 128, 2, 3, 4, // - 128, 0, 128, 1, 128, 2, 3, 4, // - 0, 1, 128, 2, 128, 3, 4, 5, // - 128, 128, 0, 1, 128, 2, 3, 4, // - 0, 128, 1, 2, 128, 3, 4, 5, // - 128, 0, 1, 2, 128, 3, 4, 5, // - 0, 1, 2, 3, 128, 4, 5, 6, // - 128, 128, 128, 128, 0, 1, 2, 3, // - 0, 128, 128, 128, 1, 2, 3, 4, // - 128, 0, 128, 128, 1, 2, 3, 4, // - 0, 1, 128, 128, 2, 3, 4, 5, // - 128, 128, 0, 128, 1, 2, 3, 4, // - 0, 128, 1, 128, 2, 3, 4, 5, // - 128, 0, 1, 128, 2, 3, 4, 5, // - 0, 1, 2, 128, 3, 4, 5, 6, // - 128, 128, 128, 0, 1, 2, 3, 4, // - 0, 128, 128, 1, 2, 3, 4, 5, // - 128, 0, 128, 1, 2, 3, 4, 5, // - 0, 1, 128, 2, 3, 4, 5, 6, // - 128, 128, 0, 1, 2, 3, 4, 5, // - 0, 128, 1, 2, 3, 4, 5, 6, // - 128, 0, 1, 2, 3, 4, 5, 6, // - 0, 1, 2, 3, 4, 5, 6, 7}; - return Load(du8, table + mask_bits * 8); -} - -template -HWY_INLINE svuint8_t LaneIndicesFromByteIndices(D, svuint8_t idx) { - return idx; -} -template , HWY_IF_NOT_T_SIZE_D(D, 1)> -HWY_INLINE VFromD LaneIndicesFromByteIndices(D, svuint8_t idx) { - return PromoteTo(DU(), idx); -} - -// General case when we don't know the vector size, 8 
elements at a time. -template -HWY_INLINE V ExpandLoop(V v, svbool_t mask) { - const DFromV d; - uint8_t mask_bytes[256 / 8]; - StoreMaskBits(d, mask, mask_bytes); - - // ShiftLeftLanes is expensive, so we're probably better off storing to memory - // and loading the final result. - alignas(16) TFromV out[2 * MaxLanes(d)]; - - svbool_t next = svpfalse_b(); - size_t input_consumed = 0; - const V iota = Iota(d, 0); - for (size_t i = 0; i < Lanes(d); i += 8) { - uint64_t mask_bits = mask_bytes[i / 8]; - - // We want to skip past the v lanes already consumed. There is no - // instruction for variable-shift-reg, but we can splice. - const V vH = detail::Splice(v, v, next); - input_consumed += PopCount(mask_bits); - next = detail::GeN(iota, static_cast>(input_consumed)); - - const auto idx = detail::LaneIndicesFromByteIndices( - d, detail::IndicesForExpandFromBits(mask_bits)); - const V expand = TableLookupLanes(vH, idx); - StoreU(expand, d, out + i); - } - return LoadU(d, out); -} - -} // namespace detail - -template -HWY_API V Expand(V v, svbool_t mask) { -#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE - const DFromV d; - uint8_t mask_bytes[256 / 8]; - StoreMaskBits(d, mask, mask_bytes); - const uint64_t maskL = mask_bytes[0]; - const uint64_t maskH = mask_bytes[1]; - - // We want to skip past the v bytes already consumed by expandL. There is no - // instruction for shift-reg by variable bytes, but we can splice. Instead of - // GeN, Not(FirstN()) would also work. - using T = TFromV; - const T countL = static_cast(PopCount(maskL)); - const V vH = detail::Splice(v, v, detail::GeN(Iota(d, 0), countL)); - - const svuint8_t idxL = detail::IndicesForExpandFromBits(maskL); - const svuint8_t idxH = detail::IndicesForExpandFromBits(maskH); - return Combine(d, TableLookupLanes(vH, idxH), TableLookupLanes(v, idxL)); -#else - return detail::ExpandLoop(v, mask); -#endif -} - -template -HWY_API V Expand(V v, svbool_t mask) { -#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE // 16x8 - const DFromV d; - const RebindToUnsigned du16; - const Rebind du8; - // Convert mask into bitfield via horizontal sum (faster than ORV) of 8 bits. - // Pre-multiply by N so we can use it as an offset for Load. - const svuint16_t bits = Shl(Set(du16, 1), Iota(du16, 3)); - const size_t offset = detail::SumOfLanesM(mask, bits); - - // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply - // the nibble trick used below because not all indices fit within one lane. 
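A scalar reference for Expand clarifies what the large index tables encode. Illustrative sketch only, not part of the deleted file (array size fixed at 8 lanes for brevity):

#include <array>
#include <cstddef>

// Lane i of the result receives the next unconsumed input lane if mask[i] is
// set, otherwise zero. The tables encode exactly this mapping as source
// indices, with 128 (out of range for svtbl) marking the zeroed lanes.
template <typename T>
std::array<T, 8> ExpandScalar(const std::array<T, 8>& v, const std::array<bool, 8>& mask) {
  std::array<T, 8> out{};
  size_t next_input = 0;
  for (size_t i = 0; i < 8; ++i) {
    out[i] = mask[i] ? v[next_input++] : T{0};
  }
  return out;
}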
- alignas(16) static constexpr uint8_t table[8 * 256] = { - // PrintExpand16x8LaneTables - 255, 255, 255, 255, 255, 255, 255, 255, // - 0, 255, 255, 255, 255, 255, 255, 255, // - 255, 0, 255, 255, 255, 255, 255, 255, // - 0, 1, 255, 255, 255, 255, 255, 255, // - 255, 255, 0, 255, 255, 255, 255, 255, // - 0, 255, 1, 255, 255, 255, 255, 255, // - 255, 0, 1, 255, 255, 255, 255, 255, // - 0, 1, 2, 255, 255, 255, 255, 255, // - 255, 255, 255, 0, 255, 255, 255, 255, // - 0, 255, 255, 1, 255, 255, 255, 255, // - 255, 0, 255, 1, 255, 255, 255, 255, // - 0, 1, 255, 2, 255, 255, 255, 255, // - 255, 255, 0, 1, 255, 255, 255, 255, // - 0, 255, 1, 2, 255, 255, 255, 255, // - 255, 0, 1, 2, 255, 255, 255, 255, // - 0, 1, 2, 3, 255, 255, 255, 255, // - 255, 255, 255, 255, 0, 255, 255, 255, // - 0, 255, 255, 255, 1, 255, 255, 255, // - 255, 0, 255, 255, 1, 255, 255, 255, // - 0, 1, 255, 255, 2, 255, 255, 255, // - 255, 255, 0, 255, 1, 255, 255, 255, // - 0, 255, 1, 255, 2, 255, 255, 255, // - 255, 0, 1, 255, 2, 255, 255, 255, // - 0, 1, 2, 255, 3, 255, 255, 255, // - 255, 255, 255, 0, 1, 255, 255, 255, // - 0, 255, 255, 1, 2, 255, 255, 255, // - 255, 0, 255, 1, 2, 255, 255, 255, // - 0, 1, 255, 2, 3, 255, 255, 255, // - 255, 255, 0, 1, 2, 255, 255, 255, // - 0, 255, 1, 2, 3, 255, 255, 255, // - 255, 0, 1, 2, 3, 255, 255, 255, // - 0, 1, 2, 3, 4, 255, 255, 255, // - 255, 255, 255, 255, 255, 0, 255, 255, // - 0, 255, 255, 255, 255, 1, 255, 255, // - 255, 0, 255, 255, 255, 1, 255, 255, // - 0, 1, 255, 255, 255, 2, 255, 255, // - 255, 255, 0, 255, 255, 1, 255, 255, // - 0, 255, 1, 255, 255, 2, 255, 255, // - 255, 0, 1, 255, 255, 2, 255, 255, // - 0, 1, 2, 255, 255, 3, 255, 255, // - 255, 255, 255, 0, 255, 1, 255, 255, // - 0, 255, 255, 1, 255, 2, 255, 255, // - 255, 0, 255, 1, 255, 2, 255, 255, // - 0, 1, 255, 2, 255, 3, 255, 255, // - 255, 255, 0, 1, 255, 2, 255, 255, // - 0, 255, 1, 2, 255, 3, 255, 255, // - 255, 0, 1, 2, 255, 3, 255, 255, // - 0, 1, 2, 3, 255, 4, 255, 255, // - 255, 255, 255, 255, 0, 1, 255, 255, // - 0, 255, 255, 255, 1, 2, 255, 255, // - 255, 0, 255, 255, 1, 2, 255, 255, // - 0, 1, 255, 255, 2, 3, 255, 255, // - 255, 255, 0, 255, 1, 2, 255, 255, // - 0, 255, 1, 255, 2, 3, 255, 255, // - 255, 0, 1, 255, 2, 3, 255, 255, // - 0, 1, 2, 255, 3, 4, 255, 255, // - 255, 255, 255, 0, 1, 2, 255, 255, // - 0, 255, 255, 1, 2, 3, 255, 255, // - 255, 0, 255, 1, 2, 3, 255, 255, // - 0, 1, 255, 2, 3, 4, 255, 255, // - 255, 255, 0, 1, 2, 3, 255, 255, // - 0, 255, 1, 2, 3, 4, 255, 255, // - 255, 0, 1, 2, 3, 4, 255, 255, // - 0, 1, 2, 3, 4, 5, 255, 255, // - 255, 255, 255, 255, 255, 255, 0, 255, // - 0, 255, 255, 255, 255, 255, 1, 255, // - 255, 0, 255, 255, 255, 255, 1, 255, // - 0, 1, 255, 255, 255, 255, 2, 255, // - 255, 255, 0, 255, 255, 255, 1, 255, // - 0, 255, 1, 255, 255, 255, 2, 255, // - 255, 0, 1, 255, 255, 255, 2, 255, // - 0, 1, 2, 255, 255, 255, 3, 255, // - 255, 255, 255, 0, 255, 255, 1, 255, // - 0, 255, 255, 1, 255, 255, 2, 255, // - 255, 0, 255, 1, 255, 255, 2, 255, // - 0, 1, 255, 2, 255, 255, 3, 255, // - 255, 255, 0, 1, 255, 255, 2, 255, // - 0, 255, 1, 2, 255, 255, 3, 255, // - 255, 0, 1, 2, 255, 255, 3, 255, // - 0, 1, 2, 3, 255, 255, 4, 255, // - 255, 255, 255, 255, 0, 255, 1, 255, // - 0, 255, 255, 255, 1, 255, 2, 255, // - 255, 0, 255, 255, 1, 255, 2, 255, // - 0, 1, 255, 255, 2, 255, 3, 255, // - 255, 255, 0, 255, 1, 255, 2, 255, // - 0, 255, 1, 255, 2, 255, 3, 255, // - 255, 0, 1, 255, 2, 255, 3, 255, // - 0, 1, 2, 255, 3, 255, 4, 255, // - 255, 255, 255, 0, 1, 255, 2, 255, // - 
0, 255, 255, 1, 2, 255, 3, 255, // - 255, 0, 255, 1, 2, 255, 3, 255, // - 0, 1, 255, 2, 3, 255, 4, 255, // - 255, 255, 0, 1, 2, 255, 3, 255, // - 0, 255, 1, 2, 3, 255, 4, 255, // - 255, 0, 1, 2, 3, 255, 4, 255, // - 0, 1, 2, 3, 4, 255, 5, 255, // - 255, 255, 255, 255, 255, 0, 1, 255, // - 0, 255, 255, 255, 255, 1, 2, 255, // - 255, 0, 255, 255, 255, 1, 2, 255, // - 0, 1, 255, 255, 255, 2, 3, 255, // - 255, 255, 0, 255, 255, 1, 2, 255, // - 0, 255, 1, 255, 255, 2, 3, 255, // - 255, 0, 1, 255, 255, 2, 3, 255, // - 0, 1, 2, 255, 255, 3, 4, 255, // - 255, 255, 255, 0, 255, 1, 2, 255, // - 0, 255, 255, 1, 255, 2, 3, 255, // - 255, 0, 255, 1, 255, 2, 3, 255, // - 0, 1, 255, 2, 255, 3, 4, 255, // - 255, 255, 0, 1, 255, 2, 3, 255, // - 0, 255, 1, 2, 255, 3, 4, 255, // - 255, 0, 1, 2, 255, 3, 4, 255, // - 0, 1, 2, 3, 255, 4, 5, 255, // - 255, 255, 255, 255, 0, 1, 2, 255, // - 0, 255, 255, 255, 1, 2, 3, 255, // - 255, 0, 255, 255, 1, 2, 3, 255, // - 0, 1, 255, 255, 2, 3, 4, 255, // - 255, 255, 0, 255, 1, 2, 3, 255, // - 0, 255, 1, 255, 2, 3, 4, 255, // - 255, 0, 1, 255, 2, 3, 4, 255, // - 0, 1, 2, 255, 3, 4, 5, 255, // - 255, 255, 255, 0, 1, 2, 3, 255, // - 0, 255, 255, 1, 2, 3, 4, 255, // - 255, 0, 255, 1, 2, 3, 4, 255, // - 0, 1, 255, 2, 3, 4, 5, 255, // - 255, 255, 0, 1, 2, 3, 4, 255, // - 0, 255, 1, 2, 3, 4, 5, 255, // - 255, 0, 1, 2, 3, 4, 5, 255, // - 0, 1, 2, 3, 4, 5, 6, 255, // - 255, 255, 255, 255, 255, 255, 255, 0, // - 0, 255, 255, 255, 255, 255, 255, 1, // - 255, 0, 255, 255, 255, 255, 255, 1, // - 0, 1, 255, 255, 255, 255, 255, 2, // - 255, 255, 0, 255, 255, 255, 255, 1, // - 0, 255, 1, 255, 255, 255, 255, 2, // - 255, 0, 1, 255, 255, 255, 255, 2, // - 0, 1, 2, 255, 255, 255, 255, 3, // - 255, 255, 255, 0, 255, 255, 255, 1, // - 0, 255, 255, 1, 255, 255, 255, 2, // - 255, 0, 255, 1, 255, 255, 255, 2, // - 0, 1, 255, 2, 255, 255, 255, 3, // - 255, 255, 0, 1, 255, 255, 255, 2, // - 0, 255, 1, 2, 255, 255, 255, 3, // - 255, 0, 1, 2, 255, 255, 255, 3, // - 0, 1, 2, 3, 255, 255, 255, 4, // - 255, 255, 255, 255, 0, 255, 255, 1, // - 0, 255, 255, 255, 1, 255, 255, 2, // - 255, 0, 255, 255, 1, 255, 255, 2, // - 0, 1, 255, 255, 2, 255, 255, 3, // - 255, 255, 0, 255, 1, 255, 255, 2, // - 0, 255, 1, 255, 2, 255, 255, 3, // - 255, 0, 1, 255, 2, 255, 255, 3, // - 0, 1, 2, 255, 3, 255, 255, 4, // - 255, 255, 255, 0, 1, 255, 255, 2, // - 0, 255, 255, 1, 2, 255, 255, 3, // - 255, 0, 255, 1, 2, 255, 255, 3, // - 0, 1, 255, 2, 3, 255, 255, 4, // - 255, 255, 0, 1, 2, 255, 255, 3, // - 0, 255, 1, 2, 3, 255, 255, 4, // - 255, 0, 1, 2, 3, 255, 255, 4, // - 0, 1, 2, 3, 4, 255, 255, 5, // - 255, 255, 255, 255, 255, 0, 255, 1, // - 0, 255, 255, 255, 255, 1, 255, 2, // - 255, 0, 255, 255, 255, 1, 255, 2, // - 0, 1, 255, 255, 255, 2, 255, 3, // - 255, 255, 0, 255, 255, 1, 255, 2, // - 0, 255, 1, 255, 255, 2, 255, 3, // - 255, 0, 1, 255, 255, 2, 255, 3, // - 0, 1, 2, 255, 255, 3, 255, 4, // - 255, 255, 255, 0, 255, 1, 255, 2, // - 0, 255, 255, 1, 255, 2, 255, 3, // - 255, 0, 255, 1, 255, 2, 255, 3, // - 0, 1, 255, 2, 255, 3, 255, 4, // - 255, 255, 0, 1, 255, 2, 255, 3, // - 0, 255, 1, 2, 255, 3, 255, 4, // - 255, 0, 1, 2, 255, 3, 255, 4, // - 0, 1, 2, 3, 255, 4, 255, 5, // - 255, 255, 255, 255, 0, 1, 255, 2, // - 0, 255, 255, 255, 1, 2, 255, 3, // - 255, 0, 255, 255, 1, 2, 255, 3, // - 0, 1, 255, 255, 2, 3, 255, 4, // - 255, 255, 0, 255, 1, 2, 255, 3, // - 0, 255, 1, 255, 2, 3, 255, 4, // - 255, 0, 1, 255, 2, 3, 255, 4, // - 0, 1, 2, 255, 3, 4, 255, 5, // - 255, 255, 255, 0, 1, 2, 255, 3, // - 0, 255, 255, 1, 2, 
3, 255, 4, // - 255, 0, 255, 1, 2, 3, 255, 4, // - 0, 1, 255, 2, 3, 4, 255, 5, // - 255, 255, 0, 1, 2, 3, 255, 4, // - 0, 255, 1, 2, 3, 4, 255, 5, // - 255, 0, 1, 2, 3, 4, 255, 5, // - 0, 1, 2, 3, 4, 5, 255, 6, // - 255, 255, 255, 255, 255, 255, 0, 1, // - 0, 255, 255, 255, 255, 255, 1, 2, // - 255, 0, 255, 255, 255, 255, 1, 2, // - 0, 1, 255, 255, 255, 255, 2, 3, // - 255, 255, 0, 255, 255, 255, 1, 2, // - 0, 255, 1, 255, 255, 255, 2, 3, // - 255, 0, 1, 255, 255, 255, 2, 3, // - 0, 1, 2, 255, 255, 255, 3, 4, // - 255, 255, 255, 0, 255, 255, 1, 2, // - 0, 255, 255, 1, 255, 255, 2, 3, // - 255, 0, 255, 1, 255, 255, 2, 3, // - 0, 1, 255, 2, 255, 255, 3, 4, // - 255, 255, 0, 1, 255, 255, 2, 3, // - 0, 255, 1, 2, 255, 255, 3, 4, // - 255, 0, 1, 2, 255, 255, 3, 4, // - 0, 1, 2, 3, 255, 255, 4, 5, // - 255, 255, 255, 255, 0, 255, 1, 2, // - 0, 255, 255, 255, 1, 255, 2, 3, // - 255, 0, 255, 255, 1, 255, 2, 3, // - 0, 1, 255, 255, 2, 255, 3, 4, // - 255, 255, 0, 255, 1, 255, 2, 3, // - 0, 255, 1, 255, 2, 255, 3, 4, // - 255, 0, 1, 255, 2, 255, 3, 4, // - 0, 1, 2, 255, 3, 255, 4, 5, // - 255, 255, 255, 0, 1, 255, 2, 3, // - 0, 255, 255, 1, 2, 255, 3, 4, // - 255, 0, 255, 1, 2, 255, 3, 4, // - 0, 1, 255, 2, 3, 255, 4, 5, // - 255, 255, 0, 1, 2, 255, 3, 4, // - 0, 255, 1, 2, 3, 255, 4, 5, // - 255, 0, 1, 2, 3, 255, 4, 5, // - 0, 1, 2, 3, 4, 255, 5, 6, // - 255, 255, 255, 255, 255, 0, 1, 2, // - 0, 255, 255, 255, 255, 1, 2, 3, // - 255, 0, 255, 255, 255, 1, 2, 3, // - 0, 1, 255, 255, 255, 2, 3, 4, // - 255, 255, 0, 255, 255, 1, 2, 3, // - 0, 255, 1, 255, 255, 2, 3, 4, // - 255, 0, 1, 255, 255, 2, 3, 4, // - 0, 1, 2, 255, 255, 3, 4, 5, // - 255, 255, 255, 0, 255, 1, 2, 3, // - 0, 255, 255, 1, 255, 2, 3, 4, // - 255, 0, 255, 1, 255, 2, 3, 4, // - 0, 1, 255, 2, 255, 3, 4, 5, // - 255, 255, 0, 1, 255, 2, 3, 4, // - 0, 255, 1, 2, 255, 3, 4, 5, // - 255, 0, 1, 2, 255, 3, 4, 5, // - 0, 1, 2, 3, 255, 4, 5, 6, // - 255, 255, 255, 255, 0, 1, 2, 3, // - 0, 255, 255, 255, 1, 2, 3, 4, // - 255, 0, 255, 255, 1, 2, 3, 4, // - 0, 1, 255, 255, 2, 3, 4, 5, // - 255, 255, 0, 255, 1, 2, 3, 4, // - 0, 255, 1, 255, 2, 3, 4, 5, // - 255, 0, 1, 255, 2, 3, 4, 5, // - 0, 1, 2, 255, 3, 4, 5, 6, // - 255, 255, 255, 0, 1, 2, 3, 4, // - 0, 255, 255, 1, 2, 3, 4, 5, // - 255, 0, 255, 1, 2, 3, 4, 5, // - 0, 1, 255, 2, 3, 4, 5, 6, // - 255, 255, 0, 1, 2, 3, 4, 5, // - 0, 255, 1, 2, 3, 4, 5, 6, // - 255, 0, 1, 2, 3, 4, 5, 6, // - 0, 1, 2, 3, 4, 5, 6, 7}; - const svuint16_t indices = PromoteTo(du16, Load(du8, table + offset)); - return TableLookupLanes(v, indices); // already zeros mask=false lanes -#else - return detail::ExpandLoop(v, mask); -#endif -} - -template -HWY_API V Expand(V v, svbool_t mask) { -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE // 32x8 - const DFromV d; - const RebindToUnsigned du32; - // Convert mask into bitfield via horizontal sum (faster than ORV). - const svuint32_t bits = Shl(Set(du32, 1), Iota(du32, 0)); - const size_t code = detail::SumOfLanesM(mask, bits); - - alignas(16) constexpr uint32_t packed_array[256] = { - // PrintExpand32x8. 
- 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0, - 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10, - 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0, - 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210, - 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0, - 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10, - 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0, - 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210, - 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0, - 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10, - 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0, - 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210, - 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0, - 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10, - 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0, - 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210, - 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0, - 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10, - 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0, - 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210, - 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0, - 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10, - 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0, - 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210, - 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0, - 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10, - 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0, - 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210, - 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0, - 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10, - 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0, - 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210, - 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0, - 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10, - 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0, - 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210, - 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0, - 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10, - 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0, - 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210, - 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0, - 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10, - 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210}; - - // For lane i, shift the i-th 4-bit index down and mask with 0xF because - // svtbl zeros outputs if the index is out of bounds. 
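Each packed_array entry is simply eight 4-bit source indices, which a scalar decoder makes plain. Illustrative sketch, not part of the deleted file:

#include <array>
#include <cstdint>

// Nibble i of a packed entry holds the source lane for output lane i, or 0xF
// to request zero (svtbl returns 0 for out-of-range indices).
inline std::array<uint8_t, 8> DecodePackedIndices(uint32_t packed) {
  std::array<uint8_t, 8> idx{};
  for (uint32_t i = 0; i < 8; ++i) {
    idx[i] = static_cast<uint8_t>((packed >> (4 * i)) & 0xF);
  }
  return idx;
}
// Example: 0xffff1f0f (mask bits 0b1010) routes input lanes 0 and 1 to output
// lanes 1 and 3 and zeroes the remaining lanes.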
- const svuint32_t packed = Set(du32, packed_array[code]); - const svuint32_t indices = detail::AndN(Shr(packed, svindex_u32(0, 4)), 0xF); - return TableLookupLanes(v, indices); // already zeros mask=false lanes -#elif HWY_TARGET == HWY_SVE2_128 // 32x4 - const DFromV d; - const RebindToUnsigned du32; - // Convert mask into bitfield via horizontal sum (faster than ORV). - const svuint32_t bits = Shl(Set(du32, 1), Iota(du32, 0)); - const size_t offset = detail::SumOfLanesM(mask, bits); - - alignas(16) constexpr uint32_t packed_array[16] = { - // PrintExpand64x4Nibble - same for 32x4. - 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, - 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, - 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; - - // For lane i, shift the i-th 4-bit index down and mask with 0xF because - // svtbl zeros outputs if the index is out of bounds. - const svuint32_t packed = Set(du32, packed_array[offset]); - const svuint32_t indices = detail::AndN(Shr(packed, svindex_u32(0, 4)), 0xF); - return TableLookupLanes(v, indices); // already zeros mask=false lanes -#else - return detail::ExpandLoop(v, mask); -#endif -} - -template -HWY_API V Expand(V v, svbool_t mask) { -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE // 64x4 - const DFromV d; - const RebindToUnsigned du64; - - // Convert mask into bitfield via horizontal sum (faster than ORV) of masked - // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for - // SetTableIndices. - const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2)); - const size_t offset = detail::SumOfLanesM(mask, bits); - - alignas(16) static constexpr uint64_t table[4 * 16] = { - // PrintExpand64x4Tables - small enough to store uncompressed. - 255, 255, 255, 255, 0, 255, 255, 255, 255, 0, 255, 255, 0, 1, 255, 255, - 255, 255, 0, 255, 0, 255, 1, 255, 255, 0, 1, 255, 0, 1, 2, 255, - 255, 255, 255, 0, 0, 255, 255, 1, 255, 0, 255, 1, 0, 1, 255, 2, - 255, 255, 0, 1, 0, 255, 1, 2, 255, 0, 1, 2, 0, 1, 2, 3}; - // This already zeros mask=false lanes. - return TableLookupLanes(v, SetTableIndices(d, table + offset)); -#elif HWY_TARGET == HWY_SVE2_128 // 64x2 - // Same as Compress, just zero out the mask=false lanes. 
- return IfThenElseZero(mask, Compress(v, mask)); -#else - return detail::ExpandLoop(v, mask); -#endif -} - -// ------------------------------ LoadExpand - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { - return Expand(LoadU(d, unaligned), mask); -} - -// ------------------------------ MulEven (InterleaveEven) - -#if HWY_SVE_HAVE_2 -namespace detail { -#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) \ - NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \ - return sv##OP##_##CHAR##BITS(a, b); \ - } - -HWY_SVE_FOREACH_UI16(HWY_SVE_MUL_EVEN, MulEvenNative, mullb) -HWY_SVE_FOREACH_UI32(HWY_SVE_MUL_EVEN, MulEvenNative, mullb) -HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb) -HWY_SVE_FOREACH_UI16(HWY_SVE_MUL_EVEN, MulOddNative, mullt) -HWY_SVE_FOREACH_UI32(HWY_SVE_MUL_EVEN, MulOddNative, mullt) -HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulOddNative, mullt) -#undef HWY_SVE_MUL_EVEN -} // namespace detail -#endif - -template >, - HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))> -HWY_API VFromD MulEven(const V a, const V b) { -#if HWY_SVE_HAVE_2 - return BitCast(DW(), detail::MulEvenNative(a, b)); -#else - const auto lo = Mul(a, b); - const auto hi = MulHigh(a, b); - return BitCast(DW(), detail::InterleaveEven(lo, hi)); -#endif -} - -template >, - HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))> -HWY_API VFromD MulOdd(const V a, const V b) { -#if HWY_SVE_HAVE_2 - return BitCast(DW(), detail::MulOddNative(a, b)); -#else - const auto lo = Mul(a, b); - const auto hi = MulHigh(a, b); - return BitCast(DW(), detail::InterleaveOdd(lo, hi)); -#endif -} - -HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) { - const auto lo = Mul(a, b); - const auto hi = MulHigh(a, b); - return detail::InterleaveEven(lo, hi); -} - -HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) { - const auto lo = Mul(a, b); - const auto hi = MulHigh(a, b); - return detail::InterleaveOdd(lo, hi); -} - -// ------------------------------ WidenMulPairwiseAdd - -template -HWY_API svfloat32_t WidenMulPairwiseAdd(Simd df32, VBF16 a, - VBF16 b) { -#if HWY_SVE_HAVE_BFLOAT16 - const svfloat32_t even = svbfmlalb_f32(Zero(df32), a, b); - return svbfmlalt_f32(even, a, b); -#else - const RebindToUnsigned du32; - // Using shift/and instead of Zip leads to the odd/even order that - // RearrangeToOddPlusEven prefers. - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), - Mul(BitCast(df32, ao), BitCast(df32, bo))); -#endif // HWY_SVE_HAVE_BFLOAT16 -} - -template -HWY_API svint32_t WidenMulPairwiseAdd(Simd d32, svint16_t a, - svint16_t b) { -#if HWY_SVE_HAVE_2 - (void)d32; - return svmlalt_s32(svmullb_s32(a, b), a, b); -#else - const svbool_t pg = detail::PTrue(d32); - // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers. - // Fortunately SVE has sign-extension for the even lanes. 
- const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a)); - const svint32_t be = svexth_s32_x(pg, BitCast(d32, b)); - const svint32_t ao = ShiftRight<16>(BitCast(d32, a)); - const svint32_t bo = ShiftRight<16>(BitCast(d32, b)); - return svmla_s32_x(pg, svmul_s32_x(pg, ao, bo), ae, be); -#endif -} - -template -HWY_API svuint32_t WidenMulPairwiseAdd(Simd d32, - svuint16_t a, svuint16_t b) { -#if HWY_SVE_HAVE_2 - (void)d32; - return svmlalt_u32(svmullb_u32(a, b), a, b); -#else - const svbool_t pg = detail::PTrue(d32); - // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers. - // Fortunately SVE has sign-extension for the even lanes. - const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a)); - const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b)); - const svuint32_t ao = ShiftRight<16>(BitCast(d32, a)); - const svuint32_t bo = ShiftRight<16>(BitCast(d32, b)); - return svmla_u32_x(pg, svmul_u32_x(pg, ao, bo), ae, be); -#endif -} - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) - -template -HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd df32, - VBF16 a, VBF16 b, - const svfloat32_t sum0, - svfloat32_t& sum1) { -#if HWY_SVE_HAVE_BFLOAT16 - (void)df32; - sum1 = svbfmlalt_f32(sum1, a, b); - return svbfmlalb_f32(sum0, a, b); -#else - const RebindToUnsigned du32; - // Using shift/and instead of Zip leads to the odd/even order that - // RearrangeToOddPlusEven prefers. - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); -#endif // HWY_SVE_HAVE_BFLOAT16 -} - -template -HWY_API svint32_t ReorderWidenMulAccumulate(Simd d32, - svint16_t a, svint16_t b, - const svint32_t sum0, - svint32_t& sum1) { -#if HWY_SVE_HAVE_2 - (void)d32; - sum1 = svmlalt_s32(sum1, a, b); - return svmlalb_s32(sum0, a, b); -#else - const svbool_t pg = detail::PTrue(d32); - // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers. - // Fortunately SVE has sign-extension for the even lanes. - const svint32_t ae = svexth_s32_x(pg, BitCast(d32, a)); - const svint32_t be = svexth_s32_x(pg, BitCast(d32, b)); - const svint32_t ao = ShiftRight<16>(BitCast(d32, a)); - const svint32_t bo = ShiftRight<16>(BitCast(d32, b)); - sum1 = svmla_s32_x(pg, sum1, ao, bo); - return svmla_s32_x(pg, sum0, ae, be); -#endif -} - -template -HWY_API svuint32_t ReorderWidenMulAccumulate(Simd d32, - svuint16_t a, svuint16_t b, - const svuint32_t sum0, - svuint32_t& sum1) { -#if HWY_SVE_HAVE_2 - (void)d32; - sum1 = svmlalt_u32(sum1, a, b); - return svmlalb_u32(sum0, a, b); -#else - const svbool_t pg = detail::PTrue(d32); - // Shifting extracts the odd lanes as RearrangeToOddPlusEven prefers. - // Fortunately SVE has sign-extension for the even lanes. - const svuint32_t ae = svexth_u32_x(pg, BitCast(d32, a)); - const svuint32_t be = svexth_u32_x(pg, BitCast(d32, b)); - const svuint32_t ao = ShiftRight<16>(BitCast(d32, a)); - const svuint32_t bo = ShiftRight<16>(BitCast(d32, b)); - sum1 = svmla_u32_x(pg, sum1, ao, bo); - return svmla_u32_x(pg, sum0, ae, be); -#endif -} - -// ------------------------------ RearrangeToOddPlusEven -template -HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { - // sum0 is the sum of bottom/even lanes and sum1 of top/odd lanes. 
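The even/odd accumulator split, and the trivial RearrangeToOddPlusEven that follows, can be modelled per pair of 16-bit lanes. Illustrative sketch, not part of the deleted file:

#include <cstdint>

struct WidenedSums {
  int32_t sum0 = 0;  // even products (the svmlalb / svbfmlalb half)
  int32_t sum1 = 0;  // odd products (the svmlalt / svbfmlalt half)
};

// Each pair of 16-bit lanes contributes one widened product to each accumulator;
// lane order inside the accumulators is irrelevant because the caller only sums them.
inline void AccumulatePair(int16_t a_even, int16_t a_odd, int16_t b_even,
                           int16_t b_odd, WidenedSums& s) {
  s.sum0 += int32_t{a_even} * b_even;
  s.sum1 += int32_t{a_odd} * b_odd;
}

// RearrangeToOddPlusEven: a plain add is all that is needed.
inline int32_t OddPlusEven(const WidenedSums& s) { return s.sum0 + s.sum1; }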
- return Add(sum0, sum1); -} - -// ------------------------------ SumOfMulQuadAccumulate - -#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate(DI32 /*di32*/, svint8_t a, - svint8_t b, svint32_t sum) { - return svdot_s32(sum, a, b); -} - -#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate(DU32 /*du32*/, svuint8_t a, - svuint8_t b, svuint32_t sum) { - return svdot_u32(sum, a, b); -} - -#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, svuint8_t a_u, - svint8_t b_i, svint32_t sum) { - // TODO: use svusdot_u32 on SVE targets that require support for both SVE2 - // and SVE I8MM. - - const RebindToUnsigned du32; - const Repartition du8; - - const auto b_u = BitCast(du8, b_i); - const auto result_sum0 = svdot_u32(BitCast(du32, sum), a_u, b_u); - const auto result_sum1 = - ShiftLeft<8>(svdot_u32(Zero(du32), a_u, ShiftRight<7>(b_u))); - - return BitCast(di32, Sub(result_sum0, result_sum1)); -} - -#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate(DI64 /*di64*/, svint16_t a, - svint16_t b, svint64_t sum) { - return svdot_s64(sum, a, b); -} - -#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate(DU64 /*du64*/, svuint16_t a, - svuint16_t b, svuint64_t sum) { - return svdot_u64(sum, a, b); -} - -// ------------------------------ AESRound / CLMul - -#if defined(__ARM_FEATURE_SVE2_AES) || \ - (HWY_SVE_HAVE_2 && HWY_HAVE_RUNTIME_DISPATCH) - -// Per-target flag to prevent generic_ops-inl.h from defining AESRound. -#ifdef HWY_NATIVE_AES -#undef HWY_NATIVE_AES -#else -#define HWY_NATIVE_AES -#endif - -HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) { - // It is not clear whether E and MC fuse like they did on NEON. 
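The unsigned/signed dot-product identity used in the u8 x i8 SumOfMulQuadAccumulate above is easy to verify in scalar code. Illustrative sketch, not part of the deleted file:

#include <cstdint>

// A signed byte equals its unsigned reinterpretation minus 256 when the sign
// bit is set, so a * s8(b) = a * u8(b) - 256 * a * (u8(b) >> 7). Summed over a
// quad of lanes this is result_sum0 - (result_sum1 << 8), with both partial
// dot products computed on unsigned inputs.
inline int32_t MulU8I8(uint8_t a, int8_t b) {
  const uint8_t b_u = static_cast<uint8_t>(b);
  const int32_t sum0 = int32_t{a} * b_u;
  const int32_t sum1 = int32_t{a} * (b_u >> 7);
  return sum0 - (sum1 << 8);  // equals int32_t{a} * int32_t{b}
}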
- return Xor(svaesmc_u8(svaese_u8(state, svdup_n_u8(0))), round_key); -} - -HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) { - return Xor(svaese_u8(state, svdup_n_u8(0)), round_key); -} - -HWY_API svuint8_t AESInvMixColumns(svuint8_t state) { - return svaesimc_u8(state); -} - -HWY_API svuint8_t AESRoundInv(svuint8_t state, svuint8_t round_key) { - return Xor(svaesimc_u8(svaesd_u8(state, svdup_n_u8(0))), round_key); -} - -HWY_API svuint8_t AESLastRoundInv(svuint8_t state, svuint8_t round_key) { - return Xor(svaesd_u8(state, svdup_n_u8(0)), round_key); -} - -template -HWY_API svuint8_t AESKeyGenAssist(svuint8_t v) { - alignas(16) static constexpr uint8_t kRconXorMask[16] = { - 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; - alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { - 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; - const DFromV d; - const Repartition du32; - const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); - const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask)); - return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle)); -} - -HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) { - return svpmullb_pair(a, b); -} - -HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) { - return svpmullt_pair(a, b); -} - -#endif // __ARM_FEATURE_SVE2_AES - -// ------------------------------ Lt128 - -namespace detail { -#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP) \ - template \ - HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \ - return sv##OP##_b##BITS(m, m); \ - } - -HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1) // actually for bool -HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool -#undef HWY_SVE_DUP - -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE -template -HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const svbool_t eqHx = Eq(a, b); // only odd lanes used - // Convert to vector: more pipelines can execute vector TRN* instructions - // than the predicate version. - const svuint64_t ltHL = VecFromMask(d, Lt(a, b)); - // Move into upper lane: ltL if the upper half is equal, otherwise ltH. - // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated. - const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL); - // Duplicate upper lane into lower. - return DupOdd(ltHx); -} -#endif -} // namespace detail - -template -HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) { -#if HWY_TARGET == HWY_SVE_256 - return MaskFromVec(detail::Lt128Vec(d, a, b)); -#else - static_assert(IsSame, uint64_t>(), "D must be u64"); - const svbool_t eqHx = Eq(a, b); // only odd lanes used - const svbool_t ltHL = Lt(a, b); - // Move into upper lane: ltL if the upper half is equal, otherwise ltH. - const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL); - // Duplicate upper lane into lower. 
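// (The verdict for the whole 128-bit pair now lives in the odd lane;
// duplicating it into the even lane below yields a mask covering both lanes
// of the key, which is what lets Min128/Max128 blend entire keys.)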
- return detail::DupOddB(d, ltHx); -#endif // HWY_TARGET != HWY_SVE_256 -} - -// ------------------------------ Lt128Upper - -template -HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const svbool_t ltHL = Lt(a, b); - return detail::DupOddB(d, ltHL); -} - -// ------------------------------ Eq128, Ne128 - -#if HWY_TARGET == HWY_SVE_256 || HWY_IDE -namespace detail { - -template -HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - // Convert to vector: more pipelines can execute vector TRN* instructions - // than the predicate version. - const svuint64_t eqHL = VecFromMask(d, Eq(a, b)); - // Duplicate upper and lower. - const svuint64_t eqHH = DupOdd(eqHL); - const svuint64_t eqLL = DupEven(eqHL); - return And(eqLL, eqHH); -} - -template -HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - // Convert to vector: more pipelines can execute vector TRN* instructions - // than the predicate version. - const svuint64_t neHL = VecFromMask(d, Ne(a, b)); - // Duplicate upper and lower. - const svuint64_t neHH = DupOdd(neHL); - const svuint64_t neLL = DupEven(neHL); - return Or(neLL, neHH); -} - -} // namespace detail -#endif - -template -HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) { -#if HWY_TARGET == HWY_SVE_256 - return MaskFromVec(detail::Eq128Vec(d, a, b)); -#else - static_assert(IsSame, uint64_t>(), "D must be u64"); - const svbool_t eqHL = Eq(a, b); - const svbool_t eqHH = detail::DupOddB(d, eqHL); - const svbool_t eqLL = detail::DupEvenB(d, eqHL); - return And(eqLL, eqHH); -#endif // HWY_TARGET != HWY_SVE_256 -} - -template -HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) { -#if HWY_TARGET == HWY_SVE_256 - return MaskFromVec(detail::Ne128Vec(d, a, b)); -#else - static_assert(IsSame, uint64_t>(), "D must be u64"); - const svbool_t neHL = Ne(a, b); - const svbool_t neHH = detail::DupOddB(d, neHL); - const svbool_t neLL = detail::DupEvenB(d, neHL); - return Or(neLL, neHH); -#endif // HWY_TARGET != HWY_SVE_256 -} - -// ------------------------------ Eq128Upper, Ne128Upper - -template -HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const svbool_t eqHL = Eq(a, b); - return detail::DupOddB(d, eqHL); -} - -template -HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const svbool_t neHL = Ne(a, b); - return detail::DupOddB(d, neHL); -} - -// ------------------------------ Min128, Max128 (Lt128) - -template -HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) { -#if HWY_TARGET == HWY_SVE_256 - return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); -#else - return IfThenElse(Lt128(d, a, b), a, b); -#endif -} - -template -HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) { -#if HWY_TARGET == HWY_SVE_256 - return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); -#else - return IfThenElse(Lt128(d, b, a), a, b); -#endif -} - -template -HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) { - return IfThenElse(Lt128Upper(d, a, b), a, b); -} - -template -HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) { - return IfThenElse(Lt128Upper(d, b, a), a, b); -} - -// -------------------- 
LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex - -#ifdef HWY_NATIVE_LEADING_ZERO_COUNT -#undef HWY_NATIVE_LEADING_ZERO_COUNT -#else -#define HWY_NATIVE_LEADING_ZERO_COUNT -#endif - -#define HWY_SVE_LEADING_ZERO_COUNT(BASE, CHAR, BITS, HALF, NAME, OP) \ - HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \ - const DFromV d; \ - return BitCast(d, sv##OP##_##CHAR##BITS##_x(detail::PTrue(d), v)); \ - } - -HWY_SVE_FOREACH_UI(HWY_SVE_LEADING_ZERO_COUNT, LeadingZeroCount, clz) -#undef HWY_SVE_LEADING_ZERO_COUNT - -template -HWY_API V TrailingZeroCount(V v) { - return LeadingZeroCount(ReverseBits(v)); -} - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - using T = TFromD; - return BitCast(d, Sub(Set(d, T{sizeof(T) * 8 - 1}), LeadingZeroCount(v))); -} - -// ================================================== END MACROS -namespace detail { // for code folding -#undef HWY_SVE_ALL_PTRUE -#undef HWY_SVE_D -#undef HWY_SVE_FOREACH -#undef HWY_SVE_FOREACH_BF16 -#undef HWY_SVE_FOREACH_F -#undef HWY_SVE_FOREACH_F16 -#undef HWY_SVE_FOREACH_F32 -#undef HWY_SVE_FOREACH_F64 -#undef HWY_SVE_FOREACH_I -#undef HWY_SVE_FOREACH_I08 -#undef HWY_SVE_FOREACH_I16 -#undef HWY_SVE_FOREACH_I32 -#undef HWY_SVE_FOREACH_I64 -#undef HWY_SVE_FOREACH_IF -#undef HWY_SVE_FOREACH_U -#undef HWY_SVE_FOREACH_U08 -#undef HWY_SVE_FOREACH_U16 -#undef HWY_SVE_FOREACH_U32 -#undef HWY_SVE_FOREACH_U64 -#undef HWY_SVE_FOREACH_UI -#undef HWY_SVE_FOREACH_UI08 -#undef HWY_SVE_FOREACH_UI16 -#undef HWY_SVE_FOREACH_UI32 -#undef HWY_SVE_FOREACH_UI64 -#undef HWY_SVE_FOREACH_UIF3264 -#undef HWY_SVE_HAVE_2 -#undef HWY_SVE_PTRUE -#undef HWY_SVE_RETV_ARGPV -#undef HWY_SVE_RETV_ARGPVN -#undef HWY_SVE_RETV_ARGPVV -#undef HWY_SVE_RETV_ARGV -#undef HWY_SVE_RETV_ARGVN -#undef HWY_SVE_RETV_ARGVV -#undef HWY_SVE_RETV_ARGVVV -#undef HWY_SVE_T -#undef HWY_SVE_UNDEFINED -#undef HWY_SVE_V - -} // namespace detail -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/emu128-inl.h b/deps/highway/include/hwy/ops/emu128-inl.h deleted file mode 100644 index 1aba5ec4..00000000 --- a/deps/highway/include/hwy/ops/emu128-inl.h +++ /dev/null @@ -1,2728 +0,0 @@ -// Copyright 2022 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Single-element vectors and operations. -// External include guard in highway.h - see comment there. - -#include // std::abs, std::isnan - -#include "hwy/ops/shared-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -template -using Full128 = Simd; - -// (Wrapper class required for overloading comparison operators.) 
-template -struct Vec128 { - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = N; // only for DFromV - - HWY_INLINE Vec128() = default; - Vec128(const Vec128&) = default; - Vec128& operator=(const Vec128&) = default; - - HWY_INLINE Vec128& operator*=(const Vec128 other) { - return *this = (*this * other); - } - HWY_INLINE Vec128& operator/=(const Vec128 other) { - return *this = (*this / other); - } - HWY_INLINE Vec128& operator+=(const Vec128 other) { - return *this = (*this + other); - } - HWY_INLINE Vec128& operator-=(const Vec128 other) { - return *this = (*this - other); - } - HWY_INLINE Vec128& operator&=(const Vec128 other) { - return *this = (*this & other); - } - HWY_INLINE Vec128& operator|=(const Vec128 other) { - return *this = (*this | other); - } - HWY_INLINE Vec128& operator^=(const Vec128 other) { - return *this = (*this ^ other); - } - - // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h - // relies on this for LoadInterleaved*. CAVEAT: this method of padding - // prevents using range for, especially in SumOfLanes, where it would be - // incorrect. Moving padding to another field would require handling the case - // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward. - T raw[16 / sizeof(T)] = {}; -}; - -// 0 or FF..FF, same size as Vec128. -template -struct Mask128 { - using Raw = hwy::MakeUnsigned; - static HWY_INLINE Raw FromBool(bool b) { - return b ? static_cast(~Raw{0}) : 0; - } - - // Must match the size of Vec128. - Raw bits[16 / sizeof(T)] = {}; -}; - -template -using DFromV = Simd; - -template -using TFromV = typename V::PrivateT; - -// ------------------------------ Zero - -// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. -template -HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { - Vec128, HWY_MAX_LANES_D(D)> v; // zero-initialized - return v; -} - -template -using VFromD = decltype(Zero(D())); - -// ------------------------------ Tuple (VFromD) -#include "hwy/ops/tuple-inl.h" - -// ------------------------------ BitCast - -template -HWY_API VFromD BitCast(D /* tag */, VFrom v) { - VFromD to; - CopySameSize(&v, &to); - return to; -} - -// ------------------------------ ResizeBitCast - -template -HWY_API VFromD ResizeBitCast(D d, VFrom v) { - using DFrom = DFromV; - using TFrom = TFromD; - using TTo = TFromD; - - constexpr size_t kFromByteLen = sizeof(TFrom) * HWY_MAX_LANES_D(DFrom); - constexpr size_t kToByteLen = sizeof(TTo) * HWY_MAX_LANES_D(D); - constexpr size_t kCopyByteLen = HWY_MIN(kFromByteLen, kToByteLen); - - VFromD to = Zero(d); - CopyBytes(&v, &to); - return to; -} - -namespace detail { - -// ResizeBitCast on the HWY_EMU128 target has zero-extending semantics if -// VFromD is a larger vector than FromV -template -HWY_INLINE VFromD ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */, - ToSizeTag /* to_size_tag */, - DTo d_to, DFrom /* d_from */, - VFromD v) { - return ResizeBitCast(d_to, v); -} - -} // namespace detail - -// ------------------------------ Set -template -HWY_API VFromD Set(D d, const T2 t) { - VFromD v; - for (size_t i = 0; i < MaxLanes(d); ++i) { - v.raw[i] = static_cast>(t); - } - return v; -} - -// ------------------------------ Undefined -template -HWY_API VFromD Undefined(D d) { - return Zero(d); -} - -// ------------------------------ Iota - -template , typename T2> -HWY_API VFromD Iota(D d, T2 first) { - VFromD v; - for (size_t i = 0; i < MaxLanes(d); ++i) { - v.raw[i] = - AddWithWraparound(hwy::IsFloatTag(), 
static_cast(first), i); - } - return v; -} - -// ================================================== LOGICAL - -// ------------------------------ Not -template -HWY_API Vec128 Not(Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - VFromD vu = BitCast(du, v); - for (size_t i = 0; i < N; ++i) { - vu.raw[i] = static_cast(~vu.raw[i]); - } - return BitCast(d, vu); -} - -// ------------------------------ And -template -HWY_API Vec128 And(Vec128 a, Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - auto au = BitCast(du, a); - auto bu = BitCast(du, b); - for (size_t i = 0; i < N; ++i) { - au.raw[i] &= bu.raw[i]; - } - return BitCast(d, au); -} -template -HWY_API Vec128 operator&(Vec128 a, Vec128 b) { - return And(a, b); -} - -// ------------------------------ AndNot -template -HWY_API Vec128 AndNot(Vec128 a, Vec128 b) { - return And(Not(a), b); -} - -// ------------------------------ Or -template -HWY_API Vec128 Or(Vec128 a, Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - auto au = BitCast(du, a); - auto bu = BitCast(du, b); - for (size_t i = 0; i < N; ++i) { - au.raw[i] |= bu.raw[i]; - } - return BitCast(d, au); -} -template -HWY_API Vec128 operator|(Vec128 a, Vec128 b) { - return Or(a, b); -} - -// ------------------------------ Xor -template -HWY_API Vec128 Xor(Vec128 a, Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - auto au = BitCast(du, a); - auto bu = BitCast(du, b); - for (size_t i = 0; i < N; ++i) { - au.raw[i] ^= bu.raw[i]; - } - return BitCast(d, au); -} -template -HWY_API Vec128 operator^(Vec128 a, Vec128 b) { - return Xor(a, b); -} - -// ------------------------------ Xor3 -template -HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { - return Xor(x1, Xor(x2, x3)); -} - -// ------------------------------ Or3 -template -HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { - return Or(o1, Or(o2, o3)); -} - -// ------------------------------ OrAnd -template -HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { - return Or(o, And(a1, a2)); -} - -// ------------------------------ IfVecThenElse -template -HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, - Vec128 no) { - return Or(And(mask, yes), AndNot(mask, no)); -} - -// ------------------------------ CopySign -template -HWY_API Vec128 CopySign(Vec128 magn, Vec128 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const DFromV d; - return BitwiseIfThenElse(SignBit(d), sign, magn); -} - -// ------------------------------ CopySignToAbs -template -HWY_API Vec128 CopySignToAbs(Vec128 abs, Vec128 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const DFromV d; - return OrAnd(abs, SignBit(d), sign); -} - -// ------------------------------ BroadcastSignBit -template -HWY_API Vec128 BroadcastSignBit(Vec128 v) { - // This is used inside ShiftRight, so we cannot implement in terms of it. - for (size_t i = 0; i < N; ++i) { - v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0); - } - return v; -} - -// ------------------------------ Mask - -// v must be 0 or FF..FF. 
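// (For example, BroadcastSignBit above produces exactly this 0 / FF..FF form,
// so MaskFromVec(BroadcastSignBit(v)) is a valid way to build a per-lane
// "is negative" mask.)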
-template -HWY_API Mask128 MaskFromVec(Vec128 v) { - Mask128 mask; - CopySameSize(&v, &mask); - return mask; -} - -template -using MFromD = decltype(MaskFromVec(VFromD())); - -template -HWY_API MFromD RebindMask(DTo /* tag */, MFrom mask) { - MFromD to; - CopySameSize(&mask, &to); - return to; -} - -template -Vec128 VecFromMask(Mask128 mask) { - Vec128 v; - CopySameSize(&mask, &v); - return v; -} - -template -VFromD VecFromMask(D /* tag */, MFromD mask) { - return VecFromMask(mask); -} - -template -HWY_API MFromD FirstN(D d, size_t n) { - MFromD m; - for (size_t i = 0; i < MaxLanes(d); ++i) { - m.bits[i] = MFromD::FromBool(i < n); - } - return m; -} - -// Returns mask ? yes : no. -template -HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, - Vec128 no) { - return IfVecThenElse(VecFromMask(mask), yes, no); -} - -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { - const DFromV d; - return IfVecThenElse(VecFromMask(mask), yes, Zero(d)); -} - -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { - const DFromV d; - return IfVecThenElse(VecFromMask(mask), Zero(d), no); -} - -template -HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, - Vec128 no) { - const DFromV d; - const RebindToSigned di; - const auto vi = BitCast(di, v); - - for (size_t i = 0; i < N; ++i) { - v.raw[i] = vi.raw[i] < 0 ? yes.raw[i] : no.raw[i]; - } - return v; -} - -template -HWY_API Vec128 ZeroIfNegative(Vec128 v) { - const DFromV d; - return IfNegativeThenElse(v, Zero(d), v); -} - -// ------------------------------ Mask logical - -template -HWY_API Mask128 Not(Mask128 m) { - return MaskFromVec(Not(VecFromMask(Simd(), m))); -} - -template -HWY_API Mask128 And(Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 AndNot(Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Or(Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Xor(Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 ExclusiveNeither(Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); -} - -// ================================================== SHIFTS - -// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) - -template -HWY_API Vec128 ShiftLeft(Vec128 v) { - static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); - using TU = hwy::MakeUnsigned; - for (size_t i = 0; i < N; ++i) { - const TU raw_u = static_cast(v.raw[i]); - v.raw[i] = static_cast(raw_u << kBits); - } - return v; -} - -template -HWY_API Vec128 ShiftRight(Vec128 v) { - static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); -#if __cplusplus >= 202002L - // Signed right shift is now guaranteed to be arithmetic (rounding toward - // negative infinity, i.e. shifting in the sign bit). - for (size_t i = 0; i < N; ++i) { - v.raw[i] = static_cast(v.raw[i] >> kBits); - } -#else - if (IsSigned()) { - // Emulate arithmetic shift using only logical (unsigned) shifts, because - // signed shifts are still implementation-defined. - using TU = hwy::MakeUnsigned; - for (size_t i = 0; i < N; ++i) { - const TU shifted = static_cast(static_cast(v.raw[i]) >> kBits); - const TU sign = v.raw[i] < 0 ? 
static_cast(~TU{0}) : 0; - const size_t sign_shift = - static_cast(static_cast(sizeof(TU)) * 8 - 1 - kBits); - const TU upper = static_cast(sign << sign_shift); - v.raw[i] = static_cast(shifted | upper); - } - } else { // T is unsigned - for (size_t i = 0; i < N; ++i) { - v.raw[i] = static_cast(v.raw[i] >> kBits); - } - } -#endif - return v; -} - -// ------------------------------ RotateRight (ShiftRight) -template -HWY_API Vec128 RotateRight(const Vec128 v) { - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -// ------------------------------ ShiftLeftSame - -template -HWY_API Vec128 ShiftLeftSame(Vec128 v, int bits) { - for (size_t i = 0; i < N; ++i) { - const auto shifted = static_cast>(v.raw[i]) << bits; - v.raw[i] = static_cast(shifted); - } - return v; -} - -template -HWY_API Vec128 ShiftRightSame(Vec128 v, int bits) { -#if __cplusplus >= 202002L - // Signed right shift is now guaranteed to be arithmetic (rounding toward - // negative infinity, i.e. shifting in the sign bit). - for (size_t i = 0; i < N; ++i) { - v.raw[i] = static_cast(v.raw[i] >> bits); - } -#else - if (IsSigned()) { - // Emulate arithmetic shift using only logical (unsigned) shifts, because - // signed shifts are still implementation-defined. - using TU = hwy::MakeUnsigned; - for (size_t i = 0; i < N; ++i) { - const TU shifted = static_cast(static_cast(v.raw[i]) >> bits); - const TU sign = v.raw[i] < 0 ? static_cast(~TU{0}) : 0; - const size_t sign_shift = - static_cast(static_cast(sizeof(TU)) * 8 - 1 - bits); - const TU upper = static_cast(sign << sign_shift); - v.raw[i] = static_cast(shifted | upper); - } - } else { - for (size_t i = 0; i < N; ++i) { - v.raw[i] = static_cast(v.raw[i] >> bits); // unsigned, logical shift - } - } -#endif - return v; -} - -// ------------------------------ Shl - -template -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - for (size_t i = 0; i < N; ++i) { - const auto shifted = static_cast>(v.raw[i]) - << bits.raw[i]; - v.raw[i] = static_cast(shifted); - } - return v; -} - -template -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { -#if __cplusplus >= 202002L - // Signed right shift is now guaranteed to be arithmetic (rounding toward - // negative infinity, i.e. shifting in the sign bit). - for (size_t i = 0; i < N; ++i) { - v.raw[i] = static_cast(v.raw[i] >> bits.raw[i]); - } -#else - if (IsSigned()) { - // Emulate arithmetic shift using only logical (unsigned) shifts, because - // signed shifts are still implementation-defined. - using TU = hwy::MakeUnsigned; - for (size_t i = 0; i < N; ++i) { - const TU shifted = - static_cast(static_cast(v.raw[i]) >> bits.raw[i]); - const TU sign = v.raw[i] < 0 ? 
static_cast(~TU{0}) : 0; - const size_t sign_shift = static_cast( - static_cast(sizeof(TU)) * 8 - 1 - bits.raw[i]); - const TU upper = static_cast(sign << sign_shift); - v.raw[i] = static_cast(shifted | upper); - } - } else { // T is unsigned - for (size_t i = 0; i < N; ++i) { - v.raw[i] = static_cast(v.raw[i] >> bits.raw[i]); - } - } -#endif - return v; -} - -// ================================================== ARITHMETIC - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_INLINE Vec128 Add(hwy::NonFloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - const uint64_t a64 = static_cast(a.raw[i]); - const uint64_t b64 = static_cast(b.raw[i]); - a.raw[i] = static_cast((a64 + b64) & static_cast(~T(0))); - } - return a; -} -template -HWY_INLINE Vec128 Sub(hwy::NonFloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - const uint64_t a64 = static_cast(a.raw[i]); - const uint64_t b64 = static_cast(b.raw[i]); - a.raw[i] = static_cast((a64 - b64) & static_cast(~T(0))); - } - return a; -} - -template -HWY_INLINE Vec128 Add(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] += b.raw[i]; - } - return a; -} - -template -HWY_INLINE Vec128 Sub(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] -= b.raw[i]; - } - return a; -} - -} // namespace detail - -template -HWY_API Vec128 operator-(Vec128 a, Vec128 b) { - return detail::Sub(hwy::IsFloatTag(), a, b); -} -template -HWY_API Vec128 operator+(Vec128 a, Vec128 b) { - return detail::Add(hwy::IsFloatTag(), a, b); -} - -// ------------------------------ SumsOf8 - -template -HWY_API Vec128 SumsOf8(Vec128 v) { - Vec128 sums; - for (size_t i = 0; i < N; ++i) { - sums.raw[i / 8] += v.raw[i]; - } - return sums; -} - -// ------------------------------ SaturatedAdd -template -HWY_API Vec128 SaturatedAdd(Vec128 a, Vec128 b) { - using TW = MakeSigned>; - for (size_t i = 0; i < N; ++i) { - a.raw[i] = static_cast(HWY_MIN( - HWY_MAX(hwy::LowestValue(), static_cast(a.raw[i]) + b.raw[i]), - hwy::HighestValue())); - } - return a; -} - -// ------------------------------ SaturatedSub -template -HWY_API Vec128 SaturatedSub(Vec128 a, Vec128 b) { - using TW = MakeSigned>; - for (size_t i = 0; i < N; ++i) { - a.raw[i] = static_cast(HWY_MIN( - HWY_MAX(hwy::LowestValue(), static_cast(a.raw[i]) - b.raw[i]), - hwy::HighestValue())); - } - return a; -} - -// ------------------------------ AverageRound -template -HWY_API Vec128 AverageRound(Vec128 a, Vec128 b) { - static_assert(!IsSigned(), "Only for unsigned"); - for (size_t i = 0; i < N; ++i) { - a.raw[i] = static_cast((a.raw[i] + b.raw[i] + 1) / 2); - } - return a; -} - -// ------------------------------ Abs - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_INLINE Vec128 Abs(SignedTag /*tag*/, Vec128 a) { - for (size_t i = 0; i < N; ++i) { - const T s = a.raw[i]; - const T min = hwy::LimitsMin(); - a.raw[i] = static_cast((s >= 0 || s == min) ? 
a.raw[i] : -s); - } - return a; -} - -template -HWY_INLINE Vec128 Abs(hwy::FloatTag /*tag*/, Vec128 v) { - for (size_t i = 0; i < N; ++i) { - v.raw[i] = std::abs(v.raw[i]); - } - return v; -} - -} // namespace detail - -template -HWY_API Vec128 Abs(Vec128 a) { - return detail::Abs(hwy::TypeTag(), a); -} - -// ------------------------------ Min/Max - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_INLINE Vec128 Min(hwy::NonFloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); - } - return a; -} -template -HWY_INLINE Vec128 Max(hwy::NonFloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); - } - return a; -} - -template -HWY_INLINE Vec128 Min(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - if (std::isnan(a.raw[i])) { - a.raw[i] = b.raw[i]; - } else if (std::isnan(b.raw[i])) { - // no change - } else { - a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]); - } - } - return a; -} -template -HWY_INLINE Vec128 Max(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - if (std::isnan(a.raw[i])) { - a.raw[i] = b.raw[i]; - } else if (std::isnan(b.raw[i])) { - // no change - } else { - a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]); - } - } - return a; -} - -} // namespace detail - -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return detail::Min(hwy::IsFloatTag(), a, b); -} - -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return detail::Max(hwy::IsFloatTag(), a, b); -} - -// ------------------------------ Neg - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_API Vec128 Neg(hwy::NonFloatTag /*tag*/, Vec128 v) { - const DFromV d; - return Zero(d) - v; -} - -template -HWY_API Vec128 Neg(hwy::FloatTag /*tag*/, Vec128 v) { - const DFromV d; - return Xor(v, SignBit(d)); -} - -template -HWY_API Vec128 Neg(hwy::SpecialTag /*tag*/, Vec128 v) { - const DFromV d; - return Xor(v, SignBit(d)); -} - -} // namespace detail - -template -HWY_API Vec128 Neg(Vec128 v) { - return detail::Neg(hwy::IsFloatTag(), v); -} - -// ------------------------------ Mul/Div - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_INLINE Vec128 Mul(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] *= b.raw[i]; - } - return a; -} - -template -HWY_INLINE Vec128 Mul(SignedTag /*tag*/, Vec128 a, Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] = static_cast(static_cast(a.raw[i]) * - static_cast(b.raw[i])); - } - return a; -} - -template -HWY_INLINE Vec128 Mul(UnsignedTag /*tag*/, Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] = static_cast(static_cast(a.raw[i]) * - static_cast(b.raw[i])); - } - return a; -} - -} // namespace detail - -// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. -#ifdef HWY_NATIVE_MUL_8 -#undef HWY_NATIVE_MUL_8 -#else -#define HWY_NATIVE_MUL_8 -#endif -#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - -template -HWY_API Vec128 operator*(Vec128 a, Vec128 b) { - return detail::Mul(hwy::TypeTag(), a, b); -} - -template -HWY_API Vec128 operator/(Vec128 a, Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] = (b.raw[i] == T{0}) ? 0 : a.raw[i] / b.raw[i]; - } - return a; -} - -// Returns the upper 16 bits of a * b in each lane. 
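// Worked example: with a = 16384 (0x4000) and b = 512 (0x0200), the product is
// 8388608 = 0x00800000, so MulHigh returns 0x0080 = 128 in that lane.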
-template -HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] = static_cast((int32_t{a.raw[i]} * b.raw[i]) >> 16); - } - return a; -} -template -HWY_API Vec128 MulHigh(Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - // Cast to uint32_t first to prevent overflow. Otherwise the result of - // uint16_t * uint16_t is in "int" which may overflow. In practice the - // result is the same but this way it is also defined. - a.raw[i] = static_cast( - (static_cast(a.raw[i]) * static_cast(b.raw[i])) >> - 16); - } - return a; -} - -template -HWY_API Vec128 MulFixedPoint15(Vec128 a, - Vec128 b) { - for (size_t i = 0; i < N; ++i) { - a.raw[i] = static_cast((a.raw[i] * b.raw[i] + 16384) >> 15); - } - return a; -} - -// Multiplies even lanes (0, 2, ..) and returns the double-wide result. -template -HWY_API Vec128, (N + 1) / 2> MulEven(Vec128 a, - Vec128 b) { - using TW = MakeWide; - Vec128 mul; - for (size_t i = 0; i < N; i += 2) { - const TW a_wide = a.raw[i]; - mul.raw[i / 2] = static_cast(a_wide * b.raw[i]); - } - return mul; -} - -// Multiplies odd lanes (1, 3, ..) and returns the double-wide result. -template -HWY_API Vec128, (N + 1) / 2> MulOdd(Vec128 a, - Vec128 b) { - using TW = MakeWide; - Vec128 mul; - for (size_t i = 0; i < N; i += 2) { - const TW a_wide = a.raw[i + 1]; - mul.raw[i / 2] = static_cast(a_wide * b.raw[i + 1]); - } - return mul; -} - -template -HWY_API Vec128 ApproximateReciprocal(Vec128 v) { - for (size_t i = 0; i < N; ++i) { - // Zero inputs are allowed, but callers are responsible for replacing the - // return value with something else (typically using IfThenElse). This check - // avoids a ubsan error. The result is arbitrary. - v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i]; - } - return v; -} - -// generic_ops takes care of integer T. -template -HWY_API Vec128 AbsDiff(Vec128 a, Vec128 b) { - return Abs(a - b); -} - -// ------------------------------ Floating-point multiply-add variants - -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return mul * x + add; -} - -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return add - mul * x; -} - -template -HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - return mul * x - sub; -} - -template -HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - return Neg(mul) * x - sub; -} - -// ------------------------------ Floating-point square root - -template -HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { - for (size_t i = 0; i < N; ++i) { - const float half = v.raw[i] * 0.5f; - uint32_t bits; - CopySameSize(&v.raw[i], &bits); - // Initial guess based on log2(f) - bits = 0x5F3759DF - (bits >> 1); - CopySameSize(&bits, &v.raw[i]); - // One Newton-Raphson iteration - v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i])); - } - return v; -} - -template -HWY_API Vec128 Sqrt(Vec128 v) { - for (size_t i = 0; i < N; ++i) { - v.raw[i] = std::sqrt(v.raw[i]); - } - return v; -} - -// ------------------------------ Floating-point rounding - -template -HWY_API Vec128 Round(Vec128 v) { - using TI = MakeSigned; - const Vec128 a = Abs(v); - for (size_t i = 0; i < N; ++i) { - if (!(a.raw[i] < MantissaEnd())) { // Huge or NaN - continue; - } - const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); - const TI rounded = static_cast(v.raw[i] + bias); - if (rounded == 0) { - v.raw[i] = v.raw[i] < 0 ? 
T{-0} : T{0}; - continue; - } - const T rounded_f = static_cast(rounded); - // Round to even - if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { - v.raw[i] = static_cast(rounded - (v.raw[i] < T(0) ? -1 : 1)); - continue; - } - v.raw[i] = rounded_f; - } - return v; -} - -// Round-to-nearest even. -template -HWY_API Vec128 NearestInt(Vec128 v) { - using T = float; - using TI = int32_t; - - const Vec128 abs = Abs(v); - Vec128 ret; - for (size_t i = 0; i < N; ++i) { - const bool signbit = std::signbit(v.raw[i]); - - if (!(abs.raw[i] < MantissaEnd())) { // Huge or NaN - // Check if too large to cast or NaN - if (!(abs.raw[i] <= static_cast(LimitsMax()))) { - ret.raw[i] = signbit ? LimitsMin() : LimitsMax(); - continue; - } - ret.raw[i] = static_cast(v.raw[i]); - continue; - } - const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5); - const TI rounded = static_cast(v.raw[i] + bias); - if (rounded == 0) { - ret.raw[i] = 0; - continue; - } - const T rounded_f = static_cast(rounded); - // Round to even - if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) { - ret.raw[i] = rounded - (signbit ? -1 : 1); - continue; - } - ret.raw[i] = rounded; - } - return ret; -} - -template -HWY_API Vec128 Trunc(Vec128 v) { - using TI = MakeSigned; - const Vec128 abs = Abs(v); - for (size_t i = 0; i < N; ++i) { - if (!(abs.raw[i] <= MantissaEnd())) { // Huge or NaN - continue; - } - const TI truncated = static_cast(v.raw[i]); - if (truncated == 0) { - v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0}; - continue; - } - v.raw[i] = static_cast(truncated); - } - return v; -} - -// Toward +infinity, aka ceiling -template -Vec128 Ceil(Vec128 v) { - constexpr int kMantissaBits = MantissaBits(); - using Bits = MakeUnsigned; - const Bits kExponentMask = MaxExponentField(); - const Bits kMantissaMask = MantissaMask(); - const Bits kBias = kExponentMask / 2; - - for (size_t i = 0; i < N; ++i) { - const bool positive = v.raw[i] > Float(0.0); - - Bits bits; - CopySameSize(&v.raw[i], &bits); - - const int exponent = - static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); - // Already an integer. - if (exponent >= kMantissaBits) continue; - // |v| <= 1 => 0 or 1. - if (exponent < 0) { - v.raw[i] = positive ? Float{1} : Float{-0.0}; - continue; - } - - const Bits mantissa_mask = kMantissaMask >> exponent; - // Already an integer - if ((bits & mantissa_mask) == 0) continue; - - // Clear fractional bits and round up - if (positive) bits += (kMantissaMask + 1) >> exponent; - bits &= ~mantissa_mask; - - CopySameSize(&bits, &v.raw[i]); - } - return v; -} - -// Toward -infinity, aka floor -template -Vec128 Floor(Vec128 v) { - constexpr int kMantissaBits = MantissaBits(); - using Bits = MakeUnsigned; - const Bits kExponentMask = MaxExponentField(); - const Bits kMantissaMask = MantissaMask(); - const Bits kBias = kExponentMask / 2; - - for (size_t i = 0; i < N; ++i) { - const bool negative = v.raw[i] < Float(0.0); - - Bits bits; - CopySameSize(&v.raw[i], &bits); - - const int exponent = - static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); - // Already an integer. - if (exponent >= kMantissaBits) continue; - // |v| <= 1 => -1 or 0. - if (exponent < 0) { - v.raw[i] = negative ? 
Float(-1.0) : Float(0.0); - continue; - } - - const Bits mantissa_mask = kMantissaMask >> exponent; - // Already an integer - if ((bits & mantissa_mask) == 0) continue; - - // Clear fractional bits and round down - if (negative) bits += (kMantissaMask + 1) >> exponent; - bits &= ~mantissa_mask; - - CopySameSize(&bits, &v.raw[i]); - } - return v; -} - -// ------------------------------ Floating-point classification - -template -HWY_API Mask128 IsNaN(Vec128 v) { - Mask128 ret; - for (size_t i = 0; i < N; ++i) { - // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. - MakeUnsigned bits; - CopySameSize(&v.raw[i], &bits); - bits += bits; - bits >>= 1; // clear sign bit - // NaN if all exponent bits are set and the mantissa is not zero. - ret.bits[i] = Mask128::FromBool(bits > ExponentMask()); - } - return ret; -} - -template -HWY_API Mask128 IsInf(Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV d; - const RebindToSigned di; - const VFromD vi = BitCast(di, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); -} - -// Returns whether normal/subnormal/zero. -template -HWY_API Mask128 IsFinite(Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; // cheaper than unsigned comparison - using VI = VFromD; - using VU = VFromD; - const VU vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, then right so we can compare with the - // max exponent (cannot compare with MaxExponentTimes2 directly because it is - // negative and non-negative floats would be greater). - const VI exp = - BitCast(di, ShiftRight() + 1>(Add(vu, vu))); - return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); -} - -// ================================================== COMPARE - -template -HWY_API Mask128 operator==(Vec128 a, Vec128 b) { - Mask128 m; - for (size_t i = 0; i < N; ++i) { - m.bits[i] = Mask128::FromBool(a.raw[i] == b.raw[i]); - } - return m; -} - -template -HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { - Mask128 m; - for (size_t i = 0; i < N; ++i) { - m.bits[i] = Mask128::FromBool(a.raw[i] != b.raw[i]); - } - return m; -} - -template -HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return (v & bit) == bit; -} - -template -HWY_API Mask128 operator<(Vec128 a, Vec128 b) { - Mask128 m; - for (size_t i = 0; i < N; ++i) { - m.bits[i] = Mask128::FromBool(a.raw[i] < b.raw[i]); - } - return m; -} -template -HWY_API Mask128 operator>(Vec128 a, Vec128 b) { - Mask128 m; - for (size_t i = 0; i < N; ++i) { - m.bits[i] = Mask128::FromBool(a.raw[i] > b.raw[i]); - } - return m; -} - -template -HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { - Mask128 m; - for (size_t i = 0; i < N; ++i) { - m.bits[i] = Mask128::FromBool(a.raw[i] <= b.raw[i]); - } - return m; -} -template -HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { - Mask128 m; - for (size_t i = 0; i < N; ++i) { - m.bits[i] = Mask128::FromBool(a.raw[i] >= b.raw[i]); - } - return m; -} - -// ------------------------------ Lt128 - -// Only makes sense for full vectors of u64. 
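// Usage sketch for the Lt128/Min128 ops defined in this section (MinKey128 is
// a hypothetical helper): each 128-bit key occupies two adjacent u64 lanes,
// low half at the even index, high half at the odd index. E.g. {1, 9} encodes
// 9 * 2^64 + 1 and {5, 2} encodes 2 * 2^64 + 5, so Min128 of those two pairs
// yields {5, 2}. Assumes d is a u64 tag and num_u64 is a nonzero multiple of
// Lanes(d).
template <class D>
VFromD<D> MinKey128(D d, const uint64_t* HWY_RESTRICT keys, size_t num_u64) {
  VFromD<D> min = LoadU(d, keys);
  for (size_t i = Lanes(d); i < num_u64; i += Lanes(d)) {
    min = Min128(d, min, LoadU(d, keys + i));
  }
  return min;  // each 128-bit pair holds the min over its column of keys
}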
-template -HWY_API MFromD Lt128(D /* tag */, Vec128 a, Vec128 b) { - const bool lt = - (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]); - Mask128 ret; - ret.bits[0] = ret.bits[1] = Mask128::FromBool(lt); - return ret; -} - -template -HWY_API MFromD Lt128Upper(D /* tag */, Vec128 a, - Vec128 b) { - const bool lt = a.raw[1] < b.raw[1]; - Mask128 ret; - ret.bits[0] = ret.bits[1] = Mask128::FromBool(lt); - return ret; -} - -// ------------------------------ Eq128 - -// Only makes sense for full vectors of u64. -template -HWY_API MFromD Eq128(D /* tag */, Vec128 a, Vec128 b) { - const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0]; - Mask128 ret; - ret.bits[0] = ret.bits[1] = Mask128::FromBool(eq); - return ret; -} - -template -HWY_API Mask128 Ne128(D /* tag */, Vec128 a, - Vec128 b) { - const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0]; - Mask128 ret; - ret.bits[0] = ret.bits[1] = Mask128::FromBool(ne); - return ret; -} - -template -HWY_API MFromD Eq128Upper(D /* tag */, Vec128 a, - Vec128 b) { - const bool eq = a.raw[1] == b.raw[1]; - Mask128 ret; - ret.bits[0] = ret.bits[1] = Mask128::FromBool(eq); - return ret; -} - -template -HWY_API MFromD Ne128Upper(D /* tag */, Vec128 a, - Vec128 b) { - const bool ne = a.raw[1] != b.raw[1]; - Mask128 ret; - ret.bits[0] = ret.bits[1] = Mask128::FromBool(ne); - return ret; -} - -// ------------------------------ Min128, Max128 (Lt128) - -template -HWY_API VFromD Min128(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128(d, a, b), a, b); -} - -template -HWY_API VFromD Max128(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128(d, b, a), a, b); -} - -template -HWY_API VFromD Min128Upper(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128Upper(d, a, b), a, b); -} - -template -HWY_API VFromD Max128Upper(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128Upper(d, b, a), a, b); -} - -// ================================================== MEMORY - -// ------------------------------ Load - -template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { - VFromD v; - CopyBytes(aligned, v.raw); // copy from array - return v; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - return IfThenElseZero(m, LoadU(d, p)); -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - return IfThenElse(m, LoadU(d, p), v); -} - -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); -} - -// In some use cases, "load single lane" is sufficient; otherwise avoid this. 
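// Usage sketch (CopyLanes is a hypothetical helper): the LoadN/StoreN ops
// defined below clamp the lane count, which keeps remainder handling simple.
template <class D, typename T = TFromD<D>>
void CopyLanes(D d, const T* HWY_RESTRICT from, T* HWY_RESTRICT to,
               size_t num) {
  size_t i = 0;
  for (; i + Lanes(d) <= num; i += Lanes(d)) {
    StoreU(LoadU(d, from + i), d, to + i);  // full vectors
  }
  // Remaining (possibly zero) lanes, without reading or writing past `num`.
  StoreN(LoadN(d, from + i, num - i), d, to + i, num - i);
}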
-template -HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT aligned) { - return Load(d, aligned); -} - -#ifdef HWY_NATIVE_LOAD_N -#undef HWY_NATIVE_LOAD_N -#else -#define HWY_NATIVE_LOAD_N -#endif - -template -HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, - size_t max_lanes_to_load) { - VFromD v = Zero(d); - const size_t N = Lanes(d); - const size_t num_of_lanes_to_load = HWY_MIN(max_lanes_to_load, N); - CopyBytes(p, v.raw, num_of_lanes_to_load * sizeof(TFromD)); - return v; -} - -// ------------------------------ Store - -template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - CopyBytes(v.raw, aligned); // copy to array -} - -template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { - Store(v, d, p); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (m.bits[i]) p[i] = v.raw[i]; - } -} - -#ifdef HWY_NATIVE_STORE_N -#undef HWY_NATIVE_STORE_N -#else -#define HWY_NATIVE_STORE_N -#endif - -template -HWY_API void StoreN(VFromD v, D d, TFromD* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const size_t N = Lanes(d); - const size_t num_of_lanes_to_store = HWY_MIN(max_lanes_to_store, N); - CopyBytes(v.raw, p, num_of_lanes_to_store * sizeof(TFromD)); -} - -// ------------------------------ LoadInterleaved2/3/4 - -// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. -// We implement those here because scalar code is likely faster than emulation -// via shuffles. -#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#else -#define HWY_NATIVE_LOAD_STORE_INTERLEAVED -#endif - -template > -HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1) { - alignas(16) T buf0[MaxLanes(d)]; - alignas(16) T buf1[MaxLanes(d)]; - for (size_t i = 0; i < MaxLanes(d); ++i) { - buf0[i] = *unaligned++; - buf1[i] = *unaligned++; - } - v0 = Load(d, buf0); - v1 = Load(d, buf1); -} - -template > -HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - alignas(16) T buf0[MaxLanes(d)]; - alignas(16) T buf1[MaxLanes(d)]; - alignas(16) T buf2[MaxLanes(d)]; - for (size_t i = 0; i < MaxLanes(d); ++i) { - buf0[i] = *unaligned++; - buf1[i] = *unaligned++; - buf2[i] = *unaligned++; - } - v0 = Load(d, buf0); - v1 = Load(d, buf1); - v2 = Load(d, buf2); -} - -template > -HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - alignas(16) T buf0[MaxLanes(d)]; - alignas(16) T buf1[MaxLanes(d)]; - alignas(16) T buf2[MaxLanes(d)]; - alignas(16) T buf3[MaxLanes(d)]; - for (size_t i = 0; i < MaxLanes(d); ++i) { - buf0[i] = *unaligned++; - buf1[i] = *unaligned++; - buf2[i] = *unaligned++; - buf3[i] = *unaligned++; - } - v0 = Load(d, buf0); - v1 = Load(d, buf1); - v2 = Load(d, buf2); - v3 = Load(d, buf3); -} - -// ------------------------------ StoreInterleaved2/3/4 - -template -HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, - TFromD* HWY_RESTRICT unaligned) { - for (size_t i = 0; i < MaxLanes(d); ++i) { - *unaligned++ = v0.raw[i]; - *unaligned++ = v1.raw[i]; - } -} - -template -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - TFromD* HWY_RESTRICT unaligned) { - for (size_t i = 0; i < MaxLanes(d); ++i) { - *unaligned++ = v0.raw[i]; - *unaligned++ = v1.raw[i]; - *unaligned++ = v2.raw[i]; - } -} - -template -HWY_API void StoreInterleaved4(VFromD v0, 
VFromD v1, VFromD v2, - VFromD v3, D d, - TFromD* HWY_RESTRICT unaligned) { - for (size_t i = 0; i < MaxLanes(d); ++i) { - *unaligned++ = v0.raw[i]; - *unaligned++ = v1.raw[i]; - *unaligned++ = v2.raw[i]; - *unaligned++ = v3.raw[i]; - } -} - -// ------------------------------ Stream -template -HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - Store(v, d, aligned); -} - -// ------------------------------ Scatter in generic_ops-inl.h -// ------------------------------ Gather in generic_ops-inl.h - -// ================================================== CONVERT - -// ConvertTo and DemoteTo with floating-point input and integer output truncate -// (rounding toward zero). - -template -HWY_API VFromD PromoteTo(DTo d, Vec128 from) { - static_assert(sizeof(TFromD) > sizeof(TFrom), "Not promoting"); - VFromD ret; - for (size_t i = 0; i < MaxLanes(d); ++i) { - // For bits Y > X, floatX->floatY and intX->intY are always representable. - ret.raw[i] = static_cast>(from.raw[i]); - } - return ret; -} - -// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here, -// so we overload for TFrom=double and ToT={float,int32_t}. -template -HWY_API VFromD DemoteTo(D d, VFromD> from) { - VFromD ret; - for (size_t i = 0; i < MaxLanes(d); ++i) { - // Prevent ubsan errors when converting float to narrower integer/float - if (std::isinf(from.raw[i]) || - std::fabs(from.raw[i]) > static_cast(HighestValue())) { - ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue() - : HighestValue(); - continue; - } - ret.raw[i] = static_cast(from.raw[i]); - } - return ret; -} -template -HWY_API VFromD DemoteTo(D d, VFromD> from) { - VFromD ret; - for (size_t i = 0; i < MaxLanes(d); ++i) { - // Prevent ubsan errors when converting int32_t to narrower integer/int32_t - if (std::isinf(from.raw[i]) || - std::fabs(from.raw[i]) > static_cast(HighestValue())) { - ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue() - : HighestValue(); - continue; - } - ret.raw[i] = static_cast(from.raw[i]); - } - return ret; -} - -template )> -HWY_API VFromD DemoteTo(DTo /* tag */, Vec128 from) { - using TTo = TFromD; - static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); - - VFromD ret; - for (size_t i = 0; i < N; ++i) { - // Int to int: choose closest value in ToT to `from` (avoids UB) - from.raw[i] = - HWY_MIN(HWY_MAX(LimitsMin(), from.raw[i]), LimitsMax()); - ret.raw[i] = static_cast(from.raw[i]); - } - return ret; -} - -template -HWY_API VFromD DemoteTo(DTo /* tag */, Vec128 from) { - using TTo = TFromD; - static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); - - VFromD ret; - for (size_t i = 0; i < N; ++i) { - // Int to int: choose closest value in ToT to `from` (avoids UB) - from.raw[i] = HWY_MIN(from.raw[i], LimitsMax()); - ret.raw[i] = static_cast(from.raw[i]); - } - return ret; -} - -template -HWY_API VFromD ReorderDemote2To(DBF16 dbf16, VF32 a, VF32 b) { - const Repartition du32; - const VFromD b_in_lower = ShiftRight<16>(BitCast(du32, b)); - // Avoid OddEven - we want the upper half of `a` even on big-endian systems. 
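// (bfloat16 is simply the upper half of the binary32 encoding, e.g.
// 1.0f = 0x3F800000 -> 0x3F80, so each u32 lane below ends up holding the
// bfloat16 of `a` in its upper 16 bits and the bfloat16 of `b` in its lower
// 16 bits.)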
- const VFromD a_mask = Set(du32, 0xFFFF0000); - return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower)); -} - -template ), class V, - HWY_IF_SIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const RepartitionToWide dw; - const size_t NW = Lanes(dw); - using TN = TFromD; - const TN min = LimitsMin(); - const TN max = LimitsMax(); - VFromD ret; - for (size_t i = 0; i < NW; ++i) { - ret.raw[i] = static_cast(HWY_MIN(HWY_MAX(min, a.raw[i]), max)); - } - for (size_t i = 0; i < NW; ++i) { - ret.raw[NW + i] = static_cast(HWY_MIN(HWY_MAX(min, b.raw[i]), max)); - } - return ret; -} - -template ) * 2), - HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const RepartitionToWide dw; - const size_t NW = Lanes(dw); - using TN = TFromD; - const TN max = LimitsMax(); - VFromD ret; - for (size_t i = 0; i < NW; ++i) { - ret.raw[i] = static_cast(HWY_MIN(a.raw[i], max)); - } - for (size_t i = 0; i < NW; ++i) { - ret.raw[NW + i] = static_cast(HWY_MIN(b.raw[i], max)); - } - return ret; -} - -template ), class V, - HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedDemote2To(DN dn, V a, V b) { - return ReorderDemote2To(dn, a, b); -} - -template ), - HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedDemote2To(DN dn, V a, V b) { - const RebindToUnsigned> du32; - const size_t NW = Lanes(du32); - VFromD> ret; - - const auto a_bits = BitCast(du32, a); - const auto b_bits = BitCast(du32, b); - - for (size_t i = 0; i < NW; ++i) { - ret.raw[i] = static_cast(a_bits.raw[i] >> 16); - } - for (size_t i = 0; i < NW; ++i) { - ret.raw[NW + i] = static_cast(b_bits.raw[i] >> 16); - } - return BitCast(dn, ret); -} - -namespace detail { - -HWY_INLINE void StoreU16ToF16(const uint16_t val, - hwy::float16_t* HWY_RESTRICT to) { - CopySameSize(&val, to); -} - -HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) { - uint16_t bits16; - CopySameSize(from, &bits16); - return bits16; -} - -} // namespace detail - -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = F32FromBF16(v.raw[i]); - } - return ret; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = BF16FromF32(v.raw[i]); - } - return ret; -} - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_API VFromD ConvertTo(hwy::FloatTag /*tag*/, DTo /*tag*/, - Vec128 from) { - using ToT = TFromD; - static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size"); - VFromD ret; - constexpr size_t N = HWY_MAX_LANES_D(DTo); - for (size_t i = 0; i < N; ++i) { - // float## -> int##: return closest representable value. We cannot exactly - // represent LimitsMax in TFrom, so use double. - const double f = static_cast(from.raw[i]); - if (std::isinf(from.raw[i]) || - std::fabs(f) > static_cast(LimitsMax())) { - ret.raw[i] = - std::signbit(from.raw[i]) ? 
LimitsMin() : LimitsMax(); - continue; - } - ret.raw[i] = static_cast(from.raw[i]); - } - return ret; -} - -template -HWY_API VFromD ConvertTo(hwy::NonFloatTag /*tag*/, DTo /* tag */, - Vec128 from) { - using ToT = TFromD; - static_assert(sizeof(ToT) == sizeof(TFrom), "Should have same size"); - VFromD ret; - constexpr size_t N = HWY_MAX_LANES_D(DTo); - for (size_t i = 0; i < N; ++i) { - // int## -> float##: no check needed - ret.raw[i] = static_cast(from.raw[i]); - } - return ret; -} - -} // namespace detail - -template -HWY_API VFromD ConvertTo(DTo d, Vec128 from) { - return detail::ConvertTo(hwy::IsFloatTag(), d, from); -} - -template -HWY_API Vec128 U8FromU32(Vec128 v) { - return DemoteTo(Simd(), v); -} - -// ------------------------------ Truncations - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = static_cast(v.raw[i] & 0xFF); - } - return ret; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = static_cast(v.raw[i] & 0xFFFF); - } - return ret; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = static_cast(v.raw[i] & 0xFFFFFFFFu); - } - return ret; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = static_cast(v.raw[i] & 0xFF); - } - return ret; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = static_cast(v.raw[i] & 0xFFFF); - } - return ret; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - VFromD ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = static_cast(v.raw[i] & 0xFF); - } - return ret; -} - -#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#else -#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#endif - -template ) * 2), - HWY_IF_LANES_D(DN, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedTruncate2To(DN dn, V a, V b) { - const RepartitionToWide dw; - const size_t NW = Lanes(dw); - using TW = TFromD; - using TN = TFromD; - VFromD ret; - constexpr TW max_val{LimitsMax()}; - - for (size_t i = 0; i < NW; ++i) { - ret.raw[i] = static_cast(a.raw[i] & max_val); - } - for (size_t i = 0; i < NW; ++i) { - ret.raw[NW + i] = static_cast(b.raw[i] & max_val); - } - return ret; -} - -// ================================================== COMBINE - -template -HWY_API Vec128 LowerHalf(Vec128 v) { - Vec128 ret; - CopyBytes(v.raw, ret.raw); - return ret; -} - -template -HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { - return LowerHalf(v); -} - -template -HWY_API VFromD UpperHalf(D d, VFromD> v) { - VFromD ret; - CopyBytes(&v.raw[MaxLanes(d)], ret.raw); - return ret; -} - -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> v) { - const Half dh; - VFromD ret; // zero-initialized - CopyBytes(v.raw, ret.raw); - return ret; -} - -template >> -HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { - const Half dh; - VFromD ret; - CopyBytes(lo_half.raw, &ret.raw[0]); - CopyBytes(hi_half.raw, &ret.raw[MaxLanes(dh)]); - return ret; -} - -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - const Half dh; - VFromD ret; - CopyBytes(lo.raw, &ret.raw[0]); - CopyBytes(hi.raw, &ret.raw[MaxLanes(dh)]); - return ret; -} - -template -HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { - const Half dh; - VFromD ret; - 
CopyBytes(&lo.raw[MaxLanes(dh)], &ret.raw[0]); - CopyBytes(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]); - return ret; -} - -template -HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { - const Half dh; - VFromD ret; - CopyBytes(&lo.raw[MaxLanes(dh)], &ret.raw[0]); - CopyBytes(hi.raw, &ret.raw[MaxLanes(dh)]); - return ret; -} - -template -HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { - const Half dh; - VFromD ret; - CopyBytes(lo.raw, &ret.raw[0]); - CopyBytes(&hi.raw[MaxLanes(dh)], &ret.raw[MaxLanes(dh)]); - return ret; -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const Half dh; - VFromD ret; - for (size_t i = 0; i < MaxLanes(dh); ++i) { - ret.raw[i] = lo.raw[2 * i]; - } - for (size_t i = 0; i < MaxLanes(dh); ++i) { - ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i]; - } - return ret; -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const Half dh; - VFromD ret; - for (size_t i = 0; i < MaxLanes(dh); ++i) { - ret.raw[i] = lo.raw[2 * i + 1]; - } - for (size_t i = 0; i < MaxLanes(dh); ++i) { - ret.raw[MaxLanes(dh) + i] = hi.raw[2 * i + 1]; - } - return ret; -} - -// ------------------------------ CombineShiftRightBytes -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - VFromD ret; - const uint8_t* HWY_RESTRICT lo8 = - reinterpret_cast(lo.raw); - uint8_t* HWY_RESTRICT ret8 = - reinterpret_cast(ret.raw); - CopyBytes(lo8 + kBytes, ret8); - CopyBytes(hi.raw, ret8 + d.MaxBytes() - kBytes); - return ret; -} - -// ------------------------------ ShiftLeftBytes - -template -HWY_API VFromD ShiftLeftBytes(D d, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - VFromD ret; - uint8_t* HWY_RESTRICT ret8 = - reinterpret_cast(ret.raw); - ZeroBytes(ret8); - CopyBytes(v.raw, ret8 + kBytes); - return ret; -} - -template -HWY_API Vec128 ShiftLeftBytes(Vec128 v) { - return ShiftLeftBytes(DFromV(), v); -} - -// ------------------------------ ShiftLeftLanes - -template > -HWY_API VFromD ShiftLeftLanes(D d, VFromD v) { - const Repartition d8; - return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); -} - -template -HWY_API Vec128 ShiftLeftLanes(Vec128 v) { - return ShiftLeftLanes(DFromV(), v); -} - -// ------------------------------ ShiftRightBytes -template -HWY_API VFromD ShiftRightBytes(D d, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - VFromD ret; - const uint8_t* HWY_RESTRICT v8 = - reinterpret_cast(v.raw); - uint8_t* HWY_RESTRICT ret8 = - reinterpret_cast(ret.raw); - CopyBytes(v8 + kBytes, ret8); - ZeroBytes(ret8 + d.MaxBytes() - kBytes); - return ret; -} - -// ------------------------------ ShiftRightLanes -template -HWY_API VFromD ShiftRightLanes(D d, VFromD v) { - const Repartition d8; - constexpr size_t kBytes = kLanes * sizeof(TFromD); - return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); -} - -// ================================================== SWIZZLE - -template -HWY_API T GetLane(Vec128 v) { - return v.raw[0]; -} - -template -HWY_API Vec128 InsertLane(Vec128 v, size_t i, T t) { - v.raw[i] = t; - return v; -} - -template -HWY_API T ExtractLane(Vec128 v, size_t i) { - return v.raw[i]; -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - for (size_t i = 0; i < N; i += 2) { - v.raw[i + 1] = v.raw[i]; - } - return v; -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - for (size_t i = 0; i < N; i += 2) { - v.raw[i] = v.raw[i + 1]; - } - return v; -} - -template -HWY_API Vec128 OddEven(Vec128 odd, Vec128 even) { - for (size_t i = 0; i < N; i += 2) { - 
odd.raw[i] = even.raw[i]; - } - return odd; -} - -template -HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { - return even; -} - -// ------------------------------ SwapAdjacentBlocks -template -HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { - return v; -} - -// ------------------------------ TableLookupLanes - -// Returned by SetTableIndices for use by TableLookupLanes. -template -struct Indices128 { - MakeSigned raw[N]; -}; - -template -HWY_API Indices128, N> IndicesFromVec(D d, Vec128 vec) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index/lane size must match"); - Indices128, N> ret; - CopyBytes(vec.raw, ret.raw); - return ret; -} - -template -HWY_API Indices128, HWY_MAX_LANES_D(D)> SetTableIndices( - D d, const TI* idx) { - return IndicesFromVec(d, LoadU(Rebind(), idx)); -} - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { - Vec128 ret; - for (size_t i = 0; i < N; ++i) { - ret.raw[i] = v.raw[idx.raw[i]]; - } - return ret; -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { - using TI = MakeSigned; - Vec128 ret; - constexpr TI kVecLaneIdxMask = static_cast(N - 1); - for (size_t i = 0; i < N; ++i) { - const auto src_idx = idx.raw[i]; - const auto masked_src_lane_idx = src_idx & kVecLaneIdxMask; - ret.raw[i] = (src_idx < static_cast(N)) ? a.raw[masked_src_lane_idx] - : b.raw[masked_src_lane_idx]; - } - return ret; -} - -// ------------------------------ ReverseBlocks -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return v; // Single block: no change -} - -// ------------------------------ Reverse - -template -HWY_API VFromD Reverse(D d, VFromD v) { - VFromD ret; - for (size_t i = 0; i < MaxLanes(d); ++i) { - ret.raw[i] = v.raw[MaxLanes(d) - 1 - i]; - } - return ret; -} - -// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
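Illustrative sketch (not from the original header): the SetTableIndices/TableLookupLanes pair defined above performs a runtime lane permutation. A minimal static-dispatch example; the demo function name and the hn alias are choices made here for illustration only:

#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Lane i of the result is v[idx[i]].
void DemoTableLookupLanes() {
  const hn::Full128<int32_t> d;  // one 128-bit block of four int32 lanes
  alignas(16) const int32_t in[4] = {10, 20, 30, 40};
  alignas(16) const int32_t idx[4] = {3, 3, 0, 1};
  const auto v = hn::Load(d, in);
  const auto indices = hn::SetTableIndices(d, idx);
  alignas(16) int32_t out[4];
  hn::Store(hn::TableLookupLanes(v, indices), d, out);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 40 40 10 20
}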
-#ifdef HWY_NATIVE_REVERSE2_8 -#undef HWY_NATIVE_REVERSE2_8 -#else -#define HWY_NATIVE_REVERSE2_8 -#endif - -template -HWY_API VFromD Reverse2(D d, VFromD v) { - VFromD ret; - for (size_t i = 0; i < MaxLanes(d); i += 2) { - ret.raw[i + 0] = v.raw[i + 1]; - ret.raw[i + 1] = v.raw[i + 0]; - } - return ret; -} - -template -HWY_API VFromD Reverse4(D d, VFromD v) { - VFromD ret; - for (size_t i = 0; i < MaxLanes(d); i += 4) { - ret.raw[i + 0] = v.raw[i + 3]; - ret.raw[i + 1] = v.raw[i + 2]; - ret.raw[i + 2] = v.raw[i + 1]; - ret.raw[i + 3] = v.raw[i + 0]; - } - return ret; -} - -template -HWY_API VFromD Reverse8(D d, VFromD v) { - VFromD ret; - for (size_t i = 0; i < MaxLanes(d); i += 8) { - ret.raw[i + 0] = v.raw[i + 7]; - ret.raw[i + 1] = v.raw[i + 6]; - ret.raw[i + 2] = v.raw[i + 5]; - ret.raw[i + 3] = v.raw[i + 4]; - ret.raw[i + 4] = v.raw[i + 3]; - ret.raw[i + 5] = v.raw[i + 2]; - ret.raw[i + 6] = v.raw[i + 1]; - ret.raw[i + 7] = v.raw[i + 0]; - } - return ret; -} - -// ------------------------------ SlideUpLanes - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { - VFromD ret = Zero(d); - constexpr size_t N = HWY_MAX_LANES_D(D); - const size_t clamped_amt = HWY_MIN(amt, N); - CopyBytes(v.raw, ret.raw + clamped_amt, - (N - clamped_amt) * sizeof(TFromD)); - return ret; -} - -// ------------------------------ SlideDownLanes - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { - VFromD ret = Zero(d); - constexpr size_t N = HWY_MAX_LANES_D(D); - const size_t clamped_amt = HWY_MIN(amt, N); - CopyBytes(v.raw + clamped_amt, ret.raw, - (N - clamped_amt) * sizeof(TFromD)); - return ret; -} - -// ================================================== BLOCKWISE - -// ------------------------------ Shuffle* - -// Swap 32-bit halves in 64-bit halves. 
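Illustrative sketch (not from the original header): for 32-bit lanes, Shuffle2301 swaps neighboring lanes within each 64-bit half, which is why the definition below simply forwards to Reverse2. The demo function name is an arbitrary choice:

#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// {0, 1, 2, 3} becomes {1, 0, 3, 2}.
void DemoShuffle2301() {
  const hn::Full128<uint32_t> d;
  alignas(16) const uint32_t in[4] = {0, 1, 2, 3};
  alignas(16) uint32_t out[4];
  hn::Store(hn::Shuffle2301(hn::Load(d, in)), d, out);
  printf("%u %u %u %u\n", static_cast<unsigned>(out[0]),
         static_cast<unsigned>(out[1]), static_cast<unsigned>(out[2]),
         static_cast<unsigned>(out[3]));  // 1 0 3 2
}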
-template -HWY_API Vec128 Shuffle2301(Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit"); - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Reverse2(DFromV(), v); -} - -// Swap 64-bit halves -template -HWY_API Vec128 Shuffle1032(Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit"); - Vec128 ret; - ret.raw[3] = v.raw[1]; - ret.raw[2] = v.raw[0]; - ret.raw[1] = v.raw[3]; - ret.raw[0] = v.raw[2]; - return ret; -} -template -HWY_API Vec128 Shuffle01(Vec128 v) { - static_assert(sizeof(T) == 8, "Only for 64-bit"); - return Reverse2(DFromV(), v); -} - -// Rotate right 32 bits -template -HWY_API Vec128 Shuffle0321(Vec128 v) { - Vec128 ret; - ret.raw[3] = v.raw[0]; - ret.raw[2] = v.raw[3]; - ret.raw[1] = v.raw[2]; - ret.raw[0] = v.raw[1]; - return ret; -} - -// Rotate left 32 bits -template -HWY_API Vec128 Shuffle2103(Vec128 v) { - Vec128 ret; - ret.raw[3] = v.raw[2]; - ret.raw[2] = v.raw[1]; - ret.raw[1] = v.raw[0]; - ret.raw[0] = v.raw[3]; - return ret; -} - -template -HWY_API Vec128 Shuffle0123(Vec128 v) { - return Reverse4(DFromV(), v); -} - -// ------------------------------ Broadcast -template -HWY_API Vec128 Broadcast(Vec128 v) { - for (size_t i = 0; i < N; ++i) { - v.raw[i] = v.raw[kLane]; - } - return v; -} - -// ------------------------------ TableLookupBytes, TableLookupBytesOr0 - -template -HWY_API Vec128 TableLookupBytes(Vec128 v, - Vec128 indices) { - const uint8_t* HWY_RESTRICT v_bytes = - reinterpret_cast(v.raw); - const uint8_t* HWY_RESTRICT idx_bytes = - reinterpret_cast(indices.raw); - Vec128 ret; - uint8_t* HWY_RESTRICT ret_bytes = - reinterpret_cast(ret.raw); - for (size_t i = 0; i < NI * sizeof(TI); ++i) { - const size_t idx = idx_bytes[i]; - // Avoid out of bounds reads. - ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0; - } - return ret; -} - -template -HWY_API Vec128 TableLookupBytesOr0(Vec128 v, - Vec128 indices) { - // Same as TableLookupBytes, which already returns 0 if out of bounds. - return TableLookupBytes(v, indices); -} - -// ------------------------------ InterleaveLower/InterleaveUpper - -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - Vec128 ret; - for (size_t i = 0; i < N / 2; ++i) { - ret.raw[2 * i + 0] = a.raw[i]; - ret.raw[2 * i + 1] = b.raw[i]; - } - return ret; -} - -// Additional overload for the optional tag. -template -HWY_API V InterleaveLower(DFromV /* tag */, V a, V b) { - return InterleaveLower(a, b); -} - -template -HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { - const Half dh; - VFromD ret; - for (size_t i = 0; i < MaxLanes(dh); ++i) { - ret.raw[2 * i + 0] = a.raw[MaxLanes(dh) + i]; - ret.raw[2 * i + 1] = b.raw[MaxLanes(dh) + i]; - } - return ret; -} - -// ------------------------------ ZipLower/ZipUpper (InterleaveLower) - -// Same as Interleave*, except that the return lanes are double-width integers; -// this is necessary because the single-lane scalar cannot return two values. 
-template >> -HWY_API VFromD ZipLower(V a, V b) { - return BitCast(DW(), InterleaveLower(a, b)); -} -template , class DW = RepartitionToWide> -HWY_API VFromD ZipLower(DW dw, V a, V b) { - return BitCast(dw, InterleaveLower(D(), a, b)); -} - -template , class DW = RepartitionToWide> -HWY_API VFromD ZipUpper(DW dw, V a, V b) { - return BitCast(dw, InterleaveUpper(D(), a, b)); -} - -// ================================================== MASK - -template -HWY_API bool AllFalse(D d, MFromD mask) { - typename MFromD::Raw or_sum = 0; - for (size_t i = 0; i < MaxLanes(d); ++i) { - or_sum |= mask.bits[i]; - } - return or_sum == 0; -} - -template -HWY_API bool AllTrue(D d, MFromD mask) { - constexpr uint64_t kAll = LimitsMax::Raw>(); - uint64_t and_sum = kAll; - for (size_t i = 0; i < MaxLanes(d); ++i) { - and_sum &= mask.bits[i]; - } - return and_sum == kAll; -} - -// `p` points to at least 8 readable bytes, not all of which need be valid. -template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - MFromD m; - for (size_t i = 0; i < MaxLanes(d); ++i) { - const size_t bit = size_t{1} << (i & 7); - const size_t idx_byte = i >> 3; - m.bits[i] = MFromD::FromBool((bits[idx_byte] & bit) != 0); - } - return m; -} - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - bits[0] = 0; - if (MaxLanes(d) > 8) bits[1] = 0; // MaxLanes(d) <= 16, so max two bytes - for (size_t i = 0; i < MaxLanes(d); ++i) { - const size_t bit = size_t{1} << (i & 7); - const size_t idx_byte = i >> 3; - if (mask.bits[i]) { - bits[idx_byte] = static_cast(bits[idx_byte] | bit); - } - } - return MaxLanes(d) > 8 ? 2 : 1; -} - -template -HWY_API size_t CountTrue(D d, MFromD mask) { - size_t count = 0; - for (size_t i = 0; i < MaxLanes(d); ++i) { - count += mask.bits[i] != 0; - } - return count; -} - -template -HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (mask.bits[i] != 0) return i; - } - HWY_DASSERT(false); - return 0; -} - -template -HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (mask.bits[i] != 0) return static_cast(i); - } - return intptr_t{-1}; -} - -template -HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { - for (intptr_t i = static_cast(MaxLanes(d) - 1); i >= 0; i--) { - if (mask.bits[i] != 0) return static_cast(i); - } - HWY_DASSERT(false); - return 0; -} - -template -HWY_API intptr_t FindLastTrue(D d, MFromD mask) { - for (intptr_t i = static_cast(MaxLanes(d) - 1); i >= 0; i--) { - if (mask.bits[i] != 0) return i; - } - return intptr_t{-1}; -} - -// ------------------------------ Compress - -template -struct CompressIsPartition { - enum { value = (sizeof(T) != 1) }; -}; - -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - size_t count = 0; - Vec128 ret; - for (size_t i = 0; i < N; ++i) { - if (mask.bits[i]) { - ret.raw[count++] = v.raw[i]; - } - } - for (size_t i = 0; i < N; ++i) { - if (!mask.bits[i]) { - ret.raw[count++] = v.raw[i]; - } - } - HWY_DASSERT(count == N); - return ret; -} - -// ------------------------------ Expand - -// Could also just allow generic_ops-inl.h to implement these, but use our -// simple implementation below to ensure the test is correct. 
-#ifdef HWY_NATIVE_EXPAND -#undef HWY_NATIVE_EXPAND -#else -#define HWY_NATIVE_EXPAND -#endif - -template -HWY_API Vec128 Expand(Vec128 v, const Mask128 mask) { - size_t in_pos = 0; - Vec128 ret; - for (size_t i = 0; i < N; ++i) { - if (mask.bits[i]) { - ret.raw[i] = v.raw[in_pos++]; - } else { - ret.raw[i] = T(); // zero, also works for float16_t - } - } - return ret; -} - -// ------------------------------ LoadExpand - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { - size_t in_pos = 0; - VFromD ret; - for (size_t i = 0; i < Lanes(d); ++i) { - if (mask.bits[i]) { - ret.raw[i] = unaligned[in_pos++]; - } else { - ret.raw[i] = TFromD(); // zero, also works for float16_t - } - } - return ret; -} - -// ------------------------------ CompressNot -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - size_t count = 0; - Vec128 ret; - for (size_t i = 0; i < N; ++i) { - if (!mask.bits[i]) { - ret.raw[count++] = v.raw[i]; - } - } - for (size_t i = 0; i < N; ++i) { - if (mask.bits[i]) { - ret.raw[count++] = v.raw[i]; - } - } - HWY_DASSERT(count == N); - return ret; -} - -// ------------------------------ CompressBlocksNot -HWY_API Vec128 CompressBlocksNot(Vec128 v, - Mask128 /* m */) { - return v; -} - -// ------------------------------ CompressBits -template -HWY_API Vec128 CompressBits(Vec128 v, - const uint8_t* HWY_RESTRICT bits) { - return Compress(v, LoadMaskBits(Simd(), bits)); -} - -// ------------------------------ CompressStore - -// generic_ops-inl defines the 8-bit versions. -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - size_t count = 0; - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (mask.bits[i]) { - unaligned[count++] = v.raw[i]; - } - } - return count; -} - -// ------------------------------ CompressBlendedStore -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - return CompressStore(v, mask, d, unaligned); -} - -// ------------------------------ CompressBitsStore -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - const MFromD mask = LoadMaskBits(d, bits); - StoreU(Compress(v, mask), d, unaligned); - return CountTrue(d, mask); -} - -// ------------------------------ Additional mask logical operations -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - return mask; -} - -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - using TU = hwy::MakeUnsigned; - - Mask128 result; - TU result_lane_mask{0}; - for (size_t i = 0; i < N; i++) { - result_lane_mask = static_cast(result_lane_mask | mask.bits[i]); - result.bits[i] = result_lane_mask; - } - return result; -} - -template -HWY_API Mask128 SetBeforeFirst(Mask128 mask) { - return Not(SetAtOrAfterFirst(mask)); -} - -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - using TU = hwy::MakeUnsigned; - using TI = hwy::MakeSigned; - - Mask128 result; - TU result_lane_mask = static_cast(~TU{0}); - for (size_t i = 0; i < N; i++) { - const auto curr_lane_mask_bits = mask.bits[i]; - result.bits[i] = static_cast(curr_lane_mask_bits & result_lane_mask); - result_lane_mask = - static_cast(result_lane_mask & - static_cast(-static_cast(mask.bits[i] == 0))); - } - return result; -} - -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { - using TU = hwy::MakeUnsigned; - using TI = hwy::MakeSigned; - - Mask128 result; - TU result_lane_mask = static_cast(~TU{0}); - 
for (size_t i = 0; i < N; i++) { - result.bits[i] = result_lane_mask; - result_lane_mask = - static_cast(result_lane_mask & - static_cast(-static_cast(mask.bits[i] == 0))); - } - return result; -} - -// ------------------------------ WidenMulPairwiseAdd - -template -HWY_API VFromD WidenMulPairwiseAdd(D df32, VBF16 a, VBF16 b) { - const Rebind du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 - // Avoid ZipLower/Upper so this also works on big-endian systems. - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - return Mul(BitCast(df32, ae), BitCast(df32, be)) + - Mul(BitCast(df32, ao), BitCast(df32, bo)); -} - -template -HWY_API VFromD WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) { - using VI32 = VFromD; - // Manual sign extension requires two shifts for even lanes. - const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a))); - const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b))); - const VI32 ao = ShiftRight<16>(BitCast(d32, a)); - const VI32 bo = ShiftRight<16>(BitCast(d32, b)); - return Add(Mul(ae, be), Mul(ao, bo)); -} - -template -HWY_API VFromD WidenMulPairwiseAdd(D du32, VU16 a, VU16 b) { - const auto lo16_mask = Set(du32, 0x0000FFFFu); - - const auto a0 = And(BitCast(du32, a), lo16_mask); - const auto b0 = And(BitCast(du32, b), lo16_mask); - - const auto a1 = ShiftRight<16>(BitCast(du32, a)); - const auto b1 = ShiftRight<16>(BitCast(du32, b)); - - return Add(Mul(a0, b0), Mul(a1, b1)); -} - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) - -template -HWY_API VFromD ReorderWidenMulAccumulate(D df32, VBF16 a, VBF16 b, - const Vec128 sum0, - Vec128& sum1) { - const Rebind du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 - // Avoid ZipLower/Upper so this also works on big-endian systems. - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); -} - -template -HWY_API VFromD ReorderWidenMulAccumulate(D d32, VI16 a, VI16 b, - const Vec128 sum0, - Vec128& sum1) { - using VI32 = VFromD; - // Manual sign extension requires two shifts for even lanes. 
- const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a))); - const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b))); - const VI32 ao = ShiftRight<16>(BitCast(d32, a)); - const VI32 bo = ShiftRight<16>(BitCast(d32, b)); - sum1 = Add(Mul(ao, bo), sum1); - return Add(Mul(ae, be), sum0); -} - -template -HWY_API VFromD ReorderWidenMulAccumulate(D du32, VU16 a, VU16 b, - const Vec128 sum0, - Vec128& sum1) { - using VU32 = VFromD; - const VU32 lo16_mask = Set(du32, uint32_t{0x0000FFFFu}); - const VU32 ae = And(BitCast(du32, a), lo16_mask); - const VU32 be = And(BitCast(du32, b), lo16_mask); - const VU32 ao = ShiftRight<16>(BitCast(du32, a)); - const VU32 bo = ShiftRight<16>(BitCast(du32, b)); - sum1 = Add(Mul(ao, bo), sum1); - return Add(Mul(ae, be), sum0); -} - -// ------------------------------ RearrangeToOddPlusEven -template -HWY_API VW RearrangeToOddPlusEven(VW sum0, VW sum1) { - return Add(sum0, sum1); -} - -// ================================================== REDUCTIONS - -template > -HWY_API VFromD SumOfLanes(D d, VFromD v) { - T sum = T{0}; - for (size_t i = 0; i < MaxLanes(d); ++i) { - sum += v.raw[i]; - } - return Set(d, sum); -} -template > -HWY_API T ReduceSum(D d, VFromD v) { - T sum = T{0}; - for (size_t i = 0; i < MaxLanes(d); ++i) { - sum += v.raw[i]; - } - return sum; -} -template > -HWY_API VFromD MinOfLanes(D d, VFromD v) { - T min = HighestValue(); - for (size_t i = 0; i < MaxLanes(d); ++i) { - min = HWY_MIN(min, v.raw[i]); - } - return Set(d, min); -} -template > -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - T max = LowestValue(); - for (size_t i = 0; i < MaxLanes(d); ++i) { - max = HWY_MAX(max, v.raw[i]); - } - return Set(d, max); -} - -// ================================================== OPS WITH DEPENDENCIES - -// ------------------------------ MulEven/Odd 64x64 (UpperHalf) - -HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) { - alignas(16) uint64_t mul[2]; - mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); - return Load(Full128(), mul); -} - -HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) { - alignas(16) uint64_t mul[2]; - const Half> d2; - mul[0] = - Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); - return Load(Full128(), mul); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/generic_ops-inl.h b/deps/highway/include/hwy/ops/generic_ops-inl.h deleted file mode 100644 index c0e8caa9..00000000 --- a/deps/highway/include/hwy/ops/generic_ops-inl.h +++ /dev/null @@ -1,4596 +0,0 @@ -// Copyright 2021 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Target-independent types/functions defined after target-specific ops. - -#include "hwy/base.h" - -// Define detail::Shuffle1230 etc, but only when viewing the current header; -// normally this is included via highway.h, which includes ops/*.h. 
-#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED) -#include "hwy/detect_targets.h" -#include "hwy/ops/emu128-inl.h" -#endif // HWY_IDE - -// Relies on the external include guard in highway.h. -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -// The lane type of a vector type, e.g. float for Vec>. -template -using LaneType = decltype(GetLane(V())); - -// Vector type, e.g. Vec128 for CappedTag. Useful as the return -// type of functions that do not take a vector argument, or as an argument type -// if the function only has a template argument for D, or for explicit type -// names instead of auto. This may be a built-in type. -template -using Vec = decltype(Zero(D())); - -// Mask type. Useful as the return type of functions that do not take a mask -// argument, or as an argument type if the function only has a template argument -// for D, or for explicit type names instead of auto. -template -using Mask = decltype(MaskFromVec(Zero(D()))); - -// Returns the closest value to v within [lo, hi]. -template -HWY_API V Clamp(const V v, const V lo, const V hi) { - return Min(Max(lo, v), hi); -} - -// CombineShiftRightBytes (and -Lanes) are not available for the scalar target, -// and RVV has its own implementation of -Lanes. -#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV - -template -HWY_API VFromD CombineShiftRightLanes(D d, VFromD hi, VFromD lo) { - constexpr size_t kBytes = kLanes * sizeof(TFromD); - static_assert(kBytes < 16, "Shift count is per-block"); - return CombineShiftRightBytes(d, hi, lo); -} - -#endif - -// Returns lanes with the most significant bit set and all other bits zero. -template -HWY_API Vec SignBit(D d) { - const RebindToUnsigned du; - return BitCast(d, Set(du, SignMask>())); -} - -// Returns quiet NaN. -template -HWY_API Vec NaN(D d) { - const RebindToSigned di; - // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus - // mantissa MSB (to indicate quiet) would be sufficient. - return BitCast(d, Set(di, LimitsMax>())); -} - -// Returns positive infinity. -template -HWY_API Vec Inf(D d) { - const RebindToUnsigned du; - using T = TFromD; - using TU = TFromD; - const TU max_x2 = static_cast(MaxExponentTimes2()); - return BitCast(d, Set(du, max_x2 >> 1)); -} - -// ------------------------------ ZeroExtendResizeBitCast - -// The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 -// target is in emu128-inl.h, and the implementation of -// detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h -#if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR -namespace detail { - -#if HWY_HAVE_SCALABLE -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag /* from_size_tag */, - hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, - VFromD v) { - const Repartition d_to_u8; - const auto resized = ResizeBitCast(d_to_u8, v); - // Zero the upper bytes which were not present/valid in d_from. 
- const size_t num_bytes = Lanes(Repartition()); - return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized)); -} -#else // target that uses fixed-size vectors -// Truncating or same-size resizing cast: same as ResizeBitCast -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag /* from_size_tag */, - hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, - VFromD v) { - return ResizeBitCast(d_to, v); -} - -// Resizing cast to vector that has twice the number of lanes of the source -// vector -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag /* from_size_tag */, - hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, - VFromD v) { - const Twice dt_from; - return BitCast(d_to, ZeroExtendVector(dt_from, v)); -} - -// Resizing cast to vector that has more than twice the number of lanes of the -// source vector -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag /* from_size_tag */, - hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, - VFromD v) { - using TFrom = TFromD; - constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom); - const Repartition d_resize_to; - return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes), - ResizeBitCast(d_resize_to, v))); -} -#endif // HWY_HAVE_SCALABLE - -} // namespace detail -#endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR - -template -HWY_API VFromD ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, - VFromD v) { - return detail::ZeroExtendResizeBitCast(hwy::SizeTag(), - hwy::SizeTag(), d_to, - d_from, v); -} - -// ------------------------------ SafeFillN - -template > -HWY_API void SafeFillN(const size_t num, const T value, D d, - T* HWY_RESTRICT to) { -#if HWY_MEM_OPS_MIGHT_FAULT - (void)d; - for (size_t i = 0; i < num; ++i) { - to[i] = value; - } -#else - BlendedStore(Set(d, value), FirstN(d, num), d, to); -#endif -} - -// ------------------------------ SafeCopyN - -template > -HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from, - T* HWY_RESTRICT to) { -#if HWY_MEM_OPS_MIGHT_FAULT - (void)d; - for (size_t i = 0; i < num; ++i) { - to[i] = from[i]; - } -#else - const Mask mask = FirstN(d, num); - BlendedStore(MaskedLoad(mask, d, from), mask, d, to); -#endif -} - -// ------------------------------ BitwiseIfThenElse -#if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#else -#define HWY_NATIVE_BITWISE_IF_THEN_ELSE -#endif - -template -HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { - return Or(And(mask, yes), AndNot(mask, no)); -} - -#endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE - -// "Include guard": skip if native instructions are available. The generic -// implementation is currently shared between x86_* and wasm_*, and is too large -// to duplicate. 
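Illustrative sketch (not from the original header): the interleaved loads/stores guarded below are typically used to split packed structure-of-arrays data, such as RGB bytes, into one vector per channel and repack them afterwards. Function and parameter names here are illustrative:

#include <stddef.h>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Deinterleaves RGB triplets, leaving room for per-channel processing,
// then reinterleaves them; a scalar loop (omitted) would handle the tail.
void DemoInterleaved3(const uint8_t* rgb, uint8_t* out, size_t num_pixels) {
  const hn::ScalableTag<uint8_t> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= num_pixels; i += N) {
    hn::Vec<decltype(d)> r, g, b;
    hn::LoadInterleaved3(d, rgb + 3 * i, r, g, b);
    hn::StoreInterleaved3(r, g, b, d, out + 3 * i);
  }
}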
- -#if HWY_IDE || \ - (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#else -#define HWY_NATIVE_LOAD_STORE_INTERLEAVED -#endif - -// ------------------------------ LoadInterleaved2 - -template -HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1) { - const VFromD A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] - const VFromD B = LoadU(d, unaligned + Lanes(d)); - v0 = ConcatEven(d, B, A); - v1 = ConcatOdd(d, B, A); -} - -template -HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1) { - v0 = LoadU(d, unaligned + 0); - v1 = LoadU(d, unaligned + 1); -} - -// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes) - -namespace detail { - -#if HWY_IDE -template -HWY_INLINE V ShuffleTwo1230(V a, V /* b */) { - return a; -} -template -HWY_INLINE V ShuffleTwo2301(V a, V /* b */) { - return a; -} -template -HWY_INLINE V ShuffleTwo3012(V a, V /* b */) { - return a; -} -#endif // HWY_IDE - -// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. -template -HWY_INLINE void LoadTransposedBlocks3(D d, - const TFromD* HWY_RESTRICT unaligned, - VFromD& A, VFromD& B, - VFromD& C) { - constexpr size_t kN = MaxLanes(d); - A = LoadU(d, unaligned + 0 * kN); - B = LoadU(d, unaligned + 1 * kN); - C = LoadU(d, unaligned + 2 * kN); -} - -} // namespace detail - -template -HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - const RebindToUnsigned du; - using V = VFromD; - // Compact notation so these fit on one line: 12 := v1[2]. - V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00 - V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15 - V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a - detail::LoadTransposedBlocks3(d, unaligned, A, B, C); - // Compress all lanes belonging to v0 into consecutive lanes. 
- constexpr uint8_t Z = 0x80; - alignas(16) static constexpr uint8_t kIdx_v0A[16] = { - 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v0B[16] = { - Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v0C[16] = { - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13}; - alignas(16) static constexpr uint8_t kIdx_v1A[16] = { - 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v1B[16] = { - Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v1C[16] = { - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14}; - alignas(16) static constexpr uint8_t kIdx_v2A[16] = { - 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v2B[16] = { - Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v2C[16] = { - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15}; - const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A))); - const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B))); - const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C))); - const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A))); - const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B))); - const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C))); - const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A))); - const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B))); - const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C))); - v0 = Xor3(v0L, v0M, v0U); - v1 = Xor3(v1L, v1M, v1U); - v2 = Xor3(v2L, v2M, v2U); -} - -// 8-bit lanes x8 -template -HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - const RebindToUnsigned du; - using V = VFromD; - V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] - V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] - V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] - detail::LoadTransposedBlocks3(d, unaligned, A, B, C); - // Compress all lanes belonging to v0 into consecutive lanes. 
- constexpr uint8_t Z = 0x80; - alignas(16) static constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5}; - alignas(16) static constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6}; - alignas(16) static constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7}; - const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A))); - const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B))); - const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C))); - const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A))); - const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B))); - const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C))); - const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A))); - const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B))); - const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C))); - v0 = Xor3(v0L, v0M, v0U); - v1 = Xor3(v1L, v1M, v1U); - v2 = Xor3(v2L, v2M, v2U); -} - -// 16-bit lanes x8 -template -HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - const RebindToUnsigned du; - const Repartition du8; - using V = VFromD; - V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] - V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] - V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] - detail::LoadTransposedBlocks3(d, unaligned, A, B, C); - // Compress all lanes belonging to v0 into consecutive lanes. Same as above, - // but each element of the array contains a byte index for a byte of a lane. 
- constexpr uint8_t Z = 0x80; - alignas(16) static constexpr uint8_t kIdx_v0A[16] = { - 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v0B[16] = { - Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v0C[16] = { - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B}; - alignas(16) static constexpr uint8_t kIdx_v1A[16] = { - 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v1B[16] = { - Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v1C[16] = { - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D}; - alignas(16) static constexpr uint8_t kIdx_v2A[16] = { - 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v2B[16] = { - Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z}; - alignas(16) static constexpr uint8_t kIdx_v2C[16] = { - Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F}; - const V v0L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v0A))); - const V v0M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v0B))); - const V v0U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v0C))); - const V v1L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v1A))); - const V v1M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v1B))); - const V v1U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v1C))); - const V v2L = TableLookupBytesOr0(A, BitCast(d, LoadDup128(du8, kIdx_v2A))); - const V v2M = TableLookupBytesOr0(B, BitCast(d, LoadDup128(du8, kIdx_v2B))); - const V v2U = TableLookupBytesOr0(C, BitCast(d, LoadDup128(du8, kIdx_v2C))); - v0 = Xor3(v0L, v0M, v0U); - v1 = Xor3(v1L, v1M, v1U); - v2 = Xor3(v2L, v2M, v2U); -} - -template -HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - using V = VFromD; - V A; // v0[1] v2[0] v1[0] v0[0] - V B; // v1[2] v0[2] v2[1] v1[1] - V C; // v2[3] v1[3] v0[3] v2[2] - detail::LoadTransposedBlocks3(d, unaligned, A, B, C); - - const V vxx_02_03_xx = OddEven(C, B); - v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx); - - // Shuffle2301 takes the upper/lower halves of the output from one input, so - // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use - // OddEven because it may have higher throughput than Shuffle. - const V vxx_xx_10_11 = OddEven(A, B); - const V v12_13_xx_xx = OddEven(B, C); - v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx); - - const V vxx_20_21_xx = OddEven(B, A); - v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C); -} - -template -HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - VFromD A; // v1[0] v0[0] - VFromD B; // v0[1] v2[0] - VFromD C; // v2[1] v1[1] - detail::LoadTransposedBlocks3(d, unaligned, A, B, C); - v0 = OddEven(B, A); - v1 = CombineShiftRightBytes)>(d, C, A); - v2 = OddEven(C, B); -} - -template , HWY_IF_LANES_D(D, 1)> -HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - v0 = LoadU(d, unaligned + 0); - v1 = LoadU(d, unaligned + 1); - v2 = LoadU(d, unaligned + 2); -} - -// ------------------------------ LoadInterleaved4 - -namespace detail { - -// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. 
-template -HWY_INLINE void LoadTransposedBlocks4(D d, - const TFromD* HWY_RESTRICT unaligned, - VFromD& vA, VFromD& vB, - VFromD& vC, VFromD& vD) { - constexpr size_t kN = MaxLanes(d); - vA = LoadU(d, unaligned + 0 * kN); - vB = LoadU(d, unaligned + 1 * kN); - vC = LoadU(d, unaligned + 2 * kN); - vD = LoadU(d, unaligned + 3 * kN); -} - -} // namespace detail - -template -HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - const Repartition d64; - using V64 = VFromD; - using V = VFromD; - // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD. - // Here int[i] means the four interleaved values of the i-th 4-tuple and - // int[3..0] indicates four consecutive 4-tuples (0 = least-significant). - V vA; // int[13..10] int[3..0] - V vB; // int[17..14] int[7..4] - V vC; // int[1b..18] int[b..8] - V vD; // int[1f..1c] int[f..c] - detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); - - // For brevity, the comments only list the lower block (upper = lower + 0x10) - const V v5140 = InterleaveLower(d, vA, vB); // int[5,1,4,0] - const V vd9c8 = InterleaveLower(d, vC, vD); // int[d,9,c,8] - const V v7362 = InterleaveUpper(d, vA, vB); // int[7,3,6,2] - const V vfbea = InterleaveUpper(d, vC, vD); // int[f,b,e,a] - - const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0] - const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8] - const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1] - const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9] - - const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0] - const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8] - const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0] - const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8] - - v0 = BitCast(d, InterleaveLower(d64, v10L, v10U)); - v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U)); - v2 = BitCast(d, InterleaveLower(d64, v32L, v32U)); - v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U)); -} - -template -HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - // In the last step, we interleave by half of the block size, which is usually - // 8 bytes but half that for 8-bit x8 vectors. - using TW = hwy::UnsignedFromSize; - const Repartition dw; - using VW = VFromD; - - // (Comments are for 256-bit vectors.) - // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD. 
- VFromD vA; // v3210[9]v3210[8] v3210[1]v3210[0] - VFromD vB; // v3210[b]v3210[a] v3210[3]v3210[2] - VFromD vC; // v3210[d]v3210[c] v3210[5]v3210[4] - VFromD vD; // v3210[f]v3210[e] v3210[7]v3210[6] - detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); - - const VFromD va820 = InterleaveLower(d, vA, vB); // v3210[a,8] v3210[2,0] - const VFromD vec64 = InterleaveLower(d, vC, vD); // v3210[e,c] v3210[6,4] - const VFromD vb931 = InterleaveUpper(d, vA, vB); // v3210[b,9] v3210[3,1] - const VFromD vfd75 = InterleaveUpper(d, vC, vD); // v3210[f,d] v3210[7,5] - - const VW v10_b830 = // v10[b..8] v10[3..0] - BitCast(dw, InterleaveLower(d, va820, vb931)); - const VW v10_fc74 = // v10[f..c] v10[7..4] - BitCast(dw, InterleaveLower(d, vec64, vfd75)); - const VW v32_b830 = // v32[b..8] v32[3..0] - BitCast(dw, InterleaveUpper(d, va820, vb931)); - const VW v32_fc74 = // v32[f..c] v32[7..4] - BitCast(dw, InterleaveUpper(d, vec64, vfd75)); - - v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74)); - v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74)); - v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74)); - v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74)); -} - -template -HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - using V = VFromD; - V vA; // v3210[4] v3210[0] - V vB; // v3210[5] v3210[1] - V vC; // v3210[6] v3210[2] - V vD; // v3210[7] v3210[3] - detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); - const V v10e = InterleaveLower(d, vA, vC); // v1[6,4] v0[6,4] v1[2,0] v0[2,0] - const V v10o = InterleaveLower(d, vB, vD); // v1[7,5] v0[7,5] v1[3,1] v0[3,1] - const V v32e = InterleaveUpper(d, vA, vC); // v3[6,4] v2[6,4] v3[2,0] v2[2,0] - const V v32o = InterleaveUpper(d, vB, vD); // v3[7,5] v2[7,5] v3[3,1] v2[3,1] - - v0 = InterleaveLower(d, v10e, v10o); - v1 = InterleaveUpper(d, v10e, v10o); - v2 = InterleaveLower(d, v32e, v32o); - v3 = InterleaveUpper(d, v32e, v32o); -} - -template -HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - VFromD vA, vB, vC, vD; - detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); - v0 = InterleaveLower(d, vA, vC); - v1 = InterleaveUpper(d, vA, vC); - v2 = InterleaveLower(d, vB, vD); - v3 = InterleaveUpper(d, vB, vD); -} - -// Any T x1 -template , HWY_IF_LANES_D(D, 1)> -HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - v0 = LoadU(d, unaligned + 0); - v1 = LoadU(d, unaligned + 1); - v2 = LoadU(d, unaligned + 2); - v3 = LoadU(d, unaligned + 3); -} - -// ------------------------------ StoreInterleaved2 - -namespace detail { - -// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. -template -HWY_INLINE void StoreTransposedBlocks2(VFromD A, VFromD B, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t kN = MaxLanes(d); - StoreU(A, d, unaligned + 0 * kN); - StoreU(B, d, unaligned + 1 * kN); -} - -} // namespace detail - -// >= 128 bit vector -template -HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, - TFromD* HWY_RESTRICT unaligned) { - const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0] - const auto v10U = InterleaveUpper(d, v0, v1); // .. 
v1[kN/2] v0[kN/2] - detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned); -} - -// <= 64 bits -template -HWY_API void StoreInterleaved2(V part0, V part1, D d, - TFromD* HWY_RESTRICT unaligned) { - const Twice d2; - const auto v0 = ZeroExtendVector(d2, part0); - const auto v1 = ZeroExtendVector(d2, part1); - const auto v10 = InterleaveLower(d2, v0, v1); - StoreU(v10, d2, unaligned); -} - -// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, -// TableLookupBytes) - -namespace detail { - -// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. -template -HWY_INLINE void StoreTransposedBlocks3(VFromD A, VFromD B, VFromD C, - D d, TFromD* HWY_RESTRICT unaligned) { - constexpr size_t kN = MaxLanes(d); - StoreU(A, d, unaligned + 0 * kN); - StoreU(B, d, unaligned + 1 * kN); - StoreU(C, d, unaligned + 2 * kN); -} - -} // namespace detail - -// >= 128-bit vector, 8-bit lanes -template -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - using TU = TFromD; - const auto k5 = Set(du, TU{5}); - const auto k6 = Set(du, TU{6}); - - // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): - // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes - // to their place, with 0x80 so lanes to be filled from other vectors are 0 - // to enable blending by ORing together. - alignas(16) static constexpr uint8_t tbl_v0[16] = { - 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // - 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; - alignas(16) static constexpr uint8_t tbl_v1[16] = { - 0x80, 0, 0x80, 0x80, 1, 0x80, // - 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; - // The interleaved vectors will be named A, B, C; temporaries with suffix - // 0..2 indicate which input vector's lanes they hold. - const auto shuf_A0 = LoadDup128(du, tbl_v0); - const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5) - const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); - const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 - const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. - const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. - const VFromD A = BitCast(d, A0 | A1 | A2); - - // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5] - const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6.. - const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5 - const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5. - const auto B0 = TableLookupBytesOr0(v0, shuf_B0); - const auto B1 = TableLookupBytesOr0(v1, shuf_B1); - const auto B2 = TableLookupBytesOr0(v2, shuf_B2); - const VFromD B = BitCast(d, B0 | B1 | B2); - - // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10] - const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B. - const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B.. 
- const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A - const auto C0 = TableLookupBytesOr0(v0, shuf_C0); - const auto C1 = TableLookupBytesOr0(v1, shuf_C1); - const auto C2 = TableLookupBytesOr0(v2, shuf_C2); - const VFromD C = BitCast(d, C0 | C1 | C2); - - detail::StoreTransposedBlocks3(A, B, C, d, unaligned); -} - -// >= 128-bit vector, 16-bit lanes -template -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - TFromD* HWY_RESTRICT unaligned) { - const Repartition du8; - const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); - const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); - - // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): - // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be - // filled from other vectors are 0 for blending. Note that these are byte - // indices for 16-bit lanes. - alignas(16) static constexpr uint8_t tbl_v1[16] = { - 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, - 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; - alignas(16) static constexpr uint8_t tbl_v2[16] = { - 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, - 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; - - // The interleaved vectors will be named A, B, C; temporaries with suffix - // 0..2 indicate which input vector's lanes they hold. - const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0. - // .2..1..0 - const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); - const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0.. - - const auto A0 = TableLookupBytesOr0(v0, shuf_A0); - const auto A1 = TableLookupBytesOr0(v1, shuf_A1); - const auto A2 = TableLookupBytesOr0(v2, shuf_A2); - const VFromD A = BitCast(d, A0 | A1 | A2); - - // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2] - const auto shuf_B0 = shuf_A1 + k3; // 5..4..3. - const auto shuf_B1 = shuf_A2 + k3; // ..4..3.. - const auto shuf_B2 = shuf_A0 + k2; // .4..3..2 - const auto B0 = TableLookupBytesOr0(v0, shuf_B0); - const auto B1 = TableLookupBytesOr0(v1, shuf_B1); - const auto B2 = TableLookupBytesOr0(v2, shuf_B2); - const VFromD B = BitCast(d, B0 | B1 | B2); - - // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] - const auto shuf_C0 = shuf_B1 + k3; // ..7..6.. - const auto shuf_C1 = shuf_B2 + k3; // .7..6..5 - const auto shuf_C2 = shuf_B0 + k2; // 7..6..5. - const auto C0 = TableLookupBytesOr0(v0, shuf_C0); - const auto C1 = TableLookupBytesOr0(v1, shuf_C1); - const auto C2 = TableLookupBytesOr0(v2, shuf_C2); - const VFromD C = BitCast(d, C0 | C1 | C2); - - detail::StoreTransposedBlocks3(A, B, C, d, unaligned); -} - -// >= 128-bit vector, 32-bit lanes -template -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - TFromD* HWY_RESTRICT unaligned) { - const RepartitionToWide dw; - - const VFromD v10_v00 = InterleaveLower(d, v0, v1); - const VFromD v01_v20 = OddEven(v0, v2); - // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0) - const VFromD A = BitCast( - d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20))); - - const VFromD v1_321 = ShiftRightLanes<1>(d, v1); - const VFromD v0_32 = ShiftRightLanes<2>(d, v0); - const VFromD v21_v11 = OddEven(v2, v1_321); - const VFromD v12_v02 = OddEven(v1_321, v0_32); - // B: v1[2],v0[2], v2[1],v1[1] - const VFromD B = BitCast( - d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02))); - - // Notation refers to the upper 2 lanes of the vector for InterleaveUpper. 
- const VFromD v23_v13 = OddEven(v2, v1_321); - const VFromD v03_v22 = OddEven(v0, v2); - // C: v2[3],v1[3],v0[3], v2[2] - const VFromD C = BitCast( - d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13))); - - detail::StoreTransposedBlocks3(A, B, C, d, unaligned); -} - -// >= 128-bit vector, 64-bit lanes -template -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - TFromD* HWY_RESTRICT unaligned) { - const VFromD A = InterleaveLower(d, v0, v1); - const VFromD B = OddEven(v0, v2); - const VFromD C = InterleaveUpper(d, v1, v2); - detail::StoreTransposedBlocks3(A, B, C, d, unaligned); -} - -// 64-bit vector, 8-bit lanes -template -HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, - VFromD part2, D d, - TFromD* HWY_RESTRICT unaligned) { - // Use full vectors for the shuffles and first result. - constexpr size_t kFullN = 16 / sizeof(TFromD); - const Full128 du; - const Full128> d_full; - const auto k5 = Set(du, uint8_t{5}); - const auto k6 = Set(du, uint8_t{6}); - - const VFromD v0{part0.raw}; - const VFromD v1{part1.raw}; - const VFromD v2{part2.raw}; - - // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): - // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be - // filled from other vectors are 0 for blending. - alignas(16) static constexpr uint8_t tbl_v0[16] = { - 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // - 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; - alignas(16) static constexpr uint8_t tbl_v1[16] = { - 0x80, 0, 0x80, 0x80, 1, 0x80, // - 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; - // The interleaved vectors will be named A, B, C; temporaries with suffix - // 0..2 indicate which input vector's lanes they hold. - const auto shuf_A0 = Load(du, tbl_v0); - const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB) - const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); - const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 - const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. - const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. - const auto A = BitCast(d_full, A0 | A1 | A2); - StoreU(A, d_full, unaligned + 0 * kFullN); - - // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] - const auto shuf_B0 = shuf_A2 + k6; // ..7..6.. - const auto shuf_B1 = shuf_A0 + k5; // .7..6..5 - const auto shuf_B2 = shuf_A1 + k5; // 7..6..5. - const auto B0 = TableLookupBytesOr0(v0, shuf_B0); - const auto B1 = TableLookupBytesOr0(v1, shuf_B1); - const auto B2 = TableLookupBytesOr0(v2, shuf_B2); - const VFromD B{BitCast(d_full, B0 | B1 | B2).raw}; - StoreU(B, d, unaligned + 1 * kFullN); -} - -// 64-bit vector, 16-bit lanes -template -HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, - VFromD part2, D dh, - TFromD* HWY_RESTRICT unaligned) { - const Twice d_full; - const Full128 du8; - const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); - const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); - - const VFromD v0{part0.raw}; - const VFromD v1{part1.raw}; - const VFromD v2{part2.raw}; - - // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right): - // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes - // to their place, with 0x80 so lanes to be filled from other vectors are 0 - // to enable blending by ORing together. 
- alignas(16) static constexpr uint8_t tbl_v1[16] = { - 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, - 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; - alignas(16) static constexpr uint8_t tbl_v2[16] = { - 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, - 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; - - // The interleaved vectors will be named A, B; temporaries with suffix - // 0..2 indicate which input vector's lanes they hold. - const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0. - // .2..1..0 - const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); - const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0.. - - const auto A0 = TableLookupBytesOr0(v0, shuf_A0); - const auto A1 = TableLookupBytesOr0(v1, shuf_A1); - const auto A2 = TableLookupBytesOr0(v2, shuf_A2); - const VFromD A = BitCast(d_full, A0 | A1 | A2); - StoreU(A, d_full, unaligned); - - // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2] - const auto shuf_B0 = shuf_A1 + k3; // ..3. - const auto shuf_B1 = shuf_A2 + k3; // .3.. - const auto shuf_B2 = shuf_A0 + k2; // 3..2 - const auto B0 = TableLookupBytesOr0(v0, shuf_B0); - const auto B1 = TableLookupBytesOr0(v1, shuf_B1); - const auto B2 = TableLookupBytesOr0(v2, shuf_B2); - const VFromD B = BitCast(d_full, B0 | B1 | B2); - StoreU(VFromD{B.raw}, dh, unaligned + MaxLanes(d_full)); -} - -// 64-bit vector, 32-bit lanes -template -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - TFromD* HWY_RESTRICT unaligned) { - // (same code as 128-bit vector, 64-bit lanes) - const VFromD v10_v00 = InterleaveLower(d, v0, v1); - const VFromD v01_v20 = OddEven(v0, v2); - const VFromD v21_v11 = InterleaveUpper(d, v1, v2); - constexpr size_t kN = MaxLanes(d); - StoreU(v10_v00, d, unaligned + 0 * kN); - StoreU(v01_v20, d, unaligned + 1 * kN); - StoreU(v21_v11, d, unaligned + 2 * kN); -} - -// 64-bit lanes are handled by the N=1 case below. - -// <= 32-bit vector, 8-bit lanes -template -HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, - VFromD part2, D d, - TFromD* HWY_RESTRICT unaligned) { - // Use full vectors for the shuffles and result. - const Full128 du; - const Full128> d_full; - - const VFromD v0{part0.raw}; - const VFromD v1{part1.raw}; - const VFromD v2{part2.raw}; - - // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 - // so lanes to be filled from other vectors are 0 to enable blending by ORing - // together. - alignas(16) static constexpr uint8_t tbl_v0[16] = { - 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, - 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - // The interleaved vector will be named A; temporaries with suffix - // 0..2 indicate which input vector's lanes they hold. - const auto shuf_A0 = Load(du, tbl_v0); - const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0); - const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0); - const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0 - const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0. - const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0.. - const VFromD A = BitCast(d_full, A0 | A1 | A2); - alignas(16) TFromD buf[MaxLanes(d_full)]; - StoreU(A, d_full, buf); - CopyBytes(buf, unaligned); -} - -// 32-bit vector, 16-bit lanes -template -HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, - VFromD part2, D d, - TFromD* HWY_RESTRICT unaligned) { - // Use full vectors for the shuffles and result. 
- const Full128 du8; - const Full128> d_full; - - const VFromD v0{part0.raw}; - const VFromD v1{part1.raw}; - const VFromD v2{part2.raw}; - - // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 - // so lanes to be filled from other vectors are 0 to enable blending by ORing - // together. - alignas(16) static constexpr uint8_t tbl_v2[16] = { - 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, - 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; - // The interleaved vector will be named A; temporaries with suffix - // 0..2 indicate which input vector's lanes they hold. - const auto shuf_A2 = // ..1..0.. - Load(du8, tbl_v2); - const auto shuf_A1 = // ...1..0. - CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); - const auto shuf_A0 = // ....1..0 - CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); - const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0 - const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0. - const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0.. - const auto A = BitCast(d_full, A0 | A1 | A2); - alignas(16) TFromD buf[MaxLanes(d_full)]; - StoreU(A, d_full, buf); - CopyBytes(buf, unaligned); -} - -// Single-element vector, any lane size: just store directly -template -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - TFromD* HWY_RESTRICT unaligned) { - StoreU(v0, d, unaligned + 0); - StoreU(v1, d, unaligned + 1); - StoreU(v2, d, unaligned + 2); -} - -// ------------------------------ StoreInterleaved4 - -namespace detail { - -// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. -template -HWY_INLINE void StoreTransposedBlocks4(VFromD vA, VFromD vB, VFromD vC, - VFromD vD, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t kN = MaxLanes(d); - StoreU(vA, d, unaligned + 0 * kN); - StoreU(vB, d, unaligned + 1 * kN); - StoreU(vC, d, unaligned + 2 * kN); - StoreU(vD, d, unaligned + 3 * kN); -} - -} // namespace detail - -// >= 128-bit vector, 8..32-bit lanes -template -HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, - VFromD v3, D d, - TFromD* HWY_RESTRICT unaligned) { - const RepartitionToWide dw; - const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0] - const auto v32L = ZipLower(dw, v2, v3); - const auto v10U = ZipUpper(dw, v0, v1); - const auto v32U = ZipUpper(dw, v2, v3); - // The interleaved vectors are vA, vB, vC, vD. - const VFromD vA = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210 - const VFromD vB = BitCast(d, InterleaveUpper(dw, v10L, v32L)); - const VFromD vC = BitCast(d, InterleaveLower(dw, v10U, v32U)); - const VFromD vD = BitCast(d, InterleaveUpper(dw, v10U, v32U)); - detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); -} - -// >= 128-bit vector, 64-bit lanes -template -HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, - VFromD v3, D d, - TFromD* HWY_RESTRICT unaligned) { - // The interleaved vectors are vA, vB, vC, vD. - const VFromD vA = InterleaveLower(d, v0, v1); // v1[0] v0[0] - const VFromD vB = InterleaveLower(d, v2, v3); - const VFromD vC = InterleaveUpper(d, v0, v1); - const VFromD vD = InterleaveUpper(d, v2, v3); - detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); -} - -// 64-bit vector, 8..32-bit lanes -template -HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, - VFromD part2, VFromD part3, D /* tag */, - TFromD* HWY_RESTRICT unaligned) { - // Use full vectors to reduce the number of stores. 
- const Full128> d_full; - const RepartitionToWide dw; - const VFromD v0{part0.raw}; - const VFromD v1{part1.raw}; - const VFromD v2{part2.raw}; - const VFromD v3{part3.raw}; - const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0] - const auto v32 = ZipLower(dw, v2, v3); - const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32)); - const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32)); - StoreU(A, d_full, unaligned); - StoreU(B, d_full, unaligned + MaxLanes(d_full)); -} - -// 64-bit vector, 64-bit lane -template -HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, - VFromD part2, VFromD part3, D /* tag */, - TFromD* HWY_RESTRICT unaligned) { - // Use full vectors to reduce the number of stores. - const Full128> d_full; - const VFromD v0{part0.raw}; - const VFromD v1{part1.raw}; - const VFromD v2{part2.raw}; - const VFromD v3{part3.raw}; - const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0] - const auto B = InterleaveLower(d_full, v2, v3); - StoreU(A, d_full, unaligned); - StoreU(B, d_full, unaligned + MaxLanes(d_full)); -} - -// <= 32-bit vectors -template -HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, - VFromD part2, VFromD part3, D d, - TFromD* HWY_RESTRICT unaligned) { - // Use full vectors to reduce the number of stores. - const Full128> d_full; - const RepartitionToWide dw; - const VFromD v0{part0.raw}; - const VFromD v1{part1.raw}; - const VFromD v2{part2.raw}; - const VFromD v3{part3.raw}; - const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0] - const auto v32 = ZipLower(dw, v2, v3); - const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32)); - alignas(16) TFromD buf[MaxLanes(d_full)]; - StoreU(v3210, d_full, buf); - CopyBytes(buf, unaligned); -} - -#endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED - -// ------------------------------ LoadN -#if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_LOAD_N -#undef HWY_NATIVE_LOAD_N -#else -#define HWY_NATIVE_LOAD_N -#endif - -#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE -namespace detail { - -template -HWY_INLINE VFromD LoadNResizeBitCast(DTo d_to, DFrom d_from, - VFromD v) { -#if HWY_TARGET <= HWY_SSE2 - // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw - // past the first (lowest-index) Lanes(d_from) lanes of v.raw if - // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true - (void)d_from; - return ResizeBitCast(d_to, v); -#else - // On other targets such as PPC/NEON, the contents of any lanes past the first - // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if - // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true. - return ZeroExtendResizeBitCast(d_to, d_from, v); -#endif -} - -} // namespace detail - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - return (max_lanes_to_load > 0) ? LoadU(d, p) : Zero(d); -} - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - const FixedTag, 1> d1; - - if (max_lanes_to_load >= 2) { - return LoadU(d, p); - } else { - return (max_lanes_to_load == 1) - ? detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)) - : Zero(d); - } -} - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - const FixedTag, 2> d2; - const Half d1; - - if (max_lanes_to_load <= 1) - return (max_lanes_to_load == 1) - ? 
detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)) - : Zero(d); - else if (max_lanes_to_load > 3) - return LoadU(d, p); - - const auto v_lo = LoadU(d2, p); - if (max_lanes_to_load == 3) { - return Combine(d, detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + 2)), - v_lo); - } else { - return detail::LoadNResizeBitCast(d, d2, v_lo); - } -} - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - const FixedTag, 4> d4; - const Half d2; - const Half d1; - - if (max_lanes_to_load <= 1) - return (max_lanes_to_load == 1) - ? detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)) - : Zero(d); - else if (max_lanes_to_load >= 8) - return LoadU(d, p); - - const size_t leading_len = max_lanes_to_load & 4; - VFromD v_trailing = Zero(d4); - - if ((max_lanes_to_load & 2) != 0) { - const auto v_trailing_lo2 = LoadU(d2, p + leading_len); - if ((max_lanes_to_load & 1) != 0) { - v_trailing = Combine( - d4, - detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)), - v_trailing_lo2); - } else { - v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2); - } - } else if ((max_lanes_to_load & 1) != 0) { - v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len)); - } - - if (leading_len != 0) { - return Combine(d, v_trailing, LoadU(d4, p)); - } else { - return detail::LoadNResizeBitCast(d, d4, v_trailing); - } -} - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - const FixedTag, 8> d8; - const Half d4; - const Half d2; - const Half d1; - - if (max_lanes_to_load <= 1) - return (max_lanes_to_load == 1) - ? detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)) - : Zero(d); - else if (max_lanes_to_load >= 16) - return LoadU(d, p); - - const size_t leading_len = max_lanes_to_load & 12; - VFromD v_trailing = Zero(d4); - - if ((max_lanes_to_load & 2) != 0) { - const auto v_trailing_lo2 = LoadU(d2, p + leading_len); - if ((max_lanes_to_load & 1) != 0) { - v_trailing = Combine( - d4, - detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)), - v_trailing_lo2); - } else { - v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2); - } - } else if ((max_lanes_to_load & 1) != 0) { - v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len)); - } - - if (leading_len != 0) { - if (leading_len >= 8) { - const auto v_hi7 = ((leading_len & 4) != 0) - ? 
Combine(d8, v_trailing, LoadU(d4, p + 8)) - : detail::LoadNResizeBitCast(d8, d4, v_trailing); - return Combine(d, v_hi7, LoadU(d8, p)); - } else { - return detail::LoadNResizeBitCast(d, d8, - Combine(d8, v_trailing, LoadU(d4, p))); - } - } else { - return detail::LoadNResizeBitCast(d, d4, v_trailing); - } -} - -#if HWY_MAX_BYTES >= 32 -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - const size_t N = Lanes(d); - if (max_lanes_to_load >= N) { - return LoadU(d, p); - } - - const Half dh; - const size_t half_N = Lanes(dh); - if (max_lanes_to_load <= half_N) { - return ZeroExtendVector(d, LoadN(dh, p, max_lanes_to_load)); - } else { - const auto v_lo = LoadU(dh, p); - const auto v_hi = LoadN(dh, p + half_N, max_lanes_to_load - half_N); - return Combine(d, v_hi, v_lo); - } -} -#endif // HWY_MAX_BYTES >= 32 -#else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { -#if HWY_MEM_OPS_MIGHT_FAULT - if (max_lanes_to_load <= 0) return Zero(d); -#endif - - const size_t N = Lanes(d); - return MaskedLoad(FirstN(d, HWY_MIN(max_lanes_to_load, N)), d, p); -} -#endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE - -#endif - -// ------------------------------ StoreN -#if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_STORE_N -#undef HWY_NATIVE_STORE_N -#else -#define HWY_NATIVE_STORE_N -#endif - -#if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE -namespace detail { - -template -HWY_INLINE VFromD StoreNGetUpperHalf(DH dh, VFromD> v) { - constexpr size_t kMinShrVectBytes = - (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8 : 16; - const FixedTag d_shift; - return ResizeBitCast( - dh, ShiftRightBytes(d_shift, ResizeBitCast(d_shift, v))); -} - -template -HWY_INLINE VFromD StoreNGetUpperHalf(DH dh, VFromD> v) { - return UpperHalf(dh, v); -} - -} // namespace detail - -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - if (max_lanes_to_store > 0) { - StoreU(v, d, p); - } -} - -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - if (max_lanes_to_store > 1) { - StoreU(v, d, p); - } else if (max_lanes_to_store == 1) { - const FixedTag, 1> d1; - StoreU(LowerHalf(d1, v), d1, p); - } -} - -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const FixedTag, 2> d2; - const Half d1; - - if (max_lanes_to_store > 1) { - if (max_lanes_to_store >= 4) { - StoreU(v, d, p); - } else { - StoreU(ResizeBitCast(d2, v), d2, p); - if (max_lanes_to_store == 3) { - StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2); - } - } - } else if (max_lanes_to_store == 1) { - StoreU(ResizeBitCast(d1, v), d1, p); - } -} - -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const FixedTag, 4> d4; - const Half d2; - const Half d1; - - if (max_lanes_to_store <= 1) { - if (max_lanes_to_store == 1) { - StoreU(ResizeBitCast(d1, v), d1, p); - } - } else if (max_lanes_to_store >= 8) { - StoreU(v, d, p); - } else if (max_lanes_to_store >= 4) { - StoreU(LowerHalf(d4, v), d4, p); - StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4, - max_lanes_to_store - 4); - } else { - StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store); - } -} - -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const FixedTag, 8> d8; - const Half 
d4; - const Half d2; - const Half d1; - - if (max_lanes_to_store <= 1) { - if (max_lanes_to_store == 1) { - StoreU(ResizeBitCast(d1, v), d1, p); - } - } else if (max_lanes_to_store >= 16) { - StoreU(v, d, p); - } else if (max_lanes_to_store >= 8) { - StoreU(LowerHalf(d8, v), d8, p); - StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8, - max_lanes_to_store - 8); - } else { - StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store); - } -} - -#if HWY_MAX_BYTES >= 32 -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const size_t N = Lanes(d); - if (max_lanes_to_store >= N) { - StoreU(v, d, p); - return; - } - - const Half dh; - const size_t half_N = Lanes(dh); - if (max_lanes_to_store <= half_N) { - StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store); - } else { - StoreU(LowerHalf(dh, v), dh, p); - StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N); - } -} -#endif // HWY_MAX_BYTES >= 32 - -#else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const size_t N = Lanes(d); - const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N); -#if HWY_MEM_OPS_MIGHT_FAULT - if (clamped_max_lanes_to_store == 0) return; -#endif - - BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p); - -#if HWY_MEM_OPS_MIGHT_FAULT - detail::MaybeUnpoison(p, clamped_max_lanes_to_store); -#endif -} -#endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE - -#endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) - -// ------------------------------ Scatter - -#if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_SCATTER -#undef HWY_NATIVE_SCATTER -#else -#define HWY_NATIVE_SCATTER -#endif - -template > -HWY_API void ScatterOffset(VFromD v, D d, T* HWY_RESTRICT base, - VFromD> offset) { - const RebindToSigned di; - using TI = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - - HWY_ALIGN T lanes[MaxLanes(d)]; - Store(v, d, lanes); - - HWY_ALIGN TI offset_lanes[MaxLanes(d)]; - Store(offset, di, offset_lanes); - - uint8_t* base_bytes = reinterpret_cast(base); - for (size_t i = 0; i < MaxLanes(d); ++i) { - CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); - } -} - -template > -HWY_API void ScatterIndex(VFromD v, D d, T* HWY_RESTRICT base, - VFromD> index) { - const RebindToSigned di; - using TI = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - - HWY_ALIGN T lanes[MaxLanes(d)]; - Store(v, d, lanes); - - HWY_ALIGN TI index_lanes[MaxLanes(d)]; - Store(index, di, index_lanes); - - for (size_t i = 0; i < MaxLanes(d); ++i) { - base[index_lanes[i]] = lanes[i]; - } -} - -template > -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D d, - T* HWY_RESTRICT base, - VFromD> index) { - const RebindToSigned di; - using TI = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - - HWY_ALIGN T lanes[MaxLanes(d)]; - Store(v, d, lanes); - - HWY_ALIGN TI index_lanes[MaxLanes(d)]; - Store(index, di, index_lanes); - - HWY_ALIGN TI mask_lanes[MaxLanes(di)]; - Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); - - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (mask_lanes[i]) base[index_lanes[i]] = lanes[i]; - } -} - -#endif // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE)) - -// ------------------------------ Gather - -#if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_GATHER 
-#undef HWY_NATIVE_GATHER -#else -#define HWY_NATIVE_GATHER -#endif - -template > -HWY_API VFromD GatherOffset(D d, const T* HWY_RESTRICT base, - VFromD> offset) { - const RebindToSigned di; - using TI = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - - HWY_ALIGN TI offset_lanes[MaxLanes(d)]; - Store(offset, di, offset_lanes); - - HWY_ALIGN T lanes[MaxLanes(d)]; - const uint8_t* base_bytes = reinterpret_cast(base); - for (size_t i = 0; i < MaxLanes(d); ++i) { - CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); - } - return Load(d, lanes); -} - -template > -HWY_API VFromD GatherIndex(D d, const T* HWY_RESTRICT base, - VFromD> index) { - const RebindToSigned di; - using TI = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - - HWY_ALIGN TI index_lanes[MaxLanes(d)]; - Store(index, di, index_lanes); - - HWY_ALIGN T lanes[MaxLanes(d)]; - for (size_t i = 0; i < MaxLanes(d); ++i) { - lanes[i] = base[index_lanes[i]]; - } - return Load(d, lanes); -} - -template > -HWY_API VFromD MaskedGatherIndex(MFromD m, D d, - const T* HWY_RESTRICT base, - VFromD> index) { - const RebindToSigned di; - using TI = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - - HWY_ALIGN TI index_lanes[MaxLanes(di)]; - Store(index, di, index_lanes); - - HWY_ALIGN TI mask_lanes[MaxLanes(di)]; - Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); - - HWY_ALIGN T lanes[MaxLanes(d)]; - for (size_t i = 0; i < MaxLanes(d); ++i) { - lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0}; - } - return Load(d, lanes); -} - -#endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE)) - -// ------------------------------ Integer AbsDiff and SumsOf8AbsDiff - -#if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_INTEGER_ABS_DIFF -#undef HWY_NATIVE_INTEGER_ABS_DIFF -#else -#define HWY_NATIVE_INTEGER_ABS_DIFF -#endif - -template -HWY_API V AbsDiff(V a, V b) { - return Sub(Max(a, b), Min(a, b)); -} - -#endif // HWY_NATIVE_INTEGER_ABS_DIFF - -#if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF -#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF -#else -#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF -#endif - -template ), - HWY_IF_V_SIZE_GT_D(DFromV, (HWY_TARGET == HWY_SCALAR ? 
0 : 4))> -HWY_API Vec>> SumsOf8AbsDiff(V a, V b) { - return SumsOf8(AbsDiff(a, b)); -} - -#endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF - -// ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64 - -#if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB -#undef HWY_NATIVE_I32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I32_SATURATED_ADDSUB -#endif - -template )> -HWY_API V SaturatedAdd(V a, V b) { - const DFromV d; - const auto sum = Add(a, b); - const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); - const auto overflow_result = - Xor(BroadcastSignBit(a), Set(d, LimitsMax())); - return IfNegativeThenElse(overflow_mask, overflow_result, sum); -} - -template )> -HWY_API V SaturatedSub(V a, V b) { - const DFromV d; - const auto diff = Sub(a, b); - const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); - const auto overflow_result = - Xor(BroadcastSignBit(a), Set(d, LimitsMax())); - return IfNegativeThenElse(overflow_mask, overflow_result, diff); -} - -#endif // HWY_NATIVE_I32_SATURATED_ADDSUB - -#if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB -#undef HWY_NATIVE_I64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I64_SATURATED_ADDSUB -#endif - -template )> -HWY_API V SaturatedAdd(V a, V b) { - const DFromV d; - const auto sum = Add(a, b); - const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); - const auto overflow_result = - Xor(BroadcastSignBit(a), Set(d, LimitsMax())); - return IfNegativeThenElse(overflow_mask, overflow_result, sum); -} - -template )> -HWY_API V SaturatedSub(V a, V b) { - const DFromV d; - const auto diff = Sub(a, b); - const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); - const auto overflow_result = - Xor(BroadcastSignBit(a), Set(d, LimitsMax())); - return IfNegativeThenElse(overflow_mask, overflow_result, diff); -} - -#endif // HWY_NATIVE_I64_SATURATED_ADDSUB - -#if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB -#undef HWY_NATIVE_U32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U32_SATURATED_ADDSUB -#endif - -template )> -HWY_API V SaturatedAdd(V a, V b) { - return Add(a, Min(b, Not(a))); -} - -template )> -HWY_API V SaturatedSub(V a, V b) { - return Sub(a, Min(a, b)); -} - -#endif // HWY_NATIVE_U32_SATURATED_ADDSUB - -#if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB -#undef HWY_NATIVE_U64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U64_SATURATED_ADDSUB -#endif - -template )> -HWY_API V SaturatedAdd(V a, V b) { - return Add(a, Min(b, Not(a))); -} - -template )> -HWY_API V SaturatedSub(V a, V b) { - return Sub(a, Min(a, b)); -} - -#endif // HWY_NATIVE_U64_SATURATED_ADDSUB - -// ------------------------------ Unsigned to signed demotions - -template , DN>>, - hwy::EnableIf<(sizeof(TFromD) < sizeof(TFromV))>* = nullptr, - HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> -HWY_API VFromD DemoteTo(DN dn, V v) { - const DFromV d; - const RebindToSigned di; - const RebindToUnsigned dn_u; - - // First, do a signed to signed demotion. This will convert any values - // that are greater than hwy::HighestValue>>() to a - // negative value. - const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v)); - - // Second, convert any negative values to hwy::HighestValue>() - // using an unsigned Min operation. 
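For reference, the saturation logic defined above can be modeled in scalar code. A minimal sketch in plain C++ (illustrative names, not from the Highway sources): signed overflow is detected from the sign bits of a ^ b and a ^ (a + b), and the unsigned forms clamp via Min before adding or subtracting.

#include <cstdint>
#include <cstdio>

// Signed: wraparound add, then detect overflow from the sign bits of (a ^ b)
// and (a ^ sum); on overflow the result is INT32_MAX or INT32_MIN, selected
// by a's sign (Xor(BroadcastSignBit(a), LimitsMax) in the vector code).
static int32_t SaturatedAddI32(int32_t a, int32_t b) {
  const uint32_t ua = static_cast<uint32_t>(a);
  const uint32_t ub = static_cast<uint32_t>(b);
  const uint32_t sum = ua + ub;                       // wrapping add
  const uint32_t overflow = ~(ua ^ ub) & (ua ^ sum);  // sign bit set on overflow
  const uint32_t sign_bcast = (a < 0) ? 0xFFFFFFFFu : 0u;
  const uint32_t limit = sign_bcast ^ 0x7FFFFFFFu;    // MAX (a >= 0) or MIN (a < 0)
  return static_cast<int32_t>((overflow >> 31) ? limit : sum);
}

// Unsigned: Add(a, Min(b, Not(a))) and Sub(a, Min(a, b)) cannot wrap.
static uint32_t SaturatedAddU32(uint32_t a, uint32_t b) {
  const uint32_t not_a = ~a;
  return a + (b < not_a ? b : not_a);
}
static uint32_t SaturatedSubU32(uint32_t a, uint32_t b) { return a - (b < a ? b : a); }

int main() {
  std::printf("%d %d\n", SaturatedAddI32(INT32_MAX, 1), SaturatedAddI32(INT32_MIN, -1));
  std::printf("%u %u\n", SaturatedAddU32(0xFFFFFFF0u, 0x100u), SaturatedSubU32(3u, 7u));
  // Expected: 2147483647 -2147483648 / 4294967295 0
  return 0;
}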
- const auto max_signed_val = Set(dn, hwy::HighestValue>()); - - return BitCast( - dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); -} - -#if HWY_TARGET != HWY_SCALAR || HWY_IDE -template , DN>>, - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const DFromV d; - const RebindToSigned di; - const RebindToUnsigned dn_u; - - // First, do a signed to signed demotion. This will convert any values - // that are greater than hwy::HighestValue>>() to a - // negative value. - const auto i2i_demote_result = - ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b)); - - // Second, convert any negative values to hwy::HighestValue>() - // using an unsigned Min operation. - const auto max_signed_val = Set(dn, hwy::HighestValue>()); - - return BitCast( - dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); -} -#endif - -// ------------------------------ PromoteLowerTo - -// There is no codegen advantage for a native version of this. It is provided -// only for convenience. -template -HWY_API VFromD PromoteLowerTo(D d, V v) { - // Lanes(d) may differ from Lanes(DFromV()). Use the lane type from V - // because it cannot be deduced from D (could be either bf16 or f16). - const Rebind, decltype(d)> dh; - return PromoteTo(d, LowerHalf(dh, v)); -} - -// ------------------------------ PromoteUpperTo - -#if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_PROMOTE_UPPER_TO -#undef HWY_NATIVE_PROMOTE_UPPER_TO -#else -#define HWY_NATIVE_PROMOTE_UPPER_TO -#endif - -// This requires UpperHalf. -#if HWY_TARGET != HWY_SCALAR || HWY_IDE - -template -HWY_API VFromD PromoteUpperTo(D d, V v) { - // Lanes(d) may differ from Lanes(DFromV()). Use the lane type from V - // because it cannot be deduced from D (could be either bf16 or f16). 
- const Rebind, decltype(d)> dh; - return PromoteTo(d, UpperHalf(dh, v)); -} - -#endif // HWY_TARGET != HWY_SCALAR -#endif // HWY_NATIVE_PROMOTE_UPPER_TO - -// ------------------------------ float16_t <-> float - -#if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_F16C -#undef HWY_NATIVE_F16C -#else -#define HWY_NATIVE_F16C -#endif - -template -HWY_API VFromD PromoteTo(D df32, VFromD> v) { - const RebindToSigned di32; - const RebindToUnsigned du32; - const Rebind du16; - using VU32 = VFromD; - - const VU32 bits16 = PromoteTo(du32, BitCast(du16, v)); - const VU32 sign = ShiftRight<15>(bits16); - const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F)); - const VU32 mantissa = And(bits16, Set(du32, 0x3FF)); - const VU32 subnormal = - BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)), - Set(df32, 1.0f / 16384 / 1024))); - - const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15)); - const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa); - const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32); - const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal); - return BitCast(df32, Or(ShiftLeft<31>(sign), bits32)); -} - -template -HWY_API VFromD DemoteTo(D df16, VFromD> v) { - const RebindToUnsigned du16; - const Rebind du32; - const RebindToSigned di32; - using VU32 = VFromD; - using VI32 = VFromD; - - const VU32 bits32 = BitCast(du32, v); - const VU32 sign = ShiftRight<31>(bits32); - const VU32 biased_exp32 = And(ShiftRight<23>(bits32), Set(du32, 0xFF)); - const VU32 mantissa32 = And(bits32, Set(du32, 0x7FFFFF)); - - const VI32 k15 = Set(di32, 15); - const VI32 exp = Min(Sub(BitCast(di32, biased_exp32), Set(di32, 127)), k15); - const MFromD is_tiny = Lt(exp, Set(di32, -24)); - - const MFromD is_subnormal = Lt(exp, Set(di32, -14)); - const VU32 biased_exp16 = - BitCast(du32, IfThenZeroElse(is_subnormal, Add(exp, k15))); - const VU32 sub_exp = BitCast(du32, Sub(Set(di32, -14), exp)); // [1, 11) - // Clamp shift counts to prevent warnings in emu_128 Shr. 
- const VU32 k31 = Set(du32, 31); - const VU32 shift_m = Min(Add(Set(du32, 13), sub_exp), k31); - const VU32 shift_1 = Min(Sub(Set(du32, 10), sub_exp), k31); - const VU32 sub_m = Add(Shl(Set(du32, 1), shift_1), Shr(mantissa32, shift_m)); - const VU32 mantissa16 = IfThenElse(RebindMask(du32, is_subnormal), sub_m, - ShiftRight<13>(mantissa32)); // <1024 - - const VU32 sign16 = ShiftLeft<15>(sign); - const VU32 normal16 = Or3(sign16, ShiftLeft<10>(biased_exp16), mantissa16); - const VI32 bits16 = IfThenZeroElse(is_tiny, BitCast(di32, normal16)); - return BitCast(df16, DemoteTo(du16, bits16)); -} - -#endif // HWY_NATIVE_F16C - -// ------------------------------ OrderedTruncate2To - -#if HWY_IDE || \ - (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE)) - -#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#else -#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#endif - -// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) -#if HWY_TARGET != HWY_SCALAR || HWY_IDE -template ) * 2), - HWY_IF_LANES_D(DFromV>, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedTruncate2To(DN dn, V a, V b) { - return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); -} -#endif // HWY_TARGET != HWY_SCALAR -#endif // HWY_NATIVE_ORDERED_TRUNCATE_2_TO - -// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex - -#if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_LEADING_ZERO_COUNT -#undef HWY_NATIVE_LEADING_ZERO_COUNT -#else -#define HWY_NATIVE_LEADING_ZERO_COUNT -#endif - -namespace detail { - -template -HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { - const RebindToFloat df; -#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 - const RebindToSigned di; - const Repartition di16; - - // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed - // by a unsigned right shift of the uint32_t bit representation of the - // floating point values by 23, followed by an int16_t Min - // operation as we are only interested in the biased exponent that would - // result from a uint32_t to float conversion. - - // An int32_t to float vector conversion is also much more efficient on - // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion - // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2 - // requires multiple instructions whereas an int32_t to float vector - // conversion can be carried out using a single instruction on - // SSE2/SSSE3/SSE4/AVX2. - - const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v))); - return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)), - BitCast(di16, Set(d, 158)))); -#else - const auto f32_bits = BitCast(d, ConvertTo(df, v)); - return BitCast(d, ShiftRight<23>(f32_bits)); -#endif -} - -template )> -HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) { - // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but - // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647. 
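A scalar model of this biased-exponent trick (illustrative only, not from the Highway sources; it assumes inputs below 2^24 so the int-to-float conversion is exact, whereas the vector code clears a low bit of larger inputs so rounding cannot bump the exponent):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Converting x to float and extracting bits >> 23 yields floor(log2(x)) + 127
// for x in [1, 2^24), from which HighestSetBitIndex and LeadingZeroCount follow.
static uint32_t F32BiasedExp(uint32_t x) {  // assumes 1 <= x < (1u << 24)
  const float f = static_cast<float>(x);
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return bits >> 23;
}

static uint32_t HighestSetBitIndexScalar(uint32_t x) { return F32BiasedExp(x) - 127; }
static uint32_t LeadingZeroCount32Scalar(uint32_t x) { return 31 - HighestSetBitIndexScalar(x); }

int main() {
  std::printf("%u %u\n", HighestSetBitIndexScalar(0x00800001u),
              LeadingZeroCount32Scalar(1u));  // expected: 23 31
  return 0;
}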
- const DFromV d; - const RebindToFloat df; -#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 - const RebindToSigned d_src; -#else - const RebindToUnsigned d_src; -#endif - const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v))); - return ShiftRight<23>(f32_bits); -} - -template -HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { - const Rebind du32; - const auto f32_biased_exp_as_u32 = - I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); - return TruncateTo(d, f32_biased_exp_as_u32); -} - -#if HWY_TARGET != HWY_SCALAR -template -HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { - const Half dh; - const Rebind du32; - - const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); - const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); - - const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); - const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); -#if HWY_TARGET <= HWY_SSE2 - const RebindToSigned di32; - const RebindToSigned di; - return BitCast(d, - OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32), - BitCast(di32, hi_f32_biased_exp_as_u32))); -#else - return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32, - hi_f32_biased_exp_as_u32); -#endif -} -#endif // HWY_TARGET != HWY_SCALAR - -template -HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { - const Rebind du32; - const auto f32_biased_exp_as_u32 = - I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); - return U8FromU32(f32_biased_exp_as_u32); -} - -#if HWY_TARGET != HWY_SCALAR -template -HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { - const Half dh; - const Rebind du32; - const Repartition du16; - - const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); - const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); - - const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); - const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); - -#if HWY_TARGET <= HWY_SSE2 - const RebindToSigned di32; - const RebindToSigned di16; - const auto f32_biased_exp_as_i16 = - OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32), - BitCast(di32, hi_f32_biased_exp_as_u32)); - return DemoteTo(d, f32_biased_exp_as_i16); -#else - const auto f32_biased_exp_as_u16 = OrderedTruncate2To( - du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); - return TruncateTo(d, f32_biased_exp_as_u16); -#endif -} - -template -HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { - const Half dh; - const Half dq; - const Rebind du32; - const Repartition du16; - - const auto lo_half = LowerHalf(dh, v); - const auto hi_half = UpperHalf(dh, v); - - const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half)); - const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half)); - const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half)); - const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half)); - - const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0); - const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1); - const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2); - const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3); - -#if HWY_TARGET <= HWY_SSE2 - const RebindToSigned di32; - const RebindToSigned di16; - - const auto lo_f32_biased_exp_as_i16 = - OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0), - BitCast(di32, f32_biased_exp_as_u32_q1)); - const auto hi_f32_biased_exp_as_i16 = - OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2), - BitCast(di32, f32_biased_exp_as_u32_q3)); - 
return OrderedDemote2To(d, lo_f32_biased_exp_as_i16, - hi_f32_biased_exp_as_i16); -#else - const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To( - du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1); - const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To( - du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3); - return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16, - hi_f32_biased_exp_as_u16); -#endif -} -#endif // HWY_TARGET != HWY_SCALAR - -#if HWY_TARGET == HWY_SCALAR -template -using F32ExpLzcntMinMaxRepartition = RebindToUnsigned; -#elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2 -template -using F32ExpLzcntMinMaxRepartition = Repartition; -#else -template -using F32ExpLzcntMinMaxRepartition = - Repartition), 4)>, D>; -#endif - -template -using F32ExpLzcntMinMaxCmpV = VFromD>>; - -template -HWY_INLINE F32ExpLzcntMinMaxCmpV F32ExpLzcntMinMaxBitCast(V v) { - const DFromV d; - const F32ExpLzcntMinMaxRepartition d2; - return BitCast(d2, v); -} - -template -HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { -#if HWY_TARGET == HWY_SCALAR - const uint64_t u64_val = GetLane(v); - const float f32_val = static_cast(u64_val); - uint32_t f32_bits; - CopySameSize(&f32_val, &f32_bits); - return Set(d, static_cast(f32_bits >> 23)); -#else - const Repartition du32; - const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v)); - const auto f32_biased_exp_adj = - IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)), - BitCast(du32, Set(d, 0x0000002000000000u))); - const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj); - - return ShiftRight<32>(BitCast( - d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp), - F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp))))); -#endif -} - -template -HWY_INLINE V UIntToF32BiasedExp(V v) { - const DFromV d; - return UIntToF32BiasedExp(d, v); -} - -template -HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { - return v; -} - -template -HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { - // If v[i] >= 16777216 is true, make sure that the bit at - // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact - // conversion to single-precision floating point is rounded down. - - // This zeroing-out can be accomplished through the AndNot operation below. 
- return AndNot(ShiftRight<24>(v), v); -} - -} // namespace detail - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - - const auto f32_biased_exp = detail::UIntToF32BiasedExp( - detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); - return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127}))); -} - -template -HWY_API V LeadingZeroCount(V v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - - constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; - const auto f32_biased_exp = detail::UIntToF32BiasedExp( - detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); - const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp); - - return BitCast(d, - Min(detail::F32ExpLzcntMinMaxBitCast(lz_count), - detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); -} - -template -HWY_API V TrailingZeroCount(V v) { - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; - using TU = TFromD; - - const auto vi = BitCast(di, v); - const auto lowest_bit = BitCast(du, And(vi, Neg(vi))); - - constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; - const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit); - const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127})); - - return BitCast(d, - Min(detail::F32ExpLzcntMinMaxBitCast(tz_count), - detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); -} -#endif // HWY_NATIVE_LEADING_ZERO_COUNT - -// ------------------------------ AESRound - -// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. -#if HWY_TARGET != HWY_SCALAR || HWY_IDE - -// Define for white-box testing, even if native instructions are available. -namespace detail { - -// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with -// Vector Permute Instructions" and the accompanying assembly language -// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan: -// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html . -// -// A brute-force 256 byte table lookup can also be made constant-time, and -// possibly competitive on NEON, but this is more performance-portable -// especially for x86 and large vectors. - -template // u8 -HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL, - V affine_tblU) { - const DFromV du; - const auto mask = Set(du, uint8_t{0xF}); - - // Change polynomial basis to GF(2^4) - { - alignas(16) static constexpr uint8_t basisL[16] = { - 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2, - 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA}; - alignas(16) static constexpr uint8_t basisU[16] = { - 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C, - 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD}; - const auto sL = And(state, mask); - const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero - const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL); - const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU); - state = Xor(gf4L, gf4U); - } - - // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and - // cause TableLookupBytesOr0 to return 0. 
- alignas(16) static constexpr uint8_t kZetaInv[16] = { - 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3}; - alignas(16) static constexpr uint8_t kInv[16] = { - 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4}; - const auto tbl = LoadDup128(du, kInv); - const auto sL = And(state, mask); // L=low nibble, U=upper - const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero - const auto sX = Xor(sU, sL); - const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL); - const auto invU = TableLookupBytes(tbl, sU); - const auto invX = TableLookupBytes(tbl, sX); - const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU))); - const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX))); - - const auto affL = TableLookupBytesOr0(affine_tblL, outL); - const auto affU = TableLookupBytesOr0(affine_tblU, outU); - return Xor(affL, affU); -} - -template // u8 -HWY_INLINE V SubBytes(V state) { - const DFromV du; - // Linear skew (cannot bake 0x63 bias into the table because out* indices - // may have the infinity flag set). - alignas(16) static constexpr uint8_t kAffineL[16] = { - 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0, - 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15}; - alignas(16) static constexpr uint8_t kAffineU[16] = { - 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF, - 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E}; - return Xor(SubBytesMulInverseAndAffineLookup(state, LoadDup128(du, kAffineL), - LoadDup128(du, kAffineU)), - Set(du, uint8_t{0x63})); -} - -template // u8 -HWY_INLINE V InvSubBytes(V state) { - const DFromV du; - alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvL[16]{ - 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13, - 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7}; - alignas(16) static constexpr uint8_t kGF2P4InvToGF2P8InvU[16]{ - 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12, - 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA}; - - // Apply the inverse affine transformation - const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)), - Or(ShiftLeft<3>(state), ShiftRight<5>(state)), - Or(ShiftLeft<6>(state), ShiftRight<2>(state))), - Set(du, uint8_t{0x05})); - - // The GF(2^8) multiplicative inverse is computed as follows: - // - Changing the polynomial basis to GF(2^4) - // - Computing the GF(2^4) multiplicative inverse - // - Converting the GF(2^4) multiplicative inverse to the GF(2^8) - // multiplicative inverse through table lookups using the - // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables - return SubBytesMulInverseAndAffineLookup( - b, LoadDup128(du, kGF2P4InvToGF2P8InvL), - LoadDup128(du, kGF2P4InvToGF2P8InvU)); -} - -} // namespace detail - -#endif // HWY_TARGET != HWY_SCALAR - -// "Include guard": skip if native AES instructions are available. 
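For cross-checking the vector-permute SubBytes above, the S-box can also be computed directly from its definition; a brute-force scalar sketch in plain C++ (illustrative only, not from the Highway sources): the AES S-box is the GF(2^8) multiplicative inverse (with 0 mapping to 0) followed by an affine transform and the 0x63 bias, which the vector code reaches via GF(2^4) towers and table lookups.

#include <cstdint>
#include <cstdio>

static uint8_t GF2P8Mul(uint8_t a, uint8_t b) {  // carry-less mul mod 0x11B
  uint8_t r = 0;
  for (int i = 0; i < 8; ++i) {
    if (b & 1) r ^= a;
    const bool hi = (a & 0x80) != 0;
    a = static_cast<uint8_t>(a << 1);
    if (hi) a ^= 0x1B;
    b >>= 1;
  }
  return r;
}

static uint8_t GF2P8Inv(uint8_t x) {  // 0 maps to 0 by convention
  if (x == 0) return 0;
  for (int c = 1; c < 256; ++c) {
    if (GF2P8Mul(x, static_cast<uint8_t>(c)) == 1) return static_cast<uint8_t>(c);
  }
  return 0;  // unreachable for nonzero x
}

static uint8_t RotL8(uint8_t v, int n) {
  return static_cast<uint8_t>((v << n) | (v >> (8 - n)));
}

static uint8_t SubBytesScalar(uint8_t x) {
  const uint8_t b = GF2P8Inv(x);
  return static_cast<uint8_t>(b ^ RotL8(b, 1) ^ RotL8(b, 2) ^ RotL8(b, 3) ^
                              RotL8(b, 4) ^ 0x63);
}

int main() {
  std::printf("%02X %02X %02X\n", SubBytesScalar(0x00), SubBytesScalar(0x01),
              SubBytesScalar(0x53));  // expected: 63 7C ED
  return 0;
}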
-#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_AES -#undef HWY_NATIVE_AES -#else -#define HWY_NATIVE_AES -#endif - -// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) -#if HWY_TARGET != HWY_SCALAR - -namespace detail { - -template // u8 -HWY_API V ShiftRows(const V state) { - const DFromV du; - alignas(16) static constexpr uint8_t kShiftRow[16] = { - 0, 5, 10, 15, // transposed: state is column major - 4, 9, 14, 3, // - 8, 13, 2, 7, // - 12, 1, 6, 11}; - const auto shift_row = LoadDup128(du, kShiftRow); - return TableLookupBytes(state, shift_row); -} - -template // u8 -HWY_API V InvShiftRows(const V state) { - const DFromV du; - alignas(16) static constexpr uint8_t kShiftRow[16] = { - 0, 13, 10, 7, // transposed: state is column major - 4, 1, 14, 11, // - 8, 5, 2, 15, // - 12, 9, 6, 3}; - const auto shift_row = LoadDup128(du, kShiftRow); - return TableLookupBytes(state, shift_row); -} - -template // u8 -HWY_API V GF2P8Mod11BMulBy2(V v) { - const DFromV du; - const RebindToSigned di; // can only do signed comparisons - const auto msb = Lt(BitCast(di, v), Zero(di)); - const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B}))); - return Xor(Add(v, v), overflow); // = v*2 in GF(2^8). -} - -template // u8 -HWY_API V MixColumns(const V state) { - const DFromV du; - // For each column, the rows are the sum of GF(2^8) matrix multiplication by: - // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3. - // 1 2 3 1 // d are on diagonal, no permutation needed. - // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows. - // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301). - alignas(16) static constexpr uint8_t k2301[16] = { - 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; - alignas(16) static constexpr uint8_t k1230[16] = { - 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}; - const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8). 
- const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301)); - const auto d_s2301 = Xor(d, s2301); - const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)} - const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230)); - return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms -} - -template // u8 -HWY_API V InvMixColumns(const V state) { - const DFromV du; - // For each column, the rows are the sum of GF(2^8) matrix multiplication by: - // 14 11 13 9 - // 9 14 11 13 - // 13 9 14 11 - // 11 13 9 14 - alignas(16) static constexpr uint8_t k2301[16] = { - 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; - alignas(16) static constexpr uint8_t k1230[16] = { - 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}; - const auto v1230 = LoadDup128(du, k1230); - - const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */ - const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */ - const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */ - const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */ - const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */ - const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */ - const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */ - - const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230)); - const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230)); - const auto sx13_2301_sx9_3012 = - TableLookupBytes(sx13_0123_sx9_1230, LoadDup128(du, k2301)); - return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012); -} - -} // namespace detail - -template // u8 -HWY_API V AESRound(V state, const V round_key) { - // Intel docs swap the first two steps, but it does not matter because - // ShiftRows is a permutation and SubBytes is independent of lane index. - state = detail::SubBytes(state); - state = detail::ShiftRows(state); - state = detail::MixColumns(state); - state = Xor(state, round_key); // AddRoundKey - return state; -} - -template // u8 -HWY_API V AESLastRound(V state, const V round_key) { - // LIke AESRound, but without MixColumns. - state = detail::SubBytes(state); - state = detail::ShiftRows(state); - state = Xor(state, round_key); // AddRoundKey - return state; -} - -template -HWY_API V AESInvMixColumns(V state) { - return detail::InvMixColumns(state); -} - -template // u8 -HWY_API V AESRoundInv(V state, const V round_key) { - state = detail::InvSubBytes(state); - state = detail::InvShiftRows(state); - state = detail::InvMixColumns(state); - state = Xor(state, round_key); // AddRoundKey - return state; -} - -template // u8 -HWY_API V AESLastRoundInv(V state, const V round_key) { - // Like AESRoundInv, but without InvMixColumns. 
- state = detail::InvSubBytes(state); - state = detail::InvShiftRows(state); - state = Xor(state, round_key); // AddRoundKey - return state; -} - -template )> -HWY_API V AESKeyGenAssist(V v) { - alignas(16) static constexpr uint8_t kRconXorMask[16] = { - 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0}; - alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { - 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12}; - const DFromV d; - const auto sub_word_result = detail::SubBytes(v); - const auto rot_word_result = - TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle)); - return Xor(rot_word_result, LoadDup128(d, kRconXorMask)); -} - -// Constant-time implementation inspired by -// https://www.bearssl.org/constanttime.html, but about half the cost because we -// use 64x64 multiplies and 128-bit XORs. -template -HWY_API V CLMulLower(V a, V b) { - const DFromV d; - static_assert(IsSame, uint64_t>(), "V must be u64"); - const auto k1 = Set(d, 0x1111111111111111ULL); - const auto k2 = Set(d, 0x2222222222222222ULL); - const auto k4 = Set(d, 0x4444444444444444ULL); - const auto k8 = Set(d, 0x8888888888888888ULL); - const auto a0 = And(a, k1); - const auto a1 = And(a, k2); - const auto a2 = And(a, k4); - const auto a3 = And(a, k8); - const auto b0 = And(b, k1); - const auto b1 = And(b, k2); - const auto b2 = And(b, k4); - const auto b3 = And(b, k8); - - auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3)); - auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0)); - auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1)); - auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2)); - m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1))); - m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2))); - m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3))); - m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0))); - return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); -} - -template -HWY_API V CLMulUpper(V a, V b) { - const DFromV d; - static_assert(IsSame, uint64_t>(), "V must be u64"); - const auto k1 = Set(d, 0x1111111111111111ULL); - const auto k2 = Set(d, 0x2222222222222222ULL); - const auto k4 = Set(d, 0x4444444444444444ULL); - const auto k8 = Set(d, 0x8888888888888888ULL); - const auto a0 = And(a, k1); - const auto a1 = And(a, k2); - const auto a2 = And(a, k4); - const auto a3 = And(a, k8); - const auto b0 = And(b, k1); - const auto b1 = And(b, k2); - const auto b2 = And(b, k4); - const auto b3 = And(b, k8); - - auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3)); - auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0)); - auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1)); - auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2)); - m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1))); - m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2))); - m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3))); - m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0))); - return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); -} - -#endif // HWY_NATIVE_AES -#endif // HWY_TARGET != HWY_SCALAR - -// ------------------------------ PopulationCount - -// "Include guard": skip if native POPCNT-related instructions are available. -#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_POPCNT -#undef HWY_NATIVE_POPCNT -#else -#define HWY_NATIVE_POPCNT -#endif - -// This overload requires vectors to be at least 16 bytes, which is the case -// for LMUL >= 2. 
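A shift-and-XOR scalar reference for the carry-less product computed by CLMulLower/CLMulUpper above (illustrative only, not from the Highway sources):

#include <cstdint>
#include <cstdio>

struct U128 {
  uint64_t lo, hi;
};

// Carry-less (GF(2)[x]) product of two 64-bit operands via shift-and-XOR.
// The vector code obtains the same polynomial product from ordinary 64x64
// multiplies by keeping only every 4th bit of each operand per partial
// product, so no carries can propagate between set bits.
static U128 CLMul64(uint64_t a, uint64_t b) {
  U128 r{0, 0};
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1u) {
      r.lo ^= a << i;
      if (i != 0) r.hi ^= a >> (64 - i);
    }
  }
  return r;
}

int main() {
  const U128 r = CLMul64(3, 3);  // 0b11 clmul 0b11 = 0b101 (XOR, no carries)
  std::printf("%llX %llX\n", static_cast<unsigned long long>(r.hi),
              static_cast<unsigned long long>(r.lo));  // 0 5
  return 0;
}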
-#undef HWY_IF_POPCNT -#if HWY_TARGET == HWY_RVV -#define HWY_IF_POPCNT(D) \ - hwy::EnableIf= 1 && D().MaxLanes() >= 16>* = nullptr -#else -// Other targets only have these two overloads which are mutually exclusive, so -// no further conditions are required. -#define HWY_IF_POPCNT(D) void* = nullptr -#endif // HWY_TARGET == HWY_RVV - -template , HWY_IF_U8_D(D), - HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)> -HWY_API V PopulationCount(V v) { - const D d; - HWY_ALIGN constexpr uint8_t kLookup[16] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - }; - const auto lo = And(v, Set(d, uint8_t{0xF})); - const auto hi = ShiftRight<4>(v); - const auto lookup = LoadDup128(d, kLookup); - return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo)); -} - -// RVV has a specialization that avoids the Set(). -#if HWY_TARGET != HWY_RVV -// Slower fallback for capped vectors. -template , HWY_IF_U8_D(D), - HWY_IF_V_SIZE_LE_D(D, 8)> -HWY_API V PopulationCount(V v) { - const D d; - // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 - const V k33 = Set(d, uint8_t{0x33}); - v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55}))); - v = Add(And(ShiftRight<2>(v), k33), And(v, k33)); - return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F})); -} -#endif // HWY_TARGET != HWY_RVV - -template , HWY_IF_U16_D(D)> -HWY_API V PopulationCount(V v) { - const D d; - const Repartition d8; - const auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); - return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF}))); -} - -template , HWY_IF_U32_D(D)> -HWY_API V PopulationCount(V v) { - const D d; - Repartition d16; - auto vals = BitCast(d, PopulationCount(BitCast(d16, v))); - return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF}))); -} - -#if HWY_HAVE_INTEGER64 -template , HWY_IF_U64_D(D)> -HWY_API V PopulationCount(V v) { - const D d; - Repartition d32; - auto vals = BitCast(d, PopulationCount(BitCast(d32, v))); - return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL))); -} -#endif - -#endif // HWY_NATIVE_POPCNT - -// ------------------------------ 8-bit multiplication - -// "Include guard": skip if native 8-bit mul instructions are available. -#if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE -#ifdef HWY_NATIVE_MUL_8 -#undef HWY_NATIVE_MUL_8 -#else -#define HWY_NATIVE_MUL_8 -#endif - -// 8 bit and fits in wider reg: promote -template -HWY_API V operator*(const V a, const V b) { - const DFromV d; - const Rebind>, decltype(d)> dw; - const RebindToUnsigned du; // TruncateTo result - const RebindToUnsigned dwu; // TruncateTo input - const VFromD mul = PromoteTo(dw, a) * PromoteTo(dw, b); - // TruncateTo is cheaper than ConcatEven. - return BitCast(d, TruncateTo(du, BitCast(dwu, mul))); -} - -// 8 bit full reg: promote halves -template -HWY_API V operator*(const V a, const V b) { - const DFromV d; - const Half dh; - const Twice> dw; - const VFromD a0 = PromoteTo(dw, LowerHalf(dh, a)); - const VFromD a1 = PromoteTo(dw, UpperHalf(dh, a)); - const VFromD b0 = PromoteTo(dw, LowerHalf(dh, b)); - const VFromD b1 = PromoteTo(dw, UpperHalf(dh, b)); - const VFromD m0 = a0 * b0; - const VFromD m1 = a1 * b1; - return ConcatEven(d, BitCast(d, m1), BitCast(d, m0)); -} - -#endif // HWY_NATIVE_MUL_8 - -// ------------------------------ 64-bit multiplication - -// "Include guard": skip if native 64-bit mul instructions are available. 
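The SWAR fallback and the widening overloads above follow the usual 0x55/0x33/0x0F bit-counting steps; a scalar model in plain C++ (illustrative only, not from the Highway sources):

#include <cstdint>
#include <cstdio>

// Count bits per byte with the 0x55/0x33/0x0F steps, then widen by summing
// the per-byte counts, as the u16/u32/u64 overloads do with shifts and masks.
static uint8_t PopCount8(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));
  v = static_cast<uint8_t>((v & 0x33) + ((v >> 2) & 0x33));
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);
}

static uint32_t PopCount32(uint32_t v) {
  uint32_t sum = 0;
  for (int i = 0; i < 4; ++i) sum += PopCount8(static_cast<uint8_t>(v >> (8 * i)));
  return sum;
}

int main() {
  std::printf("%u %u %u\n", PopCount32(0u), PopCount32(0xF0F0F0F0u),
              PopCount32(0xFFFFFFFFu));  // expected: 0 16 32
  return 0;
}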
-#if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE -#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - -// Single-lane i64 or u64 -template -HWY_API V operator*(V x, V y) { - const DFromV d; - using T = TFromD; - using TU = MakeUnsigned; - const TU xu = static_cast(GetLane(x)); - const TU yu = static_cast(GetLane(y)); - return Set(d, static_cast(xu * yu)); -} - -template , HWY_IF_U64_D(D64), - HWY_IF_V_SIZE_GT_D(D64, 8)> -HWY_API V operator*(V x, V y) { - RepartitionToNarrow d32; - auto x32 = BitCast(d32, x); - auto y32 = BitCast(d32, y); - auto lolo = BitCast(d32, MulEven(x32, y32)); - auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y)))); - auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32)); - auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo))); - return BitCast(D64{}, lolo + hi); -} -template , HWY_IF_I64_D(DI64), - HWY_IF_V_SIZE_GT_D(DI64, 8)> -HWY_API V operator*(V x, V y) { - RebindToUnsigned du64; - return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y)); -} - -#endif // HWY_NATIVE_MUL_64 - -// ------------------------------ MulAdd / NegMulAdd - -// "Include guard": skip if native int MulAdd instructions are available. -#if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_INT_FMA -#undef HWY_NATIVE_INT_FMA -#else -#define HWY_NATIVE_INT_FMA -#endif - -template -HWY_API V MulAdd(V mul, V x, V add) { - return Add(Mul(mul, x), add); -} - -template -HWY_API V NegMulAdd(V mul, V x, V add) { - return Sub(add, Mul(mul, x)); -} - -#endif // HWY_NATIVE_INT_FMA - -// ------------------------------ SatWidenMulPairwiseAdd - -#if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \ - defined(HWY_TARGET_TOGGLE)) - -#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#else -#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#endif - -template >, HWY_IF_I16_D(DI16), - HWY_IF_U8_D(DFromV), HWY_IF_I8_D(DFromV), - HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(VI8)), - HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(VU8_2))> -HWY_API Vec SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) { - const RebindToUnsigned du16; - - const auto a0 = And(BitCast(di16, a), Set(di16, int16_t{0x00FF})); - const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b))); - - const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a))); - const auto b1 = ShiftRight<8>(BitCast(di16, b)); - - return SaturatedAdd(Mul(a0, b0), Mul(a1, b1)); -} - -#endif - -// ------------------------------ SumOfMulQuadAccumulate - -#if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \ - defined(HWY_TARGET_TOGGLE)) - -#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, - VFromD> a, - VFromD> b, - VFromD sum) { - const Repartition di16; - - const auto a0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, a))); - const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b))); - - const auto a1 = ShiftRight<8>(BitCast(di16, a)); - const auto b1 = ShiftRight<8>(BitCast(di16, b)); - - return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), - WidenMulPairwiseAdd(di32, a1, b1))); -} - -#endif - -#if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \ - defined(HWY_TARGET_TOGGLE)) - -#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#else -#define 
HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate( - DU32 du32, VFromD> a, - VFromD> b, VFromD sum) { - const Repartition du16; - const RebindToSigned di16; - const RebindToSigned di32; - - const auto lo8_mask = Set(di16, int16_t{0x00FF}); - const auto a0 = And(BitCast(di16, a), lo8_mask); - const auto b0 = And(BitCast(di16, b), lo8_mask); - - const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a))); - const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b))); - - return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)), - BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1)))); -} - -#endif - -#if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \ - defined(HWY_TARGET_TOGGLE)) - -#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate( - DI32 di32, VFromD> a_u, - VFromD> b_i, VFromD sum) { - const Repartition di16; - const RebindToUnsigned du16; - - const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF})); - const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i))); - - const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u))); - const auto b1 = ShiftRight<8>(BitCast(di16, b_i)); - - // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in - // SumOfMulQuadAccumulate as it is possible for - // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0], - // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same - // sign. - - return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), - WidenMulPairwiseAdd(di32, a1, b1))); -} - -#endif - -#if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \ - defined(HWY_TARGET_TOGGLE)) - -#ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE -#endif - -#if HWY_HAVE_INTEGER64 -template -HWY_API VFromD SumOfMulQuadAccumulate( - DI64 di64, VFromD> a, - VFromD> b, VFromD sum) { - const Repartition di32; - - // WidenMulPairwiseAdd(di32, a, b) is okay here as - // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as - // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if - // a[0], b[0], a[1], and b[1] are all equal to -32768. - - const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b); - const auto i32_pairwise_sum_overflow = - VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin()))); - - // The upper 32 bits of sum0 and sum1 need to be zeroed out in the case of - // overflow. 
- const auto hi32_mask = Set(di64, static_cast(~int64_t{0xFFFFFFFF})); - const auto p0_zero_out_mask = - ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow)); - const auto p1_zero_out_mask = - And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask); - - const auto p0 = - AndNot(p0_zero_out_mask, - ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum)))); - const auto p1 = - AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum))); - - return Add(sum, Add(p0, p1)); -} -#endif // HWY_HAVE_INTEGER64 -#endif // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE - -#if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \ - defined(HWY_TARGET_TOGGLE)) - -#ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE -#endif - -#if HWY_HAVE_INTEGER64 -template -HWY_API VFromD SumOfMulQuadAccumulate( - DU64 du64, VFromD> a, - VFromD> b, VFromD sum) { - const auto u32_even_prod = MulEven(a, b); - const auto u32_odd_prod = MulOdd(a, b); - - const auto lo32_mask = Set(du64, uint64_t{0xFFFFFFFFu}); - - const auto p0 = Add(And(BitCast(du64, u32_even_prod), lo32_mask), - And(BitCast(du64, u32_odd_prod), lo32_mask)); - const auto p1 = Add(ShiftRight<32>(BitCast(du64, u32_even_prod)), - ShiftRight<32>(BitCast(du64, u32_odd_prod))); - - return Add(sum, Add(p0, p1)); -} -#endif // HWY_HAVE_INTEGER64 -#endif // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE - -// ------------------------------ F64 ApproximateReciprocal - -#if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_F64_APPROX_RECIP -#undef HWY_NATIVE_F64_APPROX_RECIP -#else -#define HWY_NATIVE_F64_APPROX_RECIP -#endif - -#if HWY_HAVE_FLOAT64 -template )> -HWY_API V ApproximateReciprocal(V v) { - const DFromV d; - return Div(Set(d, 1.0), v); -} -#endif // HWY_HAVE_FLOAT64 - -#endif // HWY_NATIVE_F64_APPROX_RECIP - -// ------------------------------ F64 ApproximateReciprocalSqrt - -#if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_F64_APPROX_RSQRT -#undef HWY_NATIVE_F64_APPROX_RSQRT -#else -#define HWY_NATIVE_F64_APPROX_RSQRT -#endif - -#if HWY_HAVE_FLOAT64 -template )> -HWY_API V ApproximateReciprocalSqrt(V v) { - const DFromV d; - const RebindToUnsigned du; - const auto half = Mul(v, Set(d, 0.5)); - // Initial guess based on log2(f) - const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}), - ShiftRight<1>(BitCast(du, v)))); - // One Newton-Raphson iteration - return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5))); -} -#endif // HWY_HAVE_FLOAT64 - -#endif // HWY_NATIVE_F64_APPROX_RSQRT - -// ------------------------------ Compress* - -// "Include guard": skip if native 8-bit compress instructions are available. 
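A scalar model of the double-precision ApproximateReciprocalSqrt above, using the same magic constant and a single Newton-Raphson step (illustrative only, not from the Highway sources); accuracy is only a few bits, so callers needing full precision iterate further or use 1.0 / sqrt(x):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Initial 1/sqrt(x) estimate from the bit pattern (same constant as above),
// refined by one Newton-Raphson step: y *= 1.5 - 0.5 * x * y * y.
static double ApproxRsqrt(double x) {
  const double half = 0.5 * x;
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits = 0x5FE6EB50C7B537A9ull - (bits >> 1);
  double guess;
  std::memcpy(&guess, &bits, sizeof(guess));
  return guess * (1.5 - half * guess * guess);
}

int main() {
  const double xs[] = {0.25, 2.0, 100.0};
  for (double x : xs) {
    std::printf("x=%g approx=%.6f exact=%.6f\n", x, ApproxRsqrt(x),
                1.0 / std::sqrt(x));
  }
  return 0;
}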
-#if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_COMPRESS8 -#undef HWY_NATIVE_COMPRESS8 -#else -#define HWY_NATIVE_COMPRESS8 -#endif - -template -HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d, - T* unaligned) { - HWY_ALIGN T lanes[MaxLanes(d)]; - Store(v, d, lanes); - - const Simd d8; - T* HWY_RESTRICT pos = unaligned; - - HWY_ALIGN constexpr T table[2048] = { - 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // - 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // - 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, // - 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // - 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, // - 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, // - 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, // - 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // - 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, // - 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, // - 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, // - 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 2, 4, 3, 5, 6, 7, // - 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, // - 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, // - 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, // - 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // - 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, // - 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, // - 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, // - 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, // - 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, // - 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, // - 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, // - 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, // - 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, // - 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, // - 2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, // - 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, // - 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, // - 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, // - 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, // - 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // - 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, // - 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, // - 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, // - 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, // - 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, // - 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, // - 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, // - 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, // - 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, // - 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, // - 2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, // - 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, // - 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, // - 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, // - 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, // - 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, // - 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, // - 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, // - 2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, // - 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, // - 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, // - 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 
4, 7, // - 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, // - 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, // - 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, // - 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, // - 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, // - 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, // - 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, // - 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, // - 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, // - 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // - 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, // - 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, // - 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, // - 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, // - 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, // - 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, // - 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, // - 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, // - 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, // - 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, // - 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, // - 1, 2, 4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, // - 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, // - 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, // - 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, // - 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, // - 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, // - 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, // - 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, // - 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, // - 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6, // - 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, // - 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, // - 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, // - 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, // - 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6, // - 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, // - 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, // - 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, // - 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, // - 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, // - 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, // - 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, // - 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, // - 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, // - 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, // - 3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, // - 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, // - 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, // - 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, // - 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, // - 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, // - 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, // - 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, // - 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, // - 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, // - 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, // - 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, // - 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, // - 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, // - 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, // - 1, 2, 
5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, // - 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, // - 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, // - 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, // - 1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, // - 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, // - 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, // - 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, // - 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, // - 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, // - 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, // - 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, // - 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7}; - - for (size_t i = 0; i < Lanes(d); i += 8) { - // Each byte worth of bits is the index of one of 256 8-byte ranges, and its - // population count determines how far to advance the write position. - const size_t bits8 = bits[i / 8]; - const auto indices = Load(d8, table + bits8 * 8); - const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices); - StoreU(compressed, d8, pos); - pos += PopCount(bits8); - } - return static_cast(pos - unaligned); -} - -template -HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { - uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)]; - (void)StoreMaskBits(d, mask, bits); - return CompressBitsStore(v, bits, d, unaligned); -} - -template -HWY_API size_t CompressBlendedStore(V v, M mask, D d, - T* HWY_RESTRICT unaligned) { - HWY_ALIGN T buf[MaxLanes(d)]; - const size_t bytes = CompressStore(v, mask, d, buf); - BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned); - return bytes; -} - -// For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE. -template , HWY_IF_T_SIZE(T, 1)> -HWY_API V Compress(V v, const M mask) { - const DFromV d; - HWY_ALIGN T lanes[MaxLanes(d)]; - (void)CompressStore(v, mask, d, lanes); - return Load(d, lanes); -} - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { - const DFromV d; - HWY_ALIGN T lanes[MaxLanes(d)]; - (void)CompressBitsStore(v, bits, d, lanes); - return Load(d, lanes); -} - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API V CompressNot(V v, M mask) { - return Compress(v, Not(mask)); -} - -#endif // HWY_NATIVE_COMPRESS8 - -// ------------------------------ Expand - -// "Include guard": skip if native 8/16-bit Expand/LoadExpand are available. -// Note that this generic implementation assumes <= 128 bit fixed vectors; -// the SVE and RVV targets provide their own native implementations. 
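// Expand is the inverse operation of Compress: input lanes are consumed from
// the start, in order, and written to the output positions whose mask lane is
// true; all other output lanes are zero. A minimal scalar sketch of that
// contract (illustrative only; the bool-array mask and the names here are
// placeholders, not this header's types):

template <typename T>
void ScalarExpand(const T* in, const bool* mask, size_t num_lanes, T* out) {
  size_t next = 0;  // Next unconsumed input lane.
  for (size_t i = 0; i < num_lanes; ++i) {
    // True mask lanes receive consecutive input lanes; false lanes are zero.
    out[i] = mask[i] ? in[next++] : T{0};
  }
}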
-#if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE -#ifdef HWY_NATIVE_EXPAND -#undef HWY_NATIVE_EXPAND -#else -#define HWY_NATIVE_EXPAND -#endif - -namespace detail { - -#if HWY_IDE -template -HWY_INLINE uint64_t BitsFromMask(M /* mask */) { - return 0; -} -#endif // HWY_IDE - -template -HWY_INLINE Vec128 IndicesForExpandFromBits(uint64_t mask_bits) { - static_assert(N <= 8, "Should only be called for half-vectors"); - const Simd du8; - HWY_DASSERT(mask_bits < 0x100); - alignas(16) static constexpr uint8_t table[2048] = { - // PrintExpand8x8Tables - 128, 128, 128, 128, 128, 128, 128, 128, // - 0, 128, 128, 128, 128, 128, 128, 128, // - 128, 0, 128, 128, 128, 128, 128, 128, // - 0, 1, 128, 128, 128, 128, 128, 128, // - 128, 128, 0, 128, 128, 128, 128, 128, // - 0, 128, 1, 128, 128, 128, 128, 128, // - 128, 0, 1, 128, 128, 128, 128, 128, // - 0, 1, 2, 128, 128, 128, 128, 128, // - 128, 128, 128, 0, 128, 128, 128, 128, // - 0, 128, 128, 1, 128, 128, 128, 128, // - 128, 0, 128, 1, 128, 128, 128, 128, // - 0, 1, 128, 2, 128, 128, 128, 128, // - 128, 128, 0, 1, 128, 128, 128, 128, // - 0, 128, 1, 2, 128, 128, 128, 128, // - 128, 0, 1, 2, 128, 128, 128, 128, // - 0, 1, 2, 3, 128, 128, 128, 128, // - 128, 128, 128, 128, 0, 128, 128, 128, // - 0, 128, 128, 128, 1, 128, 128, 128, // - 128, 0, 128, 128, 1, 128, 128, 128, // - 0, 1, 128, 128, 2, 128, 128, 128, // - 128, 128, 0, 128, 1, 128, 128, 128, // - 0, 128, 1, 128, 2, 128, 128, 128, // - 128, 0, 1, 128, 2, 128, 128, 128, // - 0, 1, 2, 128, 3, 128, 128, 128, // - 128, 128, 128, 0, 1, 128, 128, 128, // - 0, 128, 128, 1, 2, 128, 128, 128, // - 128, 0, 128, 1, 2, 128, 128, 128, // - 0, 1, 128, 2, 3, 128, 128, 128, // - 128, 128, 0, 1, 2, 128, 128, 128, // - 0, 128, 1, 2, 3, 128, 128, 128, // - 128, 0, 1, 2, 3, 128, 128, 128, // - 0, 1, 2, 3, 4, 128, 128, 128, // - 128, 128, 128, 128, 128, 0, 128, 128, // - 0, 128, 128, 128, 128, 1, 128, 128, // - 128, 0, 128, 128, 128, 1, 128, 128, // - 0, 1, 128, 128, 128, 2, 128, 128, // - 128, 128, 0, 128, 128, 1, 128, 128, // - 0, 128, 1, 128, 128, 2, 128, 128, // - 128, 0, 1, 128, 128, 2, 128, 128, // - 0, 1, 2, 128, 128, 3, 128, 128, // - 128, 128, 128, 0, 128, 1, 128, 128, // - 0, 128, 128, 1, 128, 2, 128, 128, // - 128, 0, 128, 1, 128, 2, 128, 128, // - 0, 1, 128, 2, 128, 3, 128, 128, // - 128, 128, 0, 1, 128, 2, 128, 128, // - 0, 128, 1, 2, 128, 3, 128, 128, // - 128, 0, 1, 2, 128, 3, 128, 128, // - 0, 1, 2, 3, 128, 4, 128, 128, // - 128, 128, 128, 128, 0, 1, 128, 128, // - 0, 128, 128, 128, 1, 2, 128, 128, // - 128, 0, 128, 128, 1, 2, 128, 128, // - 0, 1, 128, 128, 2, 3, 128, 128, // - 128, 128, 0, 128, 1, 2, 128, 128, // - 0, 128, 1, 128, 2, 3, 128, 128, // - 128, 0, 1, 128, 2, 3, 128, 128, // - 0, 1, 2, 128, 3, 4, 128, 128, // - 128, 128, 128, 0, 1, 2, 128, 128, // - 0, 128, 128, 1, 2, 3, 128, 128, // - 128, 0, 128, 1, 2, 3, 128, 128, // - 0, 1, 128, 2, 3, 4, 128, 128, // - 128, 128, 0, 1, 2, 3, 128, 128, // - 0, 128, 1, 2, 3, 4, 128, 128, // - 128, 0, 1, 2, 3, 4, 128, 128, // - 0, 1, 2, 3, 4, 5, 128, 128, // - 128, 128, 128, 128, 128, 128, 0, 128, // - 0, 128, 128, 128, 128, 128, 1, 128, // - 128, 0, 128, 128, 128, 128, 1, 128, // - 0, 1, 128, 128, 128, 128, 2, 128, // - 128, 128, 0, 128, 128, 128, 1, 128, // - 0, 128, 1, 128, 128, 128, 2, 128, // - 128, 0, 1, 128, 128, 128, 2, 128, // - 0, 1, 2, 128, 128, 128, 3, 128, // - 128, 128, 128, 0, 128, 128, 1, 128, // - 0, 128, 128, 1, 128, 128, 2, 128, // - 128, 0, 128, 1, 128, 128, 2, 128, // - 0, 1, 128, 2, 128, 128, 3, 128, // - 
128, 128, 0, 1, 128, 128, 2, 128, // - 0, 128, 1, 2, 128, 128, 3, 128, // - 128, 0, 1, 2, 128, 128, 3, 128, // - 0, 1, 2, 3, 128, 128, 4, 128, // - 128, 128, 128, 128, 0, 128, 1, 128, // - 0, 128, 128, 128, 1, 128, 2, 128, // - 128, 0, 128, 128, 1, 128, 2, 128, // - 0, 1, 128, 128, 2, 128, 3, 128, // - 128, 128, 0, 128, 1, 128, 2, 128, // - 0, 128, 1, 128, 2, 128, 3, 128, // - 128, 0, 1, 128, 2, 128, 3, 128, // - 0, 1, 2, 128, 3, 128, 4, 128, // - 128, 128, 128, 0, 1, 128, 2, 128, // - 0, 128, 128, 1, 2, 128, 3, 128, // - 128, 0, 128, 1, 2, 128, 3, 128, // - 0, 1, 128, 2, 3, 128, 4, 128, // - 128, 128, 0, 1, 2, 128, 3, 128, // - 0, 128, 1, 2, 3, 128, 4, 128, // - 128, 0, 1, 2, 3, 128, 4, 128, // - 0, 1, 2, 3, 4, 128, 5, 128, // - 128, 128, 128, 128, 128, 0, 1, 128, // - 0, 128, 128, 128, 128, 1, 2, 128, // - 128, 0, 128, 128, 128, 1, 2, 128, // - 0, 1, 128, 128, 128, 2, 3, 128, // - 128, 128, 0, 128, 128, 1, 2, 128, // - 0, 128, 1, 128, 128, 2, 3, 128, // - 128, 0, 1, 128, 128, 2, 3, 128, // - 0, 1, 2, 128, 128, 3, 4, 128, // - 128, 128, 128, 0, 128, 1, 2, 128, // - 0, 128, 128, 1, 128, 2, 3, 128, // - 128, 0, 128, 1, 128, 2, 3, 128, // - 0, 1, 128, 2, 128, 3, 4, 128, // - 128, 128, 0, 1, 128, 2, 3, 128, // - 0, 128, 1, 2, 128, 3, 4, 128, // - 128, 0, 1, 2, 128, 3, 4, 128, // - 0, 1, 2, 3, 128, 4, 5, 128, // - 128, 128, 128, 128, 0, 1, 2, 128, // - 0, 128, 128, 128, 1, 2, 3, 128, // - 128, 0, 128, 128, 1, 2, 3, 128, // - 0, 1, 128, 128, 2, 3, 4, 128, // - 128, 128, 0, 128, 1, 2, 3, 128, // - 0, 128, 1, 128, 2, 3, 4, 128, // - 128, 0, 1, 128, 2, 3, 4, 128, // - 0, 1, 2, 128, 3, 4, 5, 128, // - 128, 128, 128, 0, 1, 2, 3, 128, // - 0, 128, 128, 1, 2, 3, 4, 128, // - 128, 0, 128, 1, 2, 3, 4, 128, // - 0, 1, 128, 2, 3, 4, 5, 128, // - 128, 128, 0, 1, 2, 3, 4, 128, // - 0, 128, 1, 2, 3, 4, 5, 128, // - 128, 0, 1, 2, 3, 4, 5, 128, // - 0, 1, 2, 3, 4, 5, 6, 128, // - 128, 128, 128, 128, 128, 128, 128, 0, // - 0, 128, 128, 128, 128, 128, 128, 1, // - 128, 0, 128, 128, 128, 128, 128, 1, // - 0, 1, 128, 128, 128, 128, 128, 2, // - 128, 128, 0, 128, 128, 128, 128, 1, // - 0, 128, 1, 128, 128, 128, 128, 2, // - 128, 0, 1, 128, 128, 128, 128, 2, // - 0, 1, 2, 128, 128, 128, 128, 3, // - 128, 128, 128, 0, 128, 128, 128, 1, // - 0, 128, 128, 1, 128, 128, 128, 2, // - 128, 0, 128, 1, 128, 128, 128, 2, // - 0, 1, 128, 2, 128, 128, 128, 3, // - 128, 128, 0, 1, 128, 128, 128, 2, // - 0, 128, 1, 2, 128, 128, 128, 3, // - 128, 0, 1, 2, 128, 128, 128, 3, // - 0, 1, 2, 3, 128, 128, 128, 4, // - 128, 128, 128, 128, 0, 128, 128, 1, // - 0, 128, 128, 128, 1, 128, 128, 2, // - 128, 0, 128, 128, 1, 128, 128, 2, // - 0, 1, 128, 128, 2, 128, 128, 3, // - 128, 128, 0, 128, 1, 128, 128, 2, // - 0, 128, 1, 128, 2, 128, 128, 3, // - 128, 0, 1, 128, 2, 128, 128, 3, // - 0, 1, 2, 128, 3, 128, 128, 4, // - 128, 128, 128, 0, 1, 128, 128, 2, // - 0, 128, 128, 1, 2, 128, 128, 3, // - 128, 0, 128, 1, 2, 128, 128, 3, // - 0, 1, 128, 2, 3, 128, 128, 4, // - 128, 128, 0, 1, 2, 128, 128, 3, // - 0, 128, 1, 2, 3, 128, 128, 4, // - 128, 0, 1, 2, 3, 128, 128, 4, // - 0, 1, 2, 3, 4, 128, 128, 5, // - 128, 128, 128, 128, 128, 0, 128, 1, // - 0, 128, 128, 128, 128, 1, 128, 2, // - 128, 0, 128, 128, 128, 1, 128, 2, // - 0, 1, 128, 128, 128, 2, 128, 3, // - 128, 128, 0, 128, 128, 1, 128, 2, // - 0, 128, 1, 128, 128, 2, 128, 3, // - 128, 0, 1, 128, 128, 2, 128, 3, // - 0, 1, 2, 128, 128, 3, 128, 4, // - 128, 128, 128, 0, 128, 1, 128, 2, // - 0, 128, 128, 1, 128, 2, 128, 3, // - 128, 0, 128, 1, 128, 2, 128, 3, // - 0, 1, 128, 2, 128, 3, 128, 4, 
// - 128, 128, 0, 1, 128, 2, 128, 3, // - 0, 128, 1, 2, 128, 3, 128, 4, // - 128, 0, 1, 2, 128, 3, 128, 4, // - 0, 1, 2, 3, 128, 4, 128, 5, // - 128, 128, 128, 128, 0, 1, 128, 2, // - 0, 128, 128, 128, 1, 2, 128, 3, // - 128, 0, 128, 128, 1, 2, 128, 3, // - 0, 1, 128, 128, 2, 3, 128, 4, // - 128, 128, 0, 128, 1, 2, 128, 3, // - 0, 128, 1, 128, 2, 3, 128, 4, // - 128, 0, 1, 128, 2, 3, 128, 4, // - 0, 1, 2, 128, 3, 4, 128, 5, // - 128, 128, 128, 0, 1, 2, 128, 3, // - 0, 128, 128, 1, 2, 3, 128, 4, // - 128, 0, 128, 1, 2, 3, 128, 4, // - 0, 1, 128, 2, 3, 4, 128, 5, // - 128, 128, 0, 1, 2, 3, 128, 4, // - 0, 128, 1, 2, 3, 4, 128, 5, // - 128, 0, 1, 2, 3, 4, 128, 5, // - 0, 1, 2, 3, 4, 5, 128, 6, // - 128, 128, 128, 128, 128, 128, 0, 1, // - 0, 128, 128, 128, 128, 128, 1, 2, // - 128, 0, 128, 128, 128, 128, 1, 2, // - 0, 1, 128, 128, 128, 128, 2, 3, // - 128, 128, 0, 128, 128, 128, 1, 2, // - 0, 128, 1, 128, 128, 128, 2, 3, // - 128, 0, 1, 128, 128, 128, 2, 3, // - 0, 1, 2, 128, 128, 128, 3, 4, // - 128, 128, 128, 0, 128, 128, 1, 2, // - 0, 128, 128, 1, 128, 128, 2, 3, // - 128, 0, 128, 1, 128, 128, 2, 3, // - 0, 1, 128, 2, 128, 128, 3, 4, // - 128, 128, 0, 1, 128, 128, 2, 3, // - 0, 128, 1, 2, 128, 128, 3, 4, // - 128, 0, 1, 2, 128, 128, 3, 4, // - 0, 1, 2, 3, 128, 128, 4, 5, // - 128, 128, 128, 128, 0, 128, 1, 2, // - 0, 128, 128, 128, 1, 128, 2, 3, // - 128, 0, 128, 128, 1, 128, 2, 3, // - 0, 1, 128, 128, 2, 128, 3, 4, // - 128, 128, 0, 128, 1, 128, 2, 3, // - 0, 128, 1, 128, 2, 128, 3, 4, // - 128, 0, 1, 128, 2, 128, 3, 4, // - 0, 1, 2, 128, 3, 128, 4, 5, // - 128, 128, 128, 0, 1, 128, 2, 3, // - 0, 128, 128, 1, 2, 128, 3, 4, // - 128, 0, 128, 1, 2, 128, 3, 4, // - 0, 1, 128, 2, 3, 128, 4, 5, // - 128, 128, 0, 1, 2, 128, 3, 4, // - 0, 128, 1, 2, 3, 128, 4, 5, // - 128, 0, 1, 2, 3, 128, 4, 5, // - 0, 1, 2, 3, 4, 128, 5, 6, // - 128, 128, 128, 128, 128, 0, 1, 2, // - 0, 128, 128, 128, 128, 1, 2, 3, // - 128, 0, 128, 128, 128, 1, 2, 3, // - 0, 1, 128, 128, 128, 2, 3, 4, // - 128, 128, 0, 128, 128, 1, 2, 3, // - 0, 128, 1, 128, 128, 2, 3, 4, // - 128, 0, 1, 128, 128, 2, 3, 4, // - 0, 1, 2, 128, 128, 3, 4, 5, // - 128, 128, 128, 0, 128, 1, 2, 3, // - 0, 128, 128, 1, 128, 2, 3, 4, // - 128, 0, 128, 1, 128, 2, 3, 4, // - 0, 1, 128, 2, 128, 3, 4, 5, // - 128, 128, 0, 1, 128, 2, 3, 4, // - 0, 128, 1, 2, 128, 3, 4, 5, // - 128, 0, 1, 2, 128, 3, 4, 5, // - 0, 1, 2, 3, 128, 4, 5, 6, // - 128, 128, 128, 128, 0, 1, 2, 3, // - 0, 128, 128, 128, 1, 2, 3, 4, // - 128, 0, 128, 128, 1, 2, 3, 4, // - 0, 1, 128, 128, 2, 3, 4, 5, // - 128, 128, 0, 128, 1, 2, 3, 4, // - 0, 128, 1, 128, 2, 3, 4, 5, // - 128, 0, 1, 128, 2, 3, 4, 5, // - 0, 1, 2, 128, 3, 4, 5, 6, // - 128, 128, 128, 0, 1, 2, 3, 4, // - 0, 128, 128, 1, 2, 3, 4, 5, // - 128, 0, 128, 1, 2, 3, 4, 5, // - 0, 1, 128, 2, 3, 4, 5, 6, // - 128, 128, 0, 1, 2, 3, 4, 5, // - 0, 128, 1, 2, 3, 4, 5, 6, // - 128, 0, 1, 2, 3, 4, 5, 6, // - 0, 1, 2, 3, 4, 5, 6, 7}; - return LoadU(du8, table + mask_bits * 8); -} - -} // namespace detail - -// Half vector of bytes: one table lookup -template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - const DFromV d; - - const uint64_t mask_bits = detail::BitsFromMask(mask); - const Vec128 indices = - detail::IndicesForExpandFromBits(mask_bits); - return BitCast(d, TableLookupBytesOr0(v, indices)); -} - -// Full vector of bytes: two table lookups -template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - const Full128 d; - const RebindToUnsigned du; - const Half duh; - const Vec128 vu = BitCast(du, v); - - const uint64_t 
mask_bits = detail::BitsFromMask(mask); - const uint64_t maskL = mask_bits & 0xFF; - const uint64_t maskH = mask_bits >> 8; - - // We want to skip past the v bytes already consumed by idxL. There is no - // instruction for shift-reg by variable bytes. Storing v itself would work - // but would involve a store-load forwarding stall. We instead shuffle using - // loaded indices. multishift_epi64_epi8 would also help, but if we have that, - // we probably also have native 8-bit Expand. - alignas(16) static constexpr uint8_t iota[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; - const VFromD shift = LoadU(du, iota + PopCount(maskL)); - const VFromD vL = LowerHalf(duh, vu); - const VFromD vH = - LowerHalf(duh, TableLookupBytesOr0(vu, shift)); - - const VFromD idxL = detail::IndicesForExpandFromBits<8>(maskL); - const VFromD idxH = detail::IndicesForExpandFromBits<8>(maskH); - - const VFromD expandL = TableLookupBytesOr0(vL, idxL); - const VFromD expandH = TableLookupBytesOr0(vH, idxH); - return BitCast(d, Combine(du, expandH, expandL)); -} - -template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - const DFromV d; - const RebindToUnsigned du; - - const Rebind du8; - const uint64_t mask_bits = detail::BitsFromMask(mask); - - // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply - // the nibble trick used below because not all indices fit within one lane. - alignas(16) static constexpr uint8_t table[2048] = { - // PrintExpand16x8ByteTables - 128, 128, 128, 128, 128, 128, 128, 128, // - 0, 128, 128, 128, 128, 128, 128, 128, // - 128, 0, 128, 128, 128, 128, 128, 128, // - 0, 2, 128, 128, 128, 128, 128, 128, // - 128, 128, 0, 128, 128, 128, 128, 128, // - 0, 128, 2, 128, 128, 128, 128, 128, // - 128, 0, 2, 128, 128, 128, 128, 128, // - 0, 2, 4, 128, 128, 128, 128, 128, // - 128, 128, 128, 0, 128, 128, 128, 128, // - 0, 128, 128, 2, 128, 128, 128, 128, // - 128, 0, 128, 2, 128, 128, 128, 128, // - 0, 2, 128, 4, 128, 128, 128, 128, // - 128, 128, 0, 2, 128, 128, 128, 128, // - 0, 128, 2, 4, 128, 128, 128, 128, // - 128, 0, 2, 4, 128, 128, 128, 128, // - 0, 2, 4, 6, 128, 128, 128, 128, // - 128, 128, 128, 128, 0, 128, 128, 128, // - 0, 128, 128, 128, 2, 128, 128, 128, // - 128, 0, 128, 128, 2, 128, 128, 128, // - 0, 2, 128, 128, 4, 128, 128, 128, // - 128, 128, 0, 128, 2, 128, 128, 128, // - 0, 128, 2, 128, 4, 128, 128, 128, // - 128, 0, 2, 128, 4, 128, 128, 128, // - 0, 2, 4, 128, 6, 128, 128, 128, // - 128, 128, 128, 0, 2, 128, 128, 128, // - 0, 128, 128, 2, 4, 128, 128, 128, // - 128, 0, 128, 2, 4, 128, 128, 128, // - 0, 2, 128, 4, 6, 128, 128, 128, // - 128, 128, 0, 2, 4, 128, 128, 128, // - 0, 128, 2, 4, 6, 128, 128, 128, // - 128, 0, 2, 4, 6, 128, 128, 128, // - 0, 2, 4, 6, 8, 128, 128, 128, // - 128, 128, 128, 128, 128, 0, 128, 128, // - 0, 128, 128, 128, 128, 2, 128, 128, // - 128, 0, 128, 128, 128, 2, 128, 128, // - 0, 2, 128, 128, 128, 4, 128, 128, // - 128, 128, 0, 128, 128, 2, 128, 128, // - 0, 128, 2, 128, 128, 4, 128, 128, // - 128, 0, 2, 128, 128, 4, 128, 128, // - 0, 2, 4, 128, 128, 6, 128, 128, // - 128, 128, 128, 0, 128, 2, 128, 128, // - 0, 128, 128, 2, 128, 4, 128, 128, // - 128, 0, 128, 2, 128, 4, 128, 128, // - 0, 2, 128, 4, 128, 6, 128, 128, // - 128, 128, 0, 2, 128, 4, 128, 128, // - 0, 128, 2, 4, 128, 6, 128, 128, // - 128, 0, 2, 4, 128, 6, 128, 128, // - 0, 2, 4, 6, 128, 8, 128, 128, // - 128, 128, 128, 128, 0, 2, 128, 128, // - 0, 128, 128, 128, 2, 4, 
128, 128, // - 128, 0, 128, 128, 2, 4, 128, 128, // - 0, 2, 128, 128, 4, 6, 128, 128, // - 128, 128, 0, 128, 2, 4, 128, 128, // - 0, 128, 2, 128, 4, 6, 128, 128, // - 128, 0, 2, 128, 4, 6, 128, 128, // - 0, 2, 4, 128, 6, 8, 128, 128, // - 128, 128, 128, 0, 2, 4, 128, 128, // - 0, 128, 128, 2, 4, 6, 128, 128, // - 128, 0, 128, 2, 4, 6, 128, 128, // - 0, 2, 128, 4, 6, 8, 128, 128, // - 128, 128, 0, 2, 4, 6, 128, 128, // - 0, 128, 2, 4, 6, 8, 128, 128, // - 128, 0, 2, 4, 6, 8, 128, 128, // - 0, 2, 4, 6, 8, 10, 128, 128, // - 128, 128, 128, 128, 128, 128, 0, 128, // - 0, 128, 128, 128, 128, 128, 2, 128, // - 128, 0, 128, 128, 128, 128, 2, 128, // - 0, 2, 128, 128, 128, 128, 4, 128, // - 128, 128, 0, 128, 128, 128, 2, 128, // - 0, 128, 2, 128, 128, 128, 4, 128, // - 128, 0, 2, 128, 128, 128, 4, 128, // - 0, 2, 4, 128, 128, 128, 6, 128, // - 128, 128, 128, 0, 128, 128, 2, 128, // - 0, 128, 128, 2, 128, 128, 4, 128, // - 128, 0, 128, 2, 128, 128, 4, 128, // - 0, 2, 128, 4, 128, 128, 6, 128, // - 128, 128, 0, 2, 128, 128, 4, 128, // - 0, 128, 2, 4, 128, 128, 6, 128, // - 128, 0, 2, 4, 128, 128, 6, 128, // - 0, 2, 4, 6, 128, 128, 8, 128, // - 128, 128, 128, 128, 0, 128, 2, 128, // - 0, 128, 128, 128, 2, 128, 4, 128, // - 128, 0, 128, 128, 2, 128, 4, 128, // - 0, 2, 128, 128, 4, 128, 6, 128, // - 128, 128, 0, 128, 2, 128, 4, 128, // - 0, 128, 2, 128, 4, 128, 6, 128, // - 128, 0, 2, 128, 4, 128, 6, 128, // - 0, 2, 4, 128, 6, 128, 8, 128, // - 128, 128, 128, 0, 2, 128, 4, 128, // - 0, 128, 128, 2, 4, 128, 6, 128, // - 128, 0, 128, 2, 4, 128, 6, 128, // - 0, 2, 128, 4, 6, 128, 8, 128, // - 128, 128, 0, 2, 4, 128, 6, 128, // - 0, 128, 2, 4, 6, 128, 8, 128, // - 128, 0, 2, 4, 6, 128, 8, 128, // - 0, 2, 4, 6, 8, 128, 10, 128, // - 128, 128, 128, 128, 128, 0, 2, 128, // - 0, 128, 128, 128, 128, 2, 4, 128, // - 128, 0, 128, 128, 128, 2, 4, 128, // - 0, 2, 128, 128, 128, 4, 6, 128, // - 128, 128, 0, 128, 128, 2, 4, 128, // - 0, 128, 2, 128, 128, 4, 6, 128, // - 128, 0, 2, 128, 128, 4, 6, 128, // - 0, 2, 4, 128, 128, 6, 8, 128, // - 128, 128, 128, 0, 128, 2, 4, 128, // - 0, 128, 128, 2, 128, 4, 6, 128, // - 128, 0, 128, 2, 128, 4, 6, 128, // - 0, 2, 128, 4, 128, 6, 8, 128, // - 128, 128, 0, 2, 128, 4, 6, 128, // - 0, 128, 2, 4, 128, 6, 8, 128, // - 128, 0, 2, 4, 128, 6, 8, 128, // - 0, 2, 4, 6, 128, 8, 10, 128, // - 128, 128, 128, 128, 0, 2, 4, 128, // - 0, 128, 128, 128, 2, 4, 6, 128, // - 128, 0, 128, 128, 2, 4, 6, 128, // - 0, 2, 128, 128, 4, 6, 8, 128, // - 128, 128, 0, 128, 2, 4, 6, 128, // - 0, 128, 2, 128, 4, 6, 8, 128, // - 128, 0, 2, 128, 4, 6, 8, 128, // - 0, 2, 4, 128, 6, 8, 10, 128, // - 128, 128, 128, 0, 2, 4, 6, 128, // - 0, 128, 128, 2, 4, 6, 8, 128, // - 128, 0, 128, 2, 4, 6, 8, 128, // - 0, 2, 128, 4, 6, 8, 10, 128, // - 128, 128, 0, 2, 4, 6, 8, 128, // - 0, 128, 2, 4, 6, 8, 10, 128, // - 128, 0, 2, 4, 6, 8, 10, 128, // - 0, 2, 4, 6, 8, 10, 12, 128, // - 128, 128, 128, 128, 128, 128, 128, 0, // - 0, 128, 128, 128, 128, 128, 128, 2, // - 128, 0, 128, 128, 128, 128, 128, 2, // - 0, 2, 128, 128, 128, 128, 128, 4, // - 128, 128, 0, 128, 128, 128, 128, 2, // - 0, 128, 2, 128, 128, 128, 128, 4, // - 128, 0, 2, 128, 128, 128, 128, 4, // - 0, 2, 4, 128, 128, 128, 128, 6, // - 128, 128, 128, 0, 128, 128, 128, 2, // - 0, 128, 128, 2, 128, 128, 128, 4, // - 128, 0, 128, 2, 128, 128, 128, 4, // - 0, 2, 128, 4, 128, 128, 128, 6, // - 128, 128, 0, 2, 128, 128, 128, 4, // - 0, 128, 2, 4, 128, 128, 128, 6, // - 128, 0, 2, 4, 128, 128, 128, 6, // - 0, 2, 4, 6, 128, 128, 128, 8, // - 128, 128, 128, 128, 0, 
128, 128, 2, // - 0, 128, 128, 128, 2, 128, 128, 4, // - 128, 0, 128, 128, 2, 128, 128, 4, // - 0, 2, 128, 128, 4, 128, 128, 6, // - 128, 128, 0, 128, 2, 128, 128, 4, // - 0, 128, 2, 128, 4, 128, 128, 6, // - 128, 0, 2, 128, 4, 128, 128, 6, // - 0, 2, 4, 128, 6, 128, 128, 8, // - 128, 128, 128, 0, 2, 128, 128, 4, // - 0, 128, 128, 2, 4, 128, 128, 6, // - 128, 0, 128, 2, 4, 128, 128, 6, // - 0, 2, 128, 4, 6, 128, 128, 8, // - 128, 128, 0, 2, 4, 128, 128, 6, // - 0, 128, 2, 4, 6, 128, 128, 8, // - 128, 0, 2, 4, 6, 128, 128, 8, // - 0, 2, 4, 6, 8, 128, 128, 10, // - 128, 128, 128, 128, 128, 0, 128, 2, // - 0, 128, 128, 128, 128, 2, 128, 4, // - 128, 0, 128, 128, 128, 2, 128, 4, // - 0, 2, 128, 128, 128, 4, 128, 6, // - 128, 128, 0, 128, 128, 2, 128, 4, // - 0, 128, 2, 128, 128, 4, 128, 6, // - 128, 0, 2, 128, 128, 4, 128, 6, // - 0, 2, 4, 128, 128, 6, 128, 8, // - 128, 128, 128, 0, 128, 2, 128, 4, // - 0, 128, 128, 2, 128, 4, 128, 6, // - 128, 0, 128, 2, 128, 4, 128, 6, // - 0, 2, 128, 4, 128, 6, 128, 8, // - 128, 128, 0, 2, 128, 4, 128, 6, // - 0, 128, 2, 4, 128, 6, 128, 8, // - 128, 0, 2, 4, 128, 6, 128, 8, // - 0, 2, 4, 6, 128, 8, 128, 10, // - 128, 128, 128, 128, 0, 2, 128, 4, // - 0, 128, 128, 128, 2, 4, 128, 6, // - 128, 0, 128, 128, 2, 4, 128, 6, // - 0, 2, 128, 128, 4, 6, 128, 8, // - 128, 128, 0, 128, 2, 4, 128, 6, // - 0, 128, 2, 128, 4, 6, 128, 8, // - 128, 0, 2, 128, 4, 6, 128, 8, // - 0, 2, 4, 128, 6, 8, 128, 10, // - 128, 128, 128, 0, 2, 4, 128, 6, // - 0, 128, 128, 2, 4, 6, 128, 8, // - 128, 0, 128, 2, 4, 6, 128, 8, // - 0, 2, 128, 4, 6, 8, 128, 10, // - 128, 128, 0, 2, 4, 6, 128, 8, // - 0, 128, 2, 4, 6, 8, 128, 10, // - 128, 0, 2, 4, 6, 8, 128, 10, // - 0, 2, 4, 6, 8, 10, 128, 12, // - 128, 128, 128, 128, 128, 128, 0, 2, // - 0, 128, 128, 128, 128, 128, 2, 4, // - 128, 0, 128, 128, 128, 128, 2, 4, // - 0, 2, 128, 128, 128, 128, 4, 6, // - 128, 128, 0, 128, 128, 128, 2, 4, // - 0, 128, 2, 128, 128, 128, 4, 6, // - 128, 0, 2, 128, 128, 128, 4, 6, // - 0, 2, 4, 128, 128, 128, 6, 8, // - 128, 128, 128, 0, 128, 128, 2, 4, // - 0, 128, 128, 2, 128, 128, 4, 6, // - 128, 0, 128, 2, 128, 128, 4, 6, // - 0, 2, 128, 4, 128, 128, 6, 8, // - 128, 128, 0, 2, 128, 128, 4, 6, // - 0, 128, 2, 4, 128, 128, 6, 8, // - 128, 0, 2, 4, 128, 128, 6, 8, // - 0, 2, 4, 6, 128, 128, 8, 10, // - 128, 128, 128, 128, 0, 128, 2, 4, // - 0, 128, 128, 128, 2, 128, 4, 6, // - 128, 0, 128, 128, 2, 128, 4, 6, // - 0, 2, 128, 128, 4, 128, 6, 8, // - 128, 128, 0, 128, 2, 128, 4, 6, // - 0, 128, 2, 128, 4, 128, 6, 8, // - 128, 0, 2, 128, 4, 128, 6, 8, // - 0, 2, 4, 128, 6, 128, 8, 10, // - 128, 128, 128, 0, 2, 128, 4, 6, // - 0, 128, 128, 2, 4, 128, 6, 8, // - 128, 0, 128, 2, 4, 128, 6, 8, // - 0, 2, 128, 4, 6, 128, 8, 10, // - 128, 128, 0, 2, 4, 128, 6, 8, // - 0, 128, 2, 4, 6, 128, 8, 10, // - 128, 0, 2, 4, 6, 128, 8, 10, // - 0, 2, 4, 6, 8, 128, 10, 12, // - 128, 128, 128, 128, 128, 0, 2, 4, // - 0, 128, 128, 128, 128, 2, 4, 6, // - 128, 0, 128, 128, 128, 2, 4, 6, // - 0, 2, 128, 128, 128, 4, 6, 8, // - 128, 128, 0, 128, 128, 2, 4, 6, // - 0, 128, 2, 128, 128, 4, 6, 8, // - 128, 0, 2, 128, 128, 4, 6, 8, // - 0, 2, 4, 128, 128, 6, 8, 10, // - 128, 128, 128, 0, 128, 2, 4, 6, // - 0, 128, 128, 2, 128, 4, 6, 8, // - 128, 0, 128, 2, 128, 4, 6, 8, // - 0, 2, 128, 4, 128, 6, 8, 10, // - 128, 128, 0, 2, 128, 4, 6, 8, // - 0, 128, 2, 4, 128, 6, 8, 10, // - 128, 0, 2, 4, 128, 6, 8, 10, // - 0, 2, 4, 6, 128, 8, 10, 12, // - 128, 128, 128, 128, 0, 2, 4, 6, // - 0, 128, 128, 128, 2, 4, 6, 8, // - 128, 0, 128, 128, 2, 4, 6, 
8, // - 0, 2, 128, 128, 4, 6, 8, 10, // - 128, 128, 0, 128, 2, 4, 6, 8, // - 0, 128, 2, 128, 4, 6, 8, 10, // - 128, 0, 2, 128, 4, 6, 8, 10, // - 0, 2, 4, 128, 6, 8, 10, 12, // - 128, 128, 128, 0, 2, 4, 6, 8, // - 0, 128, 128, 2, 4, 6, 8, 10, // - 128, 0, 128, 2, 4, 6, 8, 10, // - 0, 2, 128, 4, 6, 8, 10, 12, // - 128, 128, 0, 2, 4, 6, 8, 10, // - 0, 128, 2, 4, 6, 8, 10, 12, // - 128, 0, 2, 4, 6, 8, 10, 12, // - 0, 2, 4, 6, 8, 10, 12, 14}; - // Extend to double length because InterleaveLower will only use the (valid) - // lower half, and we want N u16. - const Twice du8x2; - const Vec128 indices8 = - ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8)); - const Vec128 indices16 = - BitCast(du, InterleaveLower(du8x2, indices8, indices8)); - // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte - // indices, add 0 to even and 1 to odd byte lanes. - const Vec128 byte_indices = Add(indices16, Set(du, 0x0100)); - return BitCast(d, TableLookupBytesOr0(v, byte_indices)); -} - -template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - const DFromV d; - const RebindToUnsigned du; - - const uint64_t mask_bits = detail::BitsFromMask(mask); - - alignas(16) static constexpr uint32_t packed_array[16] = { - // PrintExpand64x4Nibble - same for 32x4. - 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, - 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, - 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; - - // For lane i, shift the i-th 4-bit index down to bits [0, 2). - const Vec128 packed = Set(du, packed_array[mask_bits]); - alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12}; - Vec128 indices = packed >> Load(du, shifts); - // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec - // checks bounds, so clear the upper bits. - indices = And(indices, Set(du, N - 1)); - const Vec128 expand = - TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices)); - // TableLookupLanes cannot also zero masked-off lanes, so do that now. - return IfThenElseZero(mask, BitCast(d, expand)); -} - -template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - // Same as Compress, just zero out the mask=false lanes. - return IfThenElseZero(mask, Compress(v, mask)); -} - -// For single-element vectors, this is at least as fast as native. 
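// With a single lane there is nothing to shift into place: the lone input lane
// is either kept (mask true) or zeroed (mask false), which is exactly
// IfThenElseZero.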
-template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - return IfThenElseZero(mask, v); -} - -// ------------------------------ LoadExpand -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { - return Expand(LoadU(d, unaligned), mask); -} - -#endif // HWY_NATIVE_EXPAND - -// ------------------------------ TwoTablesLookupLanes - -template -using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned()))); - -// RVV/SVE have their own implementations of -// TwoTablesLookupLanes(D d, VFromD a, VFromD b, IndicesFromD idx) -#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \ - HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \ - HWY_TARGET != HWY_SVE2_128 -template -HWY_API VFromD TwoTablesLookupLanes(D /*d*/, VFromD a, VFromD b, - IndicesFromD idx) { - return TwoTablesLookupLanes(a, b, idx); -} -#endif - -// ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit) - -#if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE -#ifdef HWY_NATIVE_REVERSE2_8 -#undef HWY_NATIVE_REVERSE2_8 -#else -#define HWY_NATIVE_REVERSE2_8 -#endif - -#undef HWY_PREFER_ROTATE -// Platforms on which RotateRight is likely faster than TableLookupBytes. -// RVV and SVE anyway have their own implementation of this. -#if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \ - HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8 -#define HWY_PREFER_ROTATE 1 -#else -#define HWY_PREFER_ROTATE 0 -#endif - -template -HWY_API VFromD Reverse2(D d, VFromD v) { - // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions. -#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3 - const Repartition du16; - return BitCast(d, RotateRight<8>(BitCast(du16, v))); -#else - alignas(16) static constexpr TFromD kShuffle[16] = { - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - return TableLookupBytes(v, LoadDup128(d, kShuffle)); -#endif -} - -template -HWY_API VFromD Reverse4(D d, VFromD v) { -#if HWY_PREFER_ROTATE - const Repartition du16; - return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v)))); -#else - alignas(16) static constexpr uint8_t kShuffle[16] = { - 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}; - const Repartition du8; - return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle))); -#endif -} - -template -HWY_API VFromD Reverse8(D d, VFromD v) { -#if HWY_PREFER_ROTATE - const Repartition du32; - return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v)))); -#else - alignas(16) static constexpr uint8_t kShuffle[16] = { - 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; - const Repartition du8; - return TableLookupBytes(v, BitCast(d, LoadDup128(du8, kShuffle))); -#endif -} - -#endif // HWY_NATIVE_REVERSE2_8 - -// ------------------------------ ReverseLaneBytes - -#if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_REVERSE_LANE_BYTES -#undef HWY_NATIVE_REVERSE_LANE_BYTES -#else -#define HWY_NATIVE_REVERSE_LANE_BYTES -#endif - -template -HWY_API V ReverseLaneBytes(V v) { - const DFromV d; - const Repartition du8; - return BitCast(d, Reverse2(du8, BitCast(du8, v))); -} - -template -HWY_API V ReverseLaneBytes(V v) { - const DFromV d; - const Repartition du8; - return BitCast(d, Reverse4(du8, BitCast(du8, v))); -} - -template -HWY_API V ReverseLaneBytes(V v) { - const DFromV d; - const Repartition du8; - return BitCast(d, Reverse8(du8, BitCast(du8, v))); -} - -#endif // HWY_NATIVE_REVERSE_LANE_BYTES - -// ------------------------------ ReverseBits - 
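// ReverseBits reverses the order of the bits within each lane. For 8-bit lanes
// the generic implementation below uses the classic three mask-and-shift
// exchange steps (shift distances 1, 2, 4 with masks 0x55, 0x33, 0x0F). An
// illustrative scalar equivalent for a single byte (not part of this header):

static inline uint8_t ScalarReverseBits8(uint8_t x) {
  x = static_cast<uint8_t>(((x >> 1) & 0x55) | ((x & 0x55) << 1));  // Swap adjacent bits.
  x = static_cast<uint8_t>(((x >> 2) & 0x33) | ((x & 0x33) << 2));  // Swap bit pairs.
  x = static_cast<uint8_t>(((x >> 4) & 0x0F) | ((x & 0x0F) << 4));  // Swap nibbles.
  return x;
}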
-// On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore -// require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit -// shifts because those would add extra masking already taken care of by -// UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to -// implement ReverseBits, so this code is not used there. -#undef HWY_REVERSE_BITS_MIN_BYTES -#if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \ - HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256) -#define HWY_REVERSE_BITS_MIN_BYTES 2 -#else -#define HWY_REVERSE_BITS_MIN_BYTES 1 -#endif - -#if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_REVERSE_BITS_UI8 -#undef HWY_NATIVE_REVERSE_BITS_UI8 -#else -#define HWY_NATIVE_REVERSE_BITS_UI8 -#endif - -namespace detail { - -template , HWY_REVERSE_BITS_MIN_BYTES - 1)> -HWY_INLINE V UI8ReverseBitsStep(V v) { - const DFromV d; - const RebindToUnsigned du; -#if HWY_REVERSE_BITS_MIN_BYTES == 2 - const Repartition d_shift; -#else - const RebindToUnsigned d_shift; -#endif - - const auto v_to_shift = BitCast(d_shift, v); - const auto shl_result = BitCast(d, ShiftLeft(v_to_shift)); - const auto shr_result = BitCast(d, ShiftRight(v_to_shift)); - const auto shr_result_mask = - BitCast(d, Set(du, static_cast(kShrResultMask))); - return Or(And(shr_result, shr_result_mask), - AndNot(shr_result_mask, shl_result)); -} - -#if HWY_REVERSE_BITS_MIN_BYTES == 2 -template , 1)> -HWY_INLINE V UI8ReverseBitsStep(V v) { - return V{UI8ReverseBitsStep(Vec128{v.raw}) - .raw}; -} -#endif - -} // namespace detail - -template -HWY_API V ReverseBits(V v) { - auto result = detail::UI8ReverseBitsStep<1, 0x55>(v); - result = detail::UI8ReverseBitsStep<2, 0x33>(result); - result = detail::UI8ReverseBitsStep<4, 0x0F>(result); - return result; -} - -#endif // HWY_NATIVE_REVERSE_BITS_UI8 - -#if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#else -#define HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#endif - -template -HWY_API V ReverseBits(V v) { - const DFromV d; - const Repartition du8; - return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))); -} -#endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64 - -// ------------------------------ Per4LaneBlockShuffle - -#if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#else -#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#endif - -#if HWY_TARGET != HWY_SCALAR -namespace detail { - -template -HWY_INLINE Vec Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, - const uint32_t x2, - const uint32_t x1, - const uint32_t x0) { - alignas(16) const uint32_t lanes[4] = {x0, x1, x2, x3}; - -#if HWY_TARGET == HWY_RVV - constexpr int kPow2 = d.Pow2(); - constexpr int kLoadPow2 = HWY_MAX(kPow2, -1); - const ScalableTag d_load; -#else - constexpr size_t kMaxBytes = d.MaxBytes(); -#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES - constexpr size_t kMinLanesToLoad = 2; -#else - constexpr size_t kMinLanesToLoad = 4; -#endif - constexpr size_t kNumToLoad = - HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad); - const CappedTag d_load; -#endif - - return ResizeBitCast(d, LoadDup128(d_load, lanes)); -} - -} // namespace detail -#endif - -#endif // HWY_NATIVE_PER4LANEBLKSHUF_DUP32 - -#if HWY_TARGET != HWY_SCALAR -namespace detail { - -template -HWY_INLINE V 
Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) { - return DupEven(v); -} - -template -HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) { - const DFromV d; - return Reverse2(d, v); -} - -template -HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) { - return v; -} - -template -HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) { - return DupOdd(v); -} - -HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { -#if HWY_IS_LITTLE_ENDIAN - return static_cast((idx3 << 24) | (idx2 << 16) | (idx1 << 8) | - idx0); -#else - return static_cast(idx3 | (idx2 << 8) | (idx1 << 16) | - (idx0 << 24)); -#endif -} - -template -HWY_INLINE Vec TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { -#if HWY_TARGET == HWY_RVV - const AdjustSimdTagToMinVecPow2> du32; -#else - const Repartition du32; -#endif - - return ResizeBitCast( - d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0))); -} - -#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \ - HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128 -#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr -#else -#define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8) - -template -HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) { - const DFromV d; - const Repartition du8; - return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx))); -} - -template -HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - const Repartition du32; - const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0); - const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( - du32, static_cast(idx3210 + 0x0C0C0C0C), - static_cast(idx3210 + 0x08080808), - static_cast(idx3210 + 0x04040404), - static_cast(idx3210)); - return ResizeBitCast(d, v_byte_idx); -} - -template -HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - const Repartition du32; -#if HWY_IS_LITTLE_ENDIAN - const uint32_t idx10 = static_cast((idx1 << 16) | idx0); - const uint32_t idx32 = static_cast((idx3 << 16) | idx2); - constexpr uint32_t kLaneByteOffsets{0x01000100}; -#else - const uint32_t idx10 = static_cast(idx1 | (idx0 << 16)); - const uint32_t idx32 = static_cast(idx3 | (idx2 << 16)); - constexpr uint32_t kLaneByteOffsets{0x00010001}; -#endif - constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u}; - - const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( - du32, static_cast(idx32 * 0x0202u + kHiLaneByteOffsets), - static_cast(idx10 * 0x0202u + kHiLaneByteOffsets), - static_cast(idx32 * 0x0202u + kLaneByteOffsets), - static_cast(idx10 * 0x0202u + kLaneByteOffsets)); - return ResizeBitCast(d, v_byte_idx); -} - -template -HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - const Repartition du32; -#if HWY_IS_LITTLE_ENDIAN - constexpr uint32_t kLaneByteOffsets{0x03020100}; -#else - constexpr uint32_t kLaneByteOffsets{0x00010203}; -#endif - - const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( - du32, static_cast(idx3 * 0x04040404u + kLaneByteOffsets), - static_cast(idx2 * 0x04040404u + kLaneByteOffsets), - static_cast(idx1 * 0x04040404u + kLaneByteOffsets), - 
static_cast(idx0 * 0x04040404u + kLaneByteOffsets)); - return ResizeBitCast(d, v_byte_idx); -} -#endif - -template -HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0); -} - -#if HWY_TARGET == HWY_RVV -template -HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - const Rebind du8; - return PromoteTo(d, - TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0)); -} -#else -template -HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - const uint16_t u16_idx0 = static_cast(idx0); - const uint16_t u16_idx1 = static_cast(idx1); - const uint16_t u16_idx2 = static_cast(idx2); - const uint16_t u16_idx3 = static_cast(idx3); - alignas(16) - const uint16_t indices[8] = {u16_idx0, u16_idx1, u16_idx2, u16_idx3, - u16_idx0, u16_idx1, u16_idx2, u16_idx3}; - -#if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES - constexpr size_t kMinLanesToLoad = 4; -#else - constexpr size_t kMinLanesToLoad = 8; -#endif - constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad); - const CappedTag d_load; - - return ResizeBitCast(d, LoadDup128(d_load, indices)); -} - -template -HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0); -} - -template -HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - const RebindToUnsigned du; - const Rebind du32; - return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2, - idx1, idx0))); -} -#endif - -template -HWY_INLINE IndicesFromD TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, - const uint32_t idx2, - const uint32_t idx1, - const uint32_t idx0) { - const RebindToUnsigned du; - using TU = TFromD; - auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0); - - constexpr size_t kN = HWY_MAX_LANES_D(D); - if (kN < 4) { - idx_in_blk = And(idx_in_blk, Set(du, static_cast(kN - 1))); - } - -#if HWY_TARGET == HWY_RVV - const auto blk_offsets = AndS(Iota0(du), static_cast(~TU{3})); -#else - const auto blk_offsets = - And(Iota(du, TU{0}), Set(du, static_cast(~TU{3}))); -#endif - return IndicesFromVec(d, Add(idx_in_blk, blk_offsets)); -} - -template )> -HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD> idx) { - return TableLookupLanes(v, idx); -} - -#undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE - -template -HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) { - const DFromV d; - const uint32_t idx3 = static_cast((idx3210 >> 6) & 3); - const uint32_t idx2 = static_cast((idx3210 >> 4) & 3); - const uint32_t idx1 = static_cast((idx3210 >> 2) & 3); - const uint32_t idx0 = static_cast(idx3210 & 3); - const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0); - return Per4LaneBlkShufDoTblLookup(v, idx); -} - -// The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag -// and vect_size_tag parameters are only called for vectors that have at -// least 4 lanes (or scalable vectors that might possibly have 4 or more lanes) -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, 
- hwy::SizeTag /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - return TblLookupPer4LaneBlkShuf(v, kIdx3210); -} - -#if HWY_HAVE_FLOAT64 -template -HWY_INLINE VFromD>> Per4LaneBlockShufCastToWide( - hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) { - const DFromV d; - const RepartitionToWide dw; - return BitCast(dw, v); -} -#endif - -template -HWY_INLINE VFromD>>> -Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */, - hwy::SizeTag /* lane_size_tag */, V v) { - const DFromV d; - const RebindToUnsigned du; - const RepartitionToWide dw; - return BitCast(dw, v); -} - -template -HWY_INLINE VFromD>> Per4LaneBlockShufCastToWide( - hwy::NonFloatTag /* type_tag */, - hwy::SizeTag /* lane_size_tag */, V v) { - const DFromV d; - const RepartitionToWide dw; - return BitCast(dw, v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) { - const DFromV d; - return Reverse4(d, v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) { - const DFromV d; - const auto vw = Per4LaneBlockShufCastToWide( - hwy::IsFloatTag>(), hwy::SizeTag)>(), v); - return BitCast(d, DupEven(vw)); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { - const DFromV d; - const auto vw = Per4LaneBlockShufCastToWide( - hwy::IsFloatTag>(), hwy::SizeTag)>(), v); - const DFromV dw; - return BitCast(d, Reverse2(dw, vw)); -} - -#if HWY_MAX_BYTES >= 32 -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { - return SwapAdjacentBlocks(v); -} -#endif - -template , 4), - HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { - const DFromV d; - return InterleaveLower(d, v, v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { - const DFromV d; - return InterleaveLower(d, v, v); -} - -template , 4)> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) { - const DFromV d; - return ConcatEven(d, v, v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) { - return DupEven(v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) { - const DFromV d; - return Reverse2(d, v); -} - -template , 4)> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) { - const DFromV d; - return ConcatOdd(d, v, v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) { - return v; -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) { - const DFromV d; - const auto vw = Per4LaneBlockShufCastToWide( - hwy::IsFloatTag>(), hwy::SizeTag)>(), v); - return BitCast(d, DupOdd(vw)); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) { - return DupOdd(v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) { - const DFromV d; - return InterleaveUpper(d, v, v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag idx_3210_tag, V v) { - const DFromV d; - return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag)>(), - hwy::SizeTag(), v); -} - -} // namespace detail -#endif // HWY_TARGET != HWY_SCALAR - -template , 1)> -HWY_API V Per4LaneBlockShuffle(V v) { - static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); - static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); - 
static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); - static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); - - return v; -} - -#if HWY_TARGET != HWY_SCALAR -template , 2)> -HWY_API V Per4LaneBlockShuffle(V v) { - static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); - static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); - static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); - static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); - - constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1); - constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0); - constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 0 : 1); - - constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0; - static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true"); - return detail::Per2LaneBlockShuffle(hwy::SizeTag(), v); -} - -template , 2)> -HWY_API V Per4LaneBlockShuffle(V v) { - static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); - static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); - static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); - static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); - - constexpr size_t kIdx3210 = - (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0; - return detail::Per4LaneBlockShuffle(hwy::SizeTag(), v); -} -#endif - -// ------------------------------ Blocks - -template -HWY_API size_t Blocks(D d) { - return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD) + 15) / 16); -} - -// ------------------------------ Block insert/extract/broadcast ops -#if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT -#undef HWY_NATIVE_BLK_INSERT_EXTRACT -#else -#define HWY_NATIVE_BLK_INSERT_EXTRACT -#endif - -template -HWY_API V InsertBlock(V /*v*/, V blk_to_insert) { - static_assert(kBlockIdx == 0, "Invalid block index"); - return blk_to_insert; -} - -template -HWY_API V ExtractBlock(V v) { - static_assert(kBlockIdx == 0, "Invalid block index"); - return v; -} - -template -HWY_API V BroadcastBlock(V v) { - static_assert(kBlockIdx == 0, "Invalid block index"); - return v; -} - -#endif // HWY_NATIVE_BLK_INSERT_EXTRACT - -// ------------------------------ BroadcastLane -#if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_BROADCASTLANE -#undef HWY_NATIVE_BROADCASTLANE -#else -#define HWY_NATIVE_BROADCASTLANE -#endif - -template -HWY_API V BroadcastLane(V v) { - return Broadcast(v); -} - -#endif // HWY_NATIVE_BROADCASTLANE - -// ------------------------------ Slide1Up and Slide1Down -#if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_SLIDE1_UP_DOWN -#undef HWY_NATIVE_SLIDE1_UP_DOWN -#else -#define HWY_NATIVE_SLIDE1_UP_DOWN -#endif - -template -HWY_API VFromD Slide1Up(D d, VFromD /*v*/) { - return Zero(d); -} -template -HWY_API VFromD Slide1Down(D d, VFromD /*v*/) { - return Zero(d); -} - -#if HWY_TARGET != HWY_SCALAR -template -HWY_API VFromD Slide1Up(D d, VFromD v) { - return ShiftLeftLanes<1>(d, v); -} -template -HWY_API VFromD Slide1Down(D d, VFromD v) { - return ShiftRightLanes<1>(d, v); -} -#endif // HWY_TARGET != HWY_SCALAR - -#endif // HWY_NATIVE_SLIDE1_UP_DOWN - -// ------------------------------ SlideUpBlocks - -template -HWY_API VFromD SlideUpBlocks(D /*d*/, VFromD v) { - static_assert(kBlocks == 0, "kBlocks == 0 must be true"); - return v; -} - -#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 -template -HWY_API VFromD SlideUpBlocks(D d, VFromD v) { - static_assert(0 <= kBlocks && static_cast(kBlocks) 
< d.MaxBlocks(), - "kBlocks must be between 0 and d.MaxBlocks() - 1"); - constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); - return SlideUpLanes(d, v, static_cast(kBlocks) * kLanesPerBlock); -} -#endif - -// ------------------------------ SlideDownBlocks - -template -HWY_API VFromD SlideDownBlocks(D /*d*/, VFromD v) { - static_assert(kBlocks == 0, "kBlocks == 0 must be true"); - return v; -} - -#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 -template -HWY_API VFromD SlideDownBlocks(D d, VFromD v) { - static_assert(0 <= kBlocks && static_cast(kBlocks) < d.MaxBlocks(), - "kBlocks must be between 0 and d.MaxBlocks() - 1"); - constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); - return SlideDownLanes(d, v, static_cast(kBlocks) * kLanesPerBlock); -} -#endif - -// ================================================== Operator wrapper - -// SVE* and RVV currently cannot define operators and have already defined -// (only) the corresponding functions such as Add. -#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE)) -#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS -#undef HWY_NATIVE_OPERATOR_REPLACEMENTS -#else -#define HWY_NATIVE_OPERATOR_REPLACEMENTS -#endif - -template -HWY_API V Add(V a, V b) { - return a + b; -} -template -HWY_API V Sub(V a, V b) { - return a - b; -} - -template -HWY_API V Mul(V a, V b) { - return a * b; -} -template -HWY_API V Div(V a, V b) { - return a / b; -} - -template -V Shl(V a, V b) { - return a << b; -} -template -V Shr(V a, V b) { - return a >> b; -} - -template -HWY_API auto Eq(V a, V b) -> decltype(a == b) { - return a == b; -} -template -HWY_API auto Ne(V a, V b) -> decltype(a == b) { - return a != b; -} -template -HWY_API auto Lt(V a, V b) -> decltype(a == b) { - return a < b; -} - -template -HWY_API auto Gt(V a, V b) -> decltype(a == b) { - return a > b; -} -template -HWY_API auto Ge(V a, V b) -> decltype(a == b) { - return a >= b; -} - -template -HWY_API auto Le(V a, V b) -> decltype(a == b) { - return a <= b; -} - -#endif // HWY_NATIVE_OPERATOR_REPLACEMENTS - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/ppc_vsx-inl.h b/deps/highway/include/hwy/ops/ppc_vsx-inl.h deleted file mode 100644 index dfa99038..00000000 --- a/deps/highway/include/hwy/ops/ppc_vsx-inl.h +++ /dev/null @@ -1,5339 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// 128-bit vectors for VSX -// External include guard in highway.h - see comment there. - -#pragma push_macro("vector") -#pragma push_macro("pixel") -#pragma push_macro("bool") - -#undef vector -#undef pixel -#undef bool - -#include - -#pragma pop_macro("vector") -#pragma pop_macro("pixel") -#pragma pop_macro("bool") - -#include "hwy/ops/shared-inl.h" - -// clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__, and -// some GCC do the same for _ARCH_PWR10. 
-// This means we can only use POWER10-specific intrinsics in static dispatch -// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9. -// On other compilers, the usual target check is sufficient. -#if HWY_TARGET <= HWY_PPC9 && \ - (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__)) -#define HWY_PPC_HAVE_9 1 -#else -#define HWY_PPC_HAVE_9 0 -#endif - -#if HWY_TARGET <= HWY_PPC10 && \ - (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__)) -#define HWY_PPC_HAVE_10 1 -#else -#define HWY_PPC_HAVE_10 0 -#endif - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { -namespace detail { - -template -struct Raw128; - -// Each Raw128 specialization defines the following typedefs: -// - type: -// the backing Altivec/VSX raw vector type of the Vec128 type -// - RawBoolVec: -// the backing Altivec/VSX raw __bool vector type of the Mask128 type -// - RawT: -// the lane type for intrinsics, in particular vec_splat -// - AlignedRawVec: -// the 128-bit GCC/Clang vector type for aligned loads/stores -// - UnalignedRawVec: -// the 128-bit GCC/Clang vector type for unaligned loads/stores -#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \ - template <> \ - struct Raw128 { \ - using type = __vector RAW_VECT_LANE_TYPE; \ - using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE; \ - using RawT = RAW_VECT_LANE_TYPE; \ - typedef LANE_TYPE AlignedRawVec \ - __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); \ - typedef LANE_TYPE UnalignedRawVec __attribute__(( \ - __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \ - }; - -HWY_VSX_RAW128(int8_t, signed char, char) -HWY_VSX_RAW128(uint8_t, unsigned char, char) -HWY_VSX_RAW128(int16_t, signed short, short) // NOLINT(runtime/int) -HWY_VSX_RAW128(uint16_t, unsigned short, short) // NOLINT(runtime/int) -HWY_VSX_RAW128(int32_t, signed int, int) -HWY_VSX_RAW128(uint32_t, unsigned int, int) -HWY_VSX_RAW128(int64_t, signed long long, long long) // NOLINT(runtime/int) -HWY_VSX_RAW128(uint64_t, unsigned long long, long long) // NOLINT(runtime/int) -HWY_VSX_RAW128(float, float, int) -HWY_VSX_RAW128(double, double, long long) // NOLINT(runtime/int) - -template <> -struct Raw128 : public Raw128 {}; - -template <> -struct Raw128 : public Raw128 {}; - -#undef HWY_VSX_RAW128 - -} // namespace detail - -template -class Vec128 { - using Raw = typename detail::Raw128::type; - - public: - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = N; // only for DFromV - - // Compound assignment. Only usable if there is a corresponding non-member - // binary operator overload. For example, only f32 and f64 support division. - HWY_INLINE Vec128& operator*=(const Vec128 other) { - return *this = (*this * other); - } - HWY_INLINE Vec128& operator/=(const Vec128 other) { - return *this = (*this / other); - } - HWY_INLINE Vec128& operator+=(const Vec128 other) { - return *this = (*this + other); - } - HWY_INLINE Vec128& operator-=(const Vec128 other) { - return *this = (*this - other); - } - HWY_INLINE Vec128& operator&=(const Vec128 other) { - return *this = (*this & other); - } - HWY_INLINE Vec128& operator|=(const Vec128 other) { - return *this = (*this | other); - } - HWY_INLINE Vec128& operator^=(const Vec128 other) { - return *this = (*this ^ other); - } - - Raw raw; -}; - -template -using Vec64 = Vec128; - -template -using Vec32 = Vec128; - -template -using Vec16 = Vec128; - -// FF..FF or 0. 
-template -struct Mask128 { - typename detail::Raw128::RawBoolVec raw; - - using PrivateT = T; // only for DFromM - static constexpr size_t kPrivateN = N; // only for DFromM -}; - -template -using DFromV = Simd; - -template -using DFromM = Simd; - -template -using TFromV = typename V::PrivateT; - -// ------------------------------ Zero - -// Returns an all-zero vector/part. -template > -HWY_API Vec128 Zero(D /* tag */) { - // There is no vec_splats for 64-bit, so we cannot rely on casting the 0 - // argument in order to select the correct overload. We instead cast the - // return vector type; see also the comment in BitCast. - return Vec128{ - reinterpret_cast::type>(vec_splats(0))}; -} - -template -using VFromD = decltype(Zero(D())); - -// ------------------------------ Tuple (VFromD) -#include "hwy/ops/tuple-inl.h" - -// ------------------------------ BitCast - -template -HWY_API VFromD BitCast(D /*d*/, - Vec128().MaxLanes()> v) { - // C-style casts are not sufficient when compiling with - // -fno-lax-vector-conversions, which will be the future default in Clang, - // but reinterpret_cast is. - return VFromD{ - reinterpret_cast>::type>(v.raw)}; -} - -// ------------------------------ ResizeBitCast - -template -HWY_API VFromD ResizeBitCast(D /*d*/, FromV v) { - // C-style casts are not sufficient when compiling with - // -fno-lax-vector-conversions, which will be the future default in Clang, - // but reinterpret_cast is. - return VFromD{ - reinterpret_cast>::type>(v.raw)}; -} - -// ------------------------------ Set - -// Returns a vector/part with all lanes set to "t". -template )> -HWY_API VFromD Set(D /* tag */, TFromD t) { - using RawLane = typename detail::Raw128>::RawT; - return VFromD{vec_splats(static_cast(t))}; -} - -// Returns a vector with uninitialized elements. -template -HWY_API VFromD Undefined(D d) { -#if HWY_COMPILER_GCC_ACTUAL - // Suppressing maybe-uninitialized both here and at the caller does not work, - // so initialize. - return Zero(d); -#else - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") - typename detail::Raw128>::type raw; - return VFromD{raw}; - HWY_DIAGNOSTICS(pop) -#endif -} - -// ------------------------------ GetLane - -// Gets the single value stored in a vector/part. - -template -HWY_API T GetLane(Vec128 v) { - return static_cast(v.raw[0]); -} - -// ================================================== LOGICAL - -// ------------------------------ And - -template -HWY_API Vec128 And(Vec128 a, Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)}); -} - -// ------------------------------ AndNot - -// Returns ~not_mask & mask. 
-template -HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast( - d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)}); -} - -// ------------------------------ Or - -template -HWY_API Vec128 Or(Vec128 a, Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)}); -} - -// ------------------------------ Xor - -template -HWY_API Vec128 Xor(Vec128 a, Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)}); -} - -// ------------------------------ Not -template -HWY_API Vec128 Not(Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)}); -} - -// ------------------------------ IsConstantRawAltivecVect -namespace detail { - -template -static HWY_INLINE bool IsConstantRawAltivecVect( - hwy::SizeTag<1> /* lane_size_tag */, RawV v) { - return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && - __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && - __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && - __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) && - __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) && - __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) && - __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) && - __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]); -} - -template -static HWY_INLINE bool IsConstantRawAltivecVect( - hwy::SizeTag<2> /* lane_size_tag */, RawV v) { - return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && - __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && - __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && - __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]); -} - -template -static HWY_INLINE bool IsConstantRawAltivecVect( - hwy::SizeTag<4> /* lane_size_tag */, RawV v) { - return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && - __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]); -} - -template -static HWY_INLINE bool IsConstantRawAltivecVect( - hwy::SizeTag<8> /* lane_size_tag */, RawV v) { - return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]); -} - -template -static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) { - return IsConstantRawAltivecVect(hwy::SizeTag(), v); -} - -} // namespace detail - -// ------------------------------ TernaryLogic -#if HWY_PPC_HAVE_10 -namespace detail { - -// NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse -// order of the kTernLogOp bits of AVX3 -// _mm_ternarylogic_epi64(a, b, c, kTernLogOp) -template -HWY_INLINE V TernaryLogic(V a, V b, V c) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const auto a_raw = BitCast(du, a).raw; - const auto b_raw = BitCast(du, b).raw; - const auto c_raw = BitCast(du, c).raw; - -#if HWY_COMPILER_GCC_ACTUAL - // Use inline assembly on GCC to work around GCC compiler bug - typename detail::Raw128>::type raw_ternlog_result; - __asm__("xxeval %x0,%x1,%x2,%x3,%4" - : "=wa"(raw_ternlog_result) - : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw), - "n"(static_cast(kTernLogOp)) - :); -#else - const auto raw_ternlog_result = - vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp); -#endif - - return BitCast(d, VU{raw_ternlog_result}); -} - -} // namespace 
detail -#endif // HWY_PPC_HAVE_10 - -// ------------------------------ Xor3 -template -HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { -#if HWY_PPC_HAVE_10 -#if defined(__OPTIMIZE__) - if (static_cast(detail::IsConstantRawAltivecVect(x1.raw)) + - static_cast(detail::IsConstantRawAltivecVect(x2.raw)) + - static_cast(detail::IsConstantRawAltivecVect(x3.raw)) >= - 2) { - return Xor(x1, Xor(x2, x3)); - } else // NOLINT -#endif - { - return detail::TernaryLogic<0x69>(x1, x2, x3); - } -#else - return Xor(x1, Xor(x2, x3)); -#endif -} - -// ------------------------------ Or3 -template -HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { -#if HWY_PPC_HAVE_10 -#if defined(__OPTIMIZE__) - if (static_cast(detail::IsConstantRawAltivecVect(o1.raw)) + - static_cast(detail::IsConstantRawAltivecVect(o2.raw)) + - static_cast(detail::IsConstantRawAltivecVect(o3.raw)) >= - 2) { - return Or(o1, Or(o2, o3)); - } else // NOLINT -#endif - { - return detail::TernaryLogic<0x7F>(o1, o2, o3); - } -#else - return Or(o1, Or(o2, o3)); -#endif -} - -// ------------------------------ OrAnd -template -HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { -#if HWY_PPC_HAVE_10 -#if defined(__OPTIMIZE__) - if (detail::IsConstantRawAltivecVect(a1.raw) && - detail::IsConstantRawAltivecVect(a2.raw)) { - return Or(o, And(a1, a2)); - } else // NOLINT -#endif - { - return detail::TernaryLogic<0x1F>(o, a1, a2); - } -#else - return Or(o, And(a1, a2)); -#endif -} - -// ------------------------------ IfVecThenElse -template -HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, - Vec128 no) { - const DFromV d; - const RebindToUnsigned du; - return BitCast( - d, VFromD{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw, - BitCast(du, mask).raw)}); -} - -// ------------------------------ BitwiseIfThenElse - -#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#else -#define HWY_NATIVE_BITWISE_IF_THEN_ELSE -#endif - -template -HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { - return IfVecThenElse(mask, yes, no); -} - -// ------------------------------ Operator overloads (internal-only if float) - -template -HWY_API Vec128 operator&(Vec128 a, Vec128 b) { - return And(a, b); -} - -template -HWY_API Vec128 operator|(Vec128 a, Vec128 b) { - return Or(a, b); -} - -template -HWY_API Vec128 operator^(Vec128 a, Vec128 b) { - return Xor(a, b); -} - -// ================================================== SIGN - -// ------------------------------ Neg - -template -HWY_INLINE Vec128 Neg(Vec128 v) { - return Vec128{vec_neg(v.raw)}; -} - -template -HWY_API Vec128 Neg(const Vec128 v) { - return Xor(v, SignBit(DFromV())); -} - -// ------------------------------ Abs - -// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. 
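// Illustrative scalar reference for the wraparound noted above (a sketch,
// not part of the VSX implementation): |LimitsMin| is not representable,
// so it maps back onto LimitsMin itself.
#include <cstdint>
static inline int8_t AbsI8Ref(int8_t v) {
  // AbsI8Ref(-128) == -128; every other input yields the usual magnitude.
  return (v == INT8_MIN) ? v : static_cast<int8_t>(v < 0 ? -v : v);
}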
-template -HWY_API Vec128 Abs(Vec128 v) { - return Vec128{vec_abs(v.raw)}; -} - -// ------------------------------ CopySign - -template -HWY_API Vec128 CopySign(Vec128 magn, - Vec128 sign) { - // Work around compiler bugs that are there with vec_cpsgn on older versions - // of GCC/Clang -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 - return Vec128{__builtin_vec_copysign(magn.raw, sign.raw)}; -#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \ - HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp) - return Vec128{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)}; -#else - return Vec128{vec_cpsgn(sign.raw, magn.raw)}; -#endif -} - -template -HWY_API Vec128 CopySign(Vec128 magn, - Vec128 sign) { - // Work around compiler bugs that are there with vec_cpsgn on older versions - // of GCC/Clang -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 - return Vec128{__builtin_vec_copysign(magn.raw, sign.raw)}; -#elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \ - HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp) - return Vec128{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)}; -#else - return Vec128{vec_cpsgn(sign.raw, magn.raw)}; -#endif -} - -template -HWY_API Vec128 CopySignToAbs(Vec128 abs, Vec128 sign) { - // PPC8 can also handle abs < 0, so no extra action needed. - static_assert(IsFloat(), "Only makes sense for floating-point"); - return CopySign(abs, sign); -} - -// ================================================== MEMORY (1) - -// Note: type punning is safe because the types are tagged with may_alias. -// (https://godbolt.org/z/fqrWjfjsP) - -// ------------------------------ Load - -template > -HWY_API Vec128 Load(D /* tag */, const T* HWY_RESTRICT aligned) { - using LoadRaw = typename detail::Raw128::AlignedRawVec; - const LoadRaw* HWY_RESTRICT p = reinterpret_cast(aligned); - using ResultRaw = typename detail::Raw128::type; - return Vec128{reinterpret_cast(*p)}; -} - -// Any <= 64 bit -template > -HWY_API VFromD Load(D d, const T* HWY_RESTRICT p) { - using BitsT = UnsignedFromSize; - - BitsT bits; - const Repartition d_bits; - CopyBytes(p, &bits); - return BitCast(d, Set(d_bits, bits)); -} - -// ================================================== MASK - -// ------------------------------ Mask - -// Mask and Vec are both backed by vector types (true = FF..FF). -template -HWY_API Mask128 MaskFromVec(Vec128 v) { - using Raw = typename detail::Raw128::RawBoolVec; - return Mask128{reinterpret_cast(v.raw)}; -} - -template -using MFromD = decltype(MaskFromVec(VFromD())); - -template -HWY_API Vec128 VecFromMask(Mask128 v) { - return Vec128{ - reinterpret_cast::type>(v.raw)}; -} - -template -HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { - return VFromD{ - reinterpret_cast>::type>(v.raw)}; -} - -// mask ? yes : no -template -HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, - Vec128 no) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, VFromD{vec_sel( - BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)}); -} - -// mask ? yes : 0 -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, - VFromD{vec_and(BitCast(du, yes).raw, mask.raw)}); -} - -// mask ? 
0 : no -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, - VFromD{vec_andc(BitCast(du, no).raw, mask.raw)}); -} - -// ------------------------------ Mask logical - -template -HWY_API Mask128 Not(Mask128 m) { - return Mask128{vec_nor(m.raw, m.raw)}; -} - -template -HWY_API Mask128 And(Mask128 a, Mask128 b) { - return Mask128{vec_and(a.raw, b.raw)}; -} - -template -HWY_API Mask128 AndNot(Mask128 a, Mask128 b) { - return Mask128{vec_andc(b.raw, a.raw)}; -} - -template -HWY_API Mask128 Or(Mask128 a, Mask128 b) { - return Mask128{vec_or(a.raw, b.raw)}; -} - -template -HWY_API Mask128 Xor(Mask128 a, Mask128 b) { - return Mask128{vec_xor(a.raw, b.raw)}; -} - -template -HWY_API Mask128 ExclusiveNeither(Mask128 a, Mask128 b) { - return Mask128{vec_nor(a.raw, b.raw)}; -} - -// ------------------------------ BroadcastSignBit - -template -HWY_API Vec128 BroadcastSignBit(Vec128 v) { - return Vec128{ - vec_sra(v.raw, vec_splats(static_cast(7)))}; -} - -template -HWY_API Vec128 BroadcastSignBit(Vec128 v) { - return Vec128{ - vec_sra(v.raw, vec_splats(static_cast(15)))}; -} - -template -HWY_API Vec128 BroadcastSignBit(Vec128 v) { - return Vec128{vec_sra(v.raw, vec_splats(31u))}; -} - -template -HWY_API Vec128 BroadcastSignBit(Vec128 v) { - return Vec128{vec_sra(v.raw, vec_splats(63ULL))}; -} - -// ------------------------------ ShiftLeftSame - -template -HWY_API Vec128 ShiftLeftSame(Vec128 v, const int bits) { - using TU = typename detail::Raw128>::RawT; - return Vec128{vec_sl(v.raw, vec_splats(static_cast(bits)))}; -} - -// ------------------------------ ShiftRightSame - -template -HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { - using TU = typename detail::Raw128>::RawT; - return Vec128{vec_sr(v.raw, vec_splats(static_cast(bits)))}; -} - -template -HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { - using TU = typename detail::Raw128>::RawT; - return Vec128{vec_sra(v.raw, vec_splats(static_cast(bits)))}; -} - -// ------------------------------ ShiftLeft - -template -HWY_API Vec128 ShiftLeft(Vec128 v) { - static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); - return ShiftLeftSame(v, kBits); -} - -// ------------------------------ ShiftRight - -template -HWY_API Vec128 ShiftRight(Vec128 v) { - static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); - return ShiftRightSame(v, kBits); -} - -// ================================================== SWIZZLE (1) - -// ------------------------------ TableLookupBytes -template -HWY_API Vec128 TableLookupBytes(Vec128 bytes, - Vec128 from) { - const Repartition> du8_from; - return Vec128{reinterpret_cast::type>( - vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))}; -} - -// ------------------------------ TableLookupBytesOr0 -// For all vector widths; Altivec/VSX needs zero out -template -HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { - const DFromV di; - Repartition di8; - const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from))); - return AndNot(zeroOutMask, TableLookupBytes(bytes, from)); -} - -// ------------------------------ Reverse -template , HWY_IF_LANES_GT_D(D, 1)> -HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { - return Vec128{vec_reve(v.raw)}; -} - -// ------------------------------ Shuffles (Reverse) - -// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). -// Shuffle0321 rotates one lane to the right (the previous least-significant -// lane is now most-significant). 
These could also be implemented via -// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. - -// Swap 32-bit halves in 64-bit halves. -template -HWY_API Vec128 Shuffle2301(Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3, - 12, 13, 14, 15, 8, 9, 10, 11}; - return Vec128{vec_perm(v.raw, v.raw, kShuffle)}; -} - -// These are used by generic_ops-inl to implement LoadInterleaved3. As with -// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output -// comes from the first argument. -namespace detail { - -template -HWY_API Vec32 ShuffleTwo2301(Vec32 a, Vec32 b) { - const __vector unsigned char kShuffle16 = {1, 0, 19, 18}; - return Vec32{vec_perm(a.raw, b.raw, kShuffle16)}; -} -template -HWY_API Vec64 ShuffleTwo2301(Vec64 a, Vec64 b) { - const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21}; - return Vec64{vec_perm(a.raw, b.raw, kShuffle)}; -} -template -HWY_API Vec128 ShuffleTwo2301(Vec128 a, Vec128 b) { - const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3, - 28, 29, 30, 31, 24, 25, 26, 27}; - return Vec128{vec_perm(a.raw, b.raw, kShuffle)}; -} - -template -HWY_API Vec32 ShuffleTwo1230(Vec32 a, Vec32 b) { - const __vector unsigned char kShuffle = {0, 3, 18, 17}; - return Vec32{vec_perm(a.raw, b.raw, kShuffle)}; -} -template -HWY_API Vec64 ShuffleTwo1230(Vec64 a, Vec64 b) { - const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19}; - return Vec64{vec_perm(a.raw, b.raw, kShuffle)}; -} -template -HWY_API Vec128 ShuffleTwo1230(Vec128 a, Vec128 b) { - const __vector unsigned char kShuffle = {0, 1, 2, 3, 12, 13, 14, 15, - 24, 25, 26, 27, 20, 21, 22, 23}; - return Vec128{vec_perm(a.raw, b.raw, kShuffle)}; -} - -template -HWY_API Vec32 ShuffleTwo3012(Vec32 a, Vec32 b) { - const __vector unsigned char kShuffle = {2, 1, 16, 19}; - return Vec32{vec_perm(a.raw, b.raw, kShuffle)}; -} -template -HWY_API Vec64 ShuffleTwo3012(Vec64 a, Vec64 b) { - const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23}; - return Vec64{vec_perm(a.raw, b.raw, kShuffle)}; -} -template -HWY_API Vec128 ShuffleTwo3012(Vec128 a, Vec128 b) { - const __vector unsigned char kShuffle = {8, 9, 10, 11, 4, 5, 6, 7, - 16, 17, 18, 19, 28, 29, 30, 31}; - return Vec128{vec_perm(a.raw, b.raw, kShuffle)}; -} - -} // namespace detail - -// Swap 64-bit halves -template -HWY_API Vec128 Shuffle1032(Vec128 v) { - const Full128 d; - const Full128 du64; - return BitCast(d, Reverse(du64, BitCast(du64, v))); -} -template -HWY_API Vec128 Shuffle01(Vec128 v) { - return Reverse(Full128(), v); -} - -// Rotate right 32 bits -template -HWY_API Vec128 Shuffle0321(Vec128 v) { -#if HWY_IS_LITTLE_ENDIAN - return Vec128{vec_sld(v.raw, v.raw, 12)}; -#else - return Vec128{vec_sld(v.raw, v.raw, 4)}; -#endif -} -// Rotate left 32 bits -template -HWY_API Vec128 Shuffle2103(Vec128 v) { -#if HWY_IS_LITTLE_ENDIAN - return Vec128{vec_sld(v.raw, v.raw, 4)}; -#else - return Vec128{vec_sld(v.raw, v.raw, 12)}; -#endif -} - -template -HWY_API Vec128 Shuffle0123(Vec128 v) { - return Reverse(Full128(), v); -} - -// ================================================== COMPARE - -// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
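// Illustrative scalar model of this convention (a sketch, not part of the
// VSX implementation): each mask lane is either all-ones or all-zeros, so
// the vec_sel-based IfThenElse above reduces to a per-lane bitwise select.
#include <cstdint>
static inline uint32_t CompareMaskLaneRef(uint32_t a, uint32_t b) {
  return (a == b) ? 0xFFFFFFFFu : 0u;  // 1-bits iff the condition holds
}
static inline uint32_t SelectLaneRef(uint32_t mask, uint32_t yes, uint32_t no) {
  return (mask & yes) | (~mask & no);  // yes where mask bits set, else no
}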
- -template -HWY_API MFromD RebindMask(DTo /*dto*/, Mask128 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - return MFromD{m.raw}; -} - -template -HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return (v & bit) == bit; -} - -// ------------------------------ Equality - -template -HWY_API Mask128 operator==(Vec128 a, Vec128 b) { - return Mask128{vec_cmpeq(a.raw, b.raw)}; -} - -// ------------------------------ Inequality - -// This cannot have T as a template argument, otherwise it is not more -// specialized than rewritten operator== in C++20, leading to compile -// errors: https://gcc.godbolt.org/z/xsrPhPvPT. -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { -#if HWY_PPC_HAVE_9 - return Mask128{vec_cmpne(a.raw, b.raw)}; -#else - return Not(a == b); -#endif -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { -#if HWY_PPC_HAVE_9 - return Mask128{vec_cmpne(a.raw, b.raw)}; -#else - return Not(a == b); -#endif -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { -#if HWY_PPC_HAVE_9 - return Mask128{vec_cmpne(a.raw, b.raw)}; -#else - return Not(a == b); -#endif -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { -#if HWY_PPC_HAVE_9 - return Mask128{vec_cmpne(a.raw, b.raw)}; -#else - return Not(a == b); -#endif -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { -#if HWY_PPC_HAVE_9 - return Mask128{vec_cmpne(a.raw, b.raw)}; -#else - return Not(a == b); -#endif -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { -#if HWY_PPC_HAVE_9 - return Mask128{vec_cmpne(a.raw, b.raw)}; -#else - return Not(a == b); -#endif -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} - -template -HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { - return Not(a == b); -} - -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} - -// ------------------------------ Strict inequality - -template -HWY_INLINE Mask128 operator>(Vec128 a, Vec128 b) { - return Mask128{vec_cmpgt(a.raw, b.raw)}; -} - -// ------------------------------ Weak inequality - -template -HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { - return Mask128{vec_cmpge(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { - return Not(b > a); -} - -// ------------------------------ Reversed comparisons - -template -HWY_API Mask128 operator<(Vec128 a, Vec128 b) { - return b > a; -} - -template -HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { - return b >= a; -} - -// ================================================== MEMORY (2) - -// ------------------------------ Load -template > -HWY_API Vec128 LoadU(D /* tag */, const T* HWY_RESTRICT p) { - using LoadRaw = typename detail::Raw128::UnalignedRawVec; - const LoadRaw* HWY_RESTRICT praw = reinterpret_cast(p); - using ResultRaw = typename detail::Raw128::type; - return Vec128{reinterpret_cast(*praw)}; -} - -// For < 128 bit, LoadU == Load. -template > -HWY_API VFromD LoadU(D d, const T* HWY_RESTRICT p) { - return Load(d, p); -} - -// 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
-template > -HWY_API VFromD LoadDup128(D d, const T* HWY_RESTRICT p) { - return LoadU(d, p); -} - -#if HWY_PPC_HAVE_9 -#ifdef HWY_NATIVE_LOAD_N -#undef HWY_NATIVE_LOAD_N -#else -#define HWY_NATIVE_LOAD_N -#endif - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { -#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) { - return Zero(d); - } - - if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) && - max_lanes_to_load >= HWY_MAX_LANES_D(D)) { - return LoadU(d, p); - } -#endif - - const size_t num_of_bytes_to_load = - HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD); - const Repartition du8; - return BitCast( - d, - VFromD{vec_xl_len( - const_cast(reinterpret_cast(p)), - num_of_bytes_to_load)}); -} -#endif - -// Returns a vector with lane i=[0, N) set to "first" + i. -namespace detail { - -template -HWY_INLINE VFromD Iota0(D d) { - constexpr __vector unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; - return BitCast(d, VFromD>{kU8Iota0}); -} - -template -HWY_INLINE VFromD Iota0(D d) { - constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7}; - return BitCast(d, VFromD>{kU16Iota0}); -} - -template -HWY_INLINE VFromD Iota0(D d) { - constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3}; - return BitCast(d, VFromD>{kU32Iota0}); -} - -template -HWY_INLINE VFromD Iota0(D d) { - constexpr __vector unsigned long long kU64Iota0 = {0, 1}; - return BitCast(d, VFromD>{kU64Iota0}); -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f}; - return VFromD{kF32Iota0}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - constexpr __vector double kF64Iota0 = {0.0, 1.0}; - return VFromD{kF64Iota0}; -} - -} // namespace detail - -template -HWY_API VFromD Iota(D d, const T2 first) { - return detail::Iota0(d) + Set(d, static_cast>(first)); -} - -// ------------------------------ FirstN (Iota, Lt) - -template -HWY_API MFromD FirstN(D d, size_t num) { - const RebindToUnsigned du; - using TU = TFromD; - return RebindMask(d, Iota(du, 0) < Set(du, static_cast(num))); -} - -// ------------------------------ MaskedLoad -template > -HWY_API VFromD MaskedLoad(MFromD m, D d, const T* HWY_RESTRICT p) { - return IfThenElseZero(m, LoadU(d, p)); -} - -// ------------------------------ MaskedLoadOr -template > -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const T* HWY_RESTRICT p) { - return IfThenElse(m, LoadU(d, p), v); -} - -// ------------------------------ Store - -template > -HWY_API void Store(Vec128 v, D /* tag */, T* HWY_RESTRICT aligned) { - using StoreRaw = typename detail::Raw128::AlignedRawVec; - *reinterpret_cast(aligned) = reinterpret_cast(v.raw); -} - -template > -HWY_API void StoreU(Vec128 v, D /* tag */, T* HWY_RESTRICT p) { - using StoreRaw = typename detail::Raw128::UnalignedRawVec; - *reinterpret_cast(p) = reinterpret_cast(v.raw); -} - -template > -HWY_API void Store(VFromD v, D d, T* HWY_RESTRICT p) { - using BitsT = UnsignedFromSize; - - const Repartition d_bits; - const BitsT bits = GetLane(BitCast(d_bits, v)); - CopyBytes(&bits, p); -} - -// For < 128 bit, StoreU == Store. 
-template > -HWY_API void StoreU(VFromD v, D d, T* HWY_RESTRICT p) { - Store(v, d, p); -} - -#if HWY_PPC_HAVE_9 - -#ifdef HWY_NATIVE_STORE_N -#undef HWY_NATIVE_STORE_N -#else -#define HWY_NATIVE_STORE_N -#endif - -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { -#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) { - return; - } - - if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) && - max_lanes_to_store >= HWY_MAX_LANES_D(D)) { - StoreU(v, d, p); - return; - } -#endif - - const size_t num_of_bytes_to_store = - HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD); - const Repartition du8; - vec_xst_len(BitCast(du8, v).raw, reinterpret_cast(p), - num_of_bytes_to_store); -} -#endif - -// ------------------------------ BlendedStore - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - const RebindToSigned di; // for testing mask if T=bfloat16_t. - using TI = TFromD; - alignas(16) TI buf[MaxLanes(d)]; - alignas(16) TI mask[MaxLanes(d)]; - Store(BitCast(di, v), di, buf); - Store(BitCast(di, VecFromMask(d, m)), di, mask); - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (mask[i]) { - CopySameSize(buf + i, p + i); - } - } -} - -// ================================================== ARITHMETIC - -// ------------------------------ Addition - -template -HWY_API Vec128 operator+(Vec128 a, Vec128 b) { - return Vec128{vec_add(a.raw, b.raw)}; -} - -// ------------------------------ Subtraction - -template -HWY_API Vec128 operator-(Vec128 a, Vec128 b) { - return Vec128{vec_sub(a.raw, b.raw)}; -} - -// ------------------------------ SumsOf8 -namespace detail { - -// Casts nominally int32_t result to D. -template -HWY_INLINE VFromD AltivecVsum4sbs(D d, __vector signed char a, - __vector signed int b) { - const Repartition di32; -#ifdef __OPTIMIZE__ - if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { - const int64_t sum0 = - static_cast(a[0]) + static_cast(a[1]) + - static_cast(a[2]) + static_cast(a[3]) + - static_cast(b[0]); - const int64_t sum1 = - static_cast(a[4]) + static_cast(a[5]) + - static_cast(a[6]) + static_cast(a[7]) + - static_cast(b[1]); - const int64_t sum2 = - static_cast(a[8]) + static_cast(a[9]) + - static_cast(a[10]) + static_cast(a[11]) + - static_cast(b[2]); - const int64_t sum3 = - static_cast(a[12]) + static_cast(a[13]) + - static_cast(a[14]) + static_cast(a[15]) + - static_cast(b[3]); - const int32_t sign0 = static_cast(sum0 >> 63); - const int32_t sign1 = static_cast(sum1 >> 63); - const int32_t sign2 = static_cast(sum2 >> 63); - const int32_t sign3 = static_cast(sum3 >> 63); - using Raw = typename detail::Raw128::type; - return BitCast( - d, - VFromD{Raw{ - (sign0 == (sum0 >> 31)) ? static_cast(sum0) - : static_cast(sign0 ^ 0x7FFFFFFF), - (sign1 == (sum1 >> 31)) ? static_cast(sum1) - : static_cast(sign1 ^ 0x7FFFFFFF), - (sign2 == (sum2 >> 31)) ? static_cast(sum2) - : static_cast(sign2 ^ 0x7FFFFFFF), - (sign3 == (sum3 >> 31)) - ? static_cast(sum3) - : static_cast(sign3 ^ 0x7FFFFFFF)}}); - } else // NOLINT -#endif - { - return BitCast(d, VFromD{vec_vsum4sbs(a, b)}); - } -} - -// Casts nominally uint32_t result to D. 
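// Scalar reference for the per-lane result of vec_vsum4ubs (a sketch; the
// helper name is not a Highway API): output lane i is the sum of bytes
// a[4*i..4*i+3] plus b[i], saturated to 0xFFFFFFFF.
#include <cstddef>
#include <cstdint>
static inline uint32_t Vsum4ubsLaneRef(const uint8_t* a, const uint32_t* b,
                                       size_t i) {
  uint64_t sum = b[i];
  for (size_t j = 0; j < 4; ++j) sum += a[4 * i + j];
  return (sum <= 0xFFFFFFFFu) ? static_cast<uint32_t>(sum) : 0xFFFFFFFFu;
}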
-template -HWY_INLINE VFromD AltivecVsum4ubs(D d, __vector unsigned char a, - __vector unsigned int b) { - const Repartition du32; -#ifdef __OPTIMIZE__ - if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { - const uint64_t sum0 = - static_cast(a[0]) + static_cast(a[1]) + - static_cast(a[2]) + static_cast(a[3]) + - static_cast(b[0]); - const uint64_t sum1 = - static_cast(a[4]) + static_cast(a[5]) + - static_cast(a[6]) + static_cast(a[7]) + - static_cast(b[1]); - const uint64_t sum2 = - static_cast(a[8]) + static_cast(a[9]) + - static_cast(a[10]) + static_cast(a[11]) + - static_cast(b[2]); - const uint64_t sum3 = - static_cast(a[12]) + static_cast(a[13]) + - static_cast(a[14]) + static_cast(a[15]) + - static_cast(b[3]); - return BitCast( - d, - VFromD{(__vector unsigned int){ - static_cast(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu), - static_cast(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu), - static_cast(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu), - static_cast(sum3 <= 0xFFFFFFFFu ? sum3 - : 0xFFFFFFFFu)}}); - } else // NOLINT -#endif - { - return BitCast(d, VFromD{vec_vsum4ubs(a, b)}); - } -} - -// Casts nominally int32_t result to D. -template -HWY_INLINE VFromD AltivecVsum2sws(D d, __vector signed int a, - __vector signed int b) { - const Repartition di32; -#ifdef __OPTIMIZE__ - const Repartition du64; - constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN; - if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) && - __builtin_constant_p(b[kDestLaneOffset + 2])) { - const int64_t sum0 = static_cast(a[0]) + - static_cast(a[1]) + - static_cast(b[kDestLaneOffset]); - const int64_t sum1 = static_cast(a[2]) + - static_cast(a[3]) + - static_cast(b[kDestLaneOffset + 2]); - const int32_t sign0 = static_cast(sum0 >> 63); - const int32_t sign1 = static_cast(sum1 >> 63); - return BitCast(d, VFromD{(__vector unsigned long long){ - (sign0 == (sum0 >> 31)) - ? static_cast(sum0) - : static_cast(sign0 ^ 0x7FFFFFFF), - (sign1 == (sum1 >> 31)) - ? static_cast(sum1) - : static_cast(sign1 ^ 0x7FFFFFFF)}}); - } else // NOLINT -#endif - { - __vector signed int sum; - - // Inline assembly is used for vsum2sws to avoid unnecessary shuffling - // on little-endian PowerPC targets as the result of the vsum2sws - // instruction will already be in the correct lanes on little-endian - // PowerPC targets. - __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b)); - - return BitCast(d, VFromD{sum}); - } -} - -} // namespace detail - -template -HWY_API Vec128 SumsOf8(Vec128 v) { - const Repartition> du64; - const Repartition di32; - const RebindToUnsigned du32; - - return detail::AltivecVsum2sws( - du64, detail::AltivecVsum4ubs(di32, v.raw, Zero(du32).raw).raw, - Zero(di32).raw); -} - -// ------------------------------ SaturatedAdd - -// Returns a + b clamped to the destination range. 
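// Scalar reference for signed saturating addition (a sketch; the helper name
// is not a Highway API). The PPC10 int64 path below applies the same idea:
// detect overflow from the sign bits, then substitute sign(a) XOR LimitsMax.
#include <cstdint>
#include <limits>
static inline int64_t SaturatedAddI64Ref(int64_t a, int64_t b) {
  const int64_t sum = static_cast<int64_t>(static_cast<uint64_t>(a) +
                                           static_cast<uint64_t>(b));
  // Overflow iff a and b share a sign and the sum's sign differs from it.
  const bool overflow = ((a ^ sum) & (b ^ sum)) < 0;
  return overflow ? (a < 0 ? std::numeric_limits<int64_t>::min()
                           : std::numeric_limits<int64_t>::max())
                  : sum;
}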
- -#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB -#undef HWY_NATIVE_I32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I32_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB -#undef HWY_NATIVE_U32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U32_SATURATED_ADDSUB -#endif - -template -HWY_API Vec128 SaturatedAdd(Vec128 a, Vec128 b) { - return Vec128{vec_adds(a.raw, b.raw)}; -} - -#if HWY_PPC_HAVE_10 - -#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB -#undef HWY_NATIVE_I64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I64_SATURATED_ADDSUB -#endif - -template )> -HWY_API V SaturatedAdd(V a, V b) { - const DFromV d; - const auto sum = Add(a, b); - const auto overflow_mask = - BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum)); - const auto overflow_result = - Xor(BroadcastSignBit(a), Set(d, LimitsMax())); - return IfNegativeThenElse(overflow_mask, overflow_result, sum); -} - -#endif // HWY_PPC_HAVE_10 - -// ------------------------------ SaturatedSub - -// Returns a - b clamped to the destination range. - -template -HWY_API Vec128 SaturatedSub(Vec128 a, Vec128 b) { - return Vec128{vec_subs(a.raw, b.raw)}; -} - -#if HWY_PPC_HAVE_10 - -template )> -HWY_API V SaturatedSub(V a, V b) { - const DFromV d; - const auto diff = Sub(a, b); - const auto overflow_mask = - BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff)); - const auto overflow_result = - Xor(BroadcastSignBit(a), Set(d, LimitsMax())); - return IfNegativeThenElse(overflow_mask, overflow_result, diff); -} - -#endif // HWY_PPC_HAVE_10 - -// ------------------------------ AverageRound - -// Returns (a + b + 1) / 2 - -template -HWY_API Vec128 AverageRound(Vec128 a, Vec128 b) { - return Vec128{vec_avg(a.raw, b.raw)}; -} - -// ------------------------------ Multiplication - -// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. -#ifdef HWY_NATIVE_MUL_8 -#undef HWY_NATIVE_MUL_8 -#else -#define HWY_NATIVE_MUL_8 -#endif -#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - -template -HWY_API Vec128 operator*(Vec128 a, Vec128 b) { - return Vec128{a.raw * b.raw}; -} - -// Returns the upper 16 bits of a * b in each lane. -template -HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { - const DFromV d; - const RepartitionToWide dw; - const VFromD p1{vec_mule(a.raw, b.raw)}; - const VFromD p2{vec_mulo(a.raw, b.raw)}; -#if HWY_IS_LITTLE_ENDIAN - const __vector unsigned char kShuffle = {2, 3, 18, 19, 6, 7, 22, 23, - 10, 11, 26, 27, 14, 15, 30, 31}; -#else - const __vector unsigned char kShuffle = {0, 1, 16, 17, 4, 5, 20, 21, - 8, 9, 24, 25, 12, 13, 28, 29}; -#endif - return BitCast(d, VFromD{vec_perm(p1.raw, p2.raw, kShuffle)}); -} - -template -HWY_API Vec128 MulFixedPoint15(Vec128 a, - Vec128 b) { - const Vec128 zero = Zero(Full128()); - return Vec128{vec_mradds(a.raw, b.raw, zero.raw)}; -} - -// Multiplies even lanes (0, 2, ..) and places the double-wide result into -// even and the upper half into its odd neighbor lane. -template -HWY_API Vec128, (N + 1) / 2> MulEven(Vec128 a, - Vec128 b) { - return Vec128, (N + 1) / 2>{vec_mule(a.raw, b.raw)}; -} - -// Multiplies odd lanes (1, 3, ..) and places the double-wide result into -// even and the upper half into its odd neighbor lane. 
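// Scalar picture of MulEven/MulOdd for 32-bit lanes (a sketch; helper names
// are not Highway APIs): output lane i is the widened product of input
// lanes 2*i (MulEven) or 2*i+1 (MulOdd).
#include <cstddef>
#include <cstdint>
static inline uint64_t MulEvenLaneRef(const uint32_t* a, const uint32_t* b,
                                      size_t i) {
  return static_cast<uint64_t>(a[2 * i]) * b[2 * i];
}
static inline uint64_t MulOddLaneRef(const uint32_t* a, const uint32_t* b,
                                     size_t i) {
  return static_cast<uint64_t>(a[2 * i + 1]) * b[2 * i + 1];
}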
-template -HWY_API Vec128, (N + 1) / 2> MulOdd(Vec128 a, - Vec128 b) { - return Vec128, (N + 1) / 2>{vec_mulo(a.raw, b.raw)}; -} - -// ------------------------------ RotateRight -template -HWY_API Vec128 RotateRight(const Vec128 v) { - const DFromV d; - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - return Vec128{vec_rl(v.raw, Set(d, kSizeInBits - kBits).raw)}; -} - -// ------------------------------ ZeroIfNegative (BroadcastSignBit) -template -HWY_API Vec128 ZeroIfNegative(Vec128 v) { - static_assert(IsFloat(), "Only works for float"); - const DFromV d; - const RebindToSigned di; - const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); - return IfThenElse(mask, Zero(d), v); -} - -// ------------------------------ IfNegativeThenElse - -template -HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, - Vec128 no) { - static_assert(IsSigned(), "Only works for signed/float"); - - const DFromV d; -#if HWY_PPC_HAVE_10 - const RebindToUnsigned du; - return BitCast( - d, VFromD{vec_blendv( - BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)}); -#else - const RebindToSigned di; - return IfThenElse(MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))), - yes, no); -#endif -} - -// generic_ops takes care of integer T. -template -HWY_API Vec128 AbsDiff(Vec128 a, Vec128 b) { - return Abs(a - b); -} - -// ------------------------------ Floating-point multiply-add variants - -// Returns mul * x + add -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return Vec128{vec_madd(mul.raw, x.raw, add.raw)}; -} - -// Returns add - mul * x -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - // NOTE: the vec_nmsub operation below computes -(mul * x - add), - // which is equivalent to add - mul * x in the round-to-nearest - // and round-towards-zero rounding modes - return Vec128{vec_nmsub(mul.raw, x.raw, add.raw)}; -} - -// Returns mul * x - sub -template -HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - return Vec128{vec_msub(mul.raw, x.raw, sub.raw)}; -} - -// Returns -mul * x - sub -template -HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - // NOTE: The vec_nmadd operation below computes -(mul * x + sub), - // which is equivalent to -mul * x - sub in the round-to-nearest - // and round-towards-zero rounding modes - return Vec128{vec_nmadd(mul.raw, x.raw, sub.raw)}; -} - -// ------------------------------ Floating-point div -// Approximate reciprocal - -#ifdef HWY_NATIVE_F64_APPROX_RECIP -#undef HWY_NATIVE_F64_APPROX_RECIP -#else -#define HWY_NATIVE_F64_APPROX_RECIP -#endif - -template -HWY_API Vec128 ApproximateReciprocal(Vec128 v) { - return Vec128{vec_re(v.raw)}; -} - -template -HWY_API Vec128 operator/(Vec128 a, Vec128 b) { - return Vec128{vec_div(a.raw, b.raw)}; -} - -// ------------------------------ Floating-point square root - -#ifdef HWY_NATIVE_F64_APPROX_RSQRT -#undef HWY_NATIVE_F64_APPROX_RSQRT -#else -#define HWY_NATIVE_F64_APPROX_RSQRT -#endif - -// Approximate reciprocal square root -template -HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { - return Vec128{vec_rsqrte(v.raw)}; -} - -// Full precision square root -template -HWY_API Vec128 Sqrt(Vec128 v) { - return Vec128{vec_sqrt(v.raw)}; -} - -// ------------------------------ Min (Gt, IfThenElse) - -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{vec_min(a.raw, b.raw)}; -} - -// ------------------------------ 
Max (Gt, IfThenElse) - -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{vec_max(a.raw, b.raw)}; -} - -// ------------------------------- Integer AbsDiff for PPC9/PPC10 - -#if HWY_PPC_HAVE_9 -#ifdef HWY_NATIVE_INTEGER_ABS_DIFF -#undef HWY_NATIVE_INTEGER_ABS_DIFF -#else -#define HWY_NATIVE_INTEGER_ABS_DIFF -#endif - -template -HWY_API V AbsDiff(const V a, const V b) { - return V{vec_absd(a.raw, b.raw)}; -} - -template )> -HWY_API V AbsDiff(const V a, const V b) { - return Sub(Max(a, b), Min(a, b)); -} - -template -HWY_API V AbsDiff(const V a, const V b) { - return Sub(Max(a, b), Min(a, b)); -} - -#endif // HWY_PPC_HAVE_9 - -// ================================================== MEMORY (3) - -// ------------------------------ Non-temporal stores - -template -HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - __builtin_prefetch(aligned, 1, 0); - Store(v, d, aligned); -} - -// ------------------------------ Scatter in generic_ops-inl.h -// ------------------------------ Gather in generic_ops-inl.h - -// ================================================== SWIZZLE (2) - -// ------------------------------ LowerHalf - -// Returns upper/lower half of a vector. -template -HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { - return VFromD{v.raw}; -} -template -HWY_API Vec128 LowerHalf(Vec128 v) { - return Vec128{v.raw}; -} - -// ------------------------------ ShiftLeftBytes - -// NOTE: The ShiftLeftBytes operation moves the elements of v to the right -// by kBytes bytes and zeroes out the first kBytes bytes of v on both -// little-endian and big-endian PPC targets -// (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both -// little-endian and big-endian targets) - -template -HWY_API VFromD ShiftLeftBytes(D d, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - if (kBytes == 0) return v; - const auto zeros = Zero(d); -#if HWY_IS_LITTLE_ENDIAN - return VFromD{vec_sld(v.raw, zeros.raw, kBytes)}; -#else - return VFromD{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)}; -#endif -} - -template -HWY_API Vec128 ShiftLeftBytes(Vec128 v) { - return ShiftLeftBytes(DFromV(), v); -} - -// ------------------------------ ShiftLeftLanes - -// NOTE: The ShiftLeftLanes operation moves the elements of v to the right -// by kLanes lanes and zeroes out the first kLanes lanes of v on both -// little-endian and big-endian PPC targets -// (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both -// little-endian and big-endian targets) - -template > -HWY_API VFromD ShiftLeftLanes(D d, VFromD v) { - const Repartition d8; - return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); -} - -template -HWY_API Vec128 ShiftLeftLanes(Vec128 v) { - return ShiftLeftLanes(DFromV(), v); -} - -// ------------------------------ ShiftRightBytes - -// NOTE: The ShiftRightBytes operation moves the elements of v to the left -// by kBytes bytes and zeroes out the last kBytes bytes of v on both -// little-endian and big-endian PPC targets -// (same behavior as the HWY_EMU128 ShiftRightBytes operation on both -// little-endian and big-endian targets) - -template -HWY_API VFromD ShiftRightBytes(D d, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - if (kBytes == 0) return v; - - // For partial vectors, clear upper lanes so we shift in zeros. 
- if (d.MaxBytes() != 16) { - const Full128> dfull; - VFromD vfull{v.raw}; - v = VFromD{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; - } - - const auto zeros = Zero(d); -#if HWY_IS_LITTLE_ENDIAN - return VFromD{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)}; -#else - return VFromD{vec_sld(v.raw, zeros.raw, kBytes)}; -#endif -} - -// ------------------------------ ShiftRightLanes - -// NOTE: The ShiftRightLanes operation moves the elements of v to the left -// by kLanes lanes and zeroes out the last kLanes lanes of v on both -// little-endian and big-endian PPC targets -// (same behavior as the HWY_EMU128 ShiftRightLanes operation on both -// little-endian and big-endian targets) - -template -HWY_API VFromD ShiftRightLanes(D d, VFromD v) { - const Repartition d8; - constexpr size_t kBytes = kLanes * sizeof(TFromD); - return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); -} - -// ------------------------------ UpperHalf (ShiftRightBytes) - -template -HWY_API VFromD UpperHalf(D d, VFromD> v) { - return LowerHalf(d, ShiftRightBytes(Twice(), v)); -} - -// ------------------------------ ExtractLane -template -HWY_API T ExtractLane(Vec128 v, size_t i) { - return static_cast(v.raw[i]); -} - -// ------------------------------ InsertLane -template -HWY_API Vec128 InsertLane(Vec128 v, size_t i, T t) { -#if HWY_IS_LITTLE_ENDIAN - typename detail::Raw128::type raw_result = v.raw; - raw_result[i] = t; - return Vec128{raw_result}; -#else - // On ppc64be without this, mul_test fails, but swizzle_test passes. - DFromV d; - alignas(16) T lanes[16 / sizeof(T)]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -#endif -} - -// ------------------------------ CombineShiftRightBytes - -// NOTE: The CombineShiftRightBytes operation below moves the elements of lo to -// the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes() -// - kBytes) bytes on both little-endian and big-endian PPC targets. - -template > -HWY_API Vec128 CombineShiftRightBytes(D /*d*/, Vec128 hi, Vec128 lo) { - constexpr size_t kSize = 16; - static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); -#if HWY_IS_LITTLE_ENDIAN - return Vec128{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)}; -#else - return Vec128{vec_sld(lo.raw, hi.raw, kBytes)}; -#endif -} - -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - constexpr size_t kSize = d.MaxBytes(); - static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); - const Repartition d8; - using V8 = Vec128; - const DFromV dfull8; - const Repartition, decltype(dfull8)> dfull; - const V8 hi8{BitCast(d8, hi).raw}; - // Move into most-significant bytes - const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); - const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); - return VFromD{BitCast(dfull, r).raw}; -} - -// ------------------------------ Broadcast/splat any lane - -template -HWY_API Vec128 Broadcast(Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{vec_splat(v.raw, kLane)}; -} - -// ------------------------------ TableLookupLanes (Shuffle01) - -// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
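// Typical usage of the index type defined below (illustrative only; the lane
// type, index values and vector v are arbitrary examples):
//   const Full128<int32_t> d;
//   const int32_t idx[4] = {3, 2, 1, 0};
//   const auto indices = SetTableIndices(d, idx);
//   const auto reversed = TableLookupLanes(v, indices);  // v[3],v[2],v[1],v[0]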
-template -struct Indices128 { - __vector unsigned char raw; -}; - -namespace detail { - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - return Iota(d8, 0); -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - constexpr __vector unsigned char kBroadcastLaneBytes = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; -#else - constexpr __vector unsigned char kBroadcastLaneBytes = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; -#endif - return VFromD{kBroadcastLaneBytes}; -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - constexpr __vector unsigned char kBroadcastLaneBytes = { - 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; -#else - constexpr __vector unsigned char kBroadcastLaneBytes = { - 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15}; -#endif - return VFromD{kBroadcastLaneBytes}; -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - constexpr __vector unsigned char kBroadcastLaneBytes = { - 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; -#else - constexpr __vector unsigned char kBroadcastLaneBytes = { - 7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15}; -#endif - return VFromD{kBroadcastLaneBytes}; -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - return Zero(d8); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1}; - return VFromD{kByteOffsets}; -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3}; - return VFromD{kByteOffsets}; -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7}; - return VFromD{kByteOffsets}; -} - -} // namespace detail - -template -HWY_API Indices128, MaxLanes(D())> IndicesFromVec( - D d, Vec128 vec) { - using T = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const RebindToUnsigned du; - using TU = TFromD; - HWY_DASSERT(AllTrue( - du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); -#endif - - const Repartition d8; - return Indices128, MaxLanes(D())>{BitCast(d8, vec).raw}; -} - -template -HWY_API Indices128, MaxLanes(D())> IndicesFromVec( - D d, Vec128 vec) { - using T = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const RebindToUnsigned du; - using TU = TFromD; - HWY_DASSERT(AllTrue( - du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); -#endif - - const Repartition d8; - using V8 = VFromD; - - // Broadcast each lane index to all bytes of T and shift to bytes - const V8 lane_indices = TableLookupBytes( - BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); - constexpr int kIndexShiftAmt = static_cast(FloorLog2(sizeof(T))); - const V8 byte_indices = ShiftLeft(lane_indices); - const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); - return Indices128, 
MaxLanes(D())>{sum.raw}; -} - -template -HWY_API Indices128, HWY_MAX_LANES_D(D)> SetTableIndices( - D d, const TI* idx) { - const Rebind di; - return IndicesFromVec(d, LoadU(di, idx)); -} - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { - const DFromV d; - const Repartition d8; - return BitCast(d, TableLookupBytes(v, VFromD{idx.raw})); -} - -// Single lane: no change -template -HWY_API Vec128 TableLookupLanes(Vec128 v, - Indices128 /* idx */) { - return v; -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { - const DFromV d; - const Twice dt; - const Repartition dt_u8; -// TableLookupLanes currently requires table and index vectors to be the same -// size, though a half-length index vector would be sufficient here. -#if HWY_IS_MSAN - const Vec128 idx_vec{idx.raw}; - const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; -#else - // We only keep LowerHalf of the result, which is valid in idx. - const Indices128 idx2{idx.raw}; -#endif - return LowerHalf( - d, TableLookupBytes(Combine(dt, b, a), - BitCast(dt, VFromD{idx2.raw}))); -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { - return Vec128{vec_perm(a.raw, b.raw, idx.raw)}; -} - -// ------------------------------ ReverseBlocks - -// Single block: no change -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return v; -} - -// ------------------------------ Reverse (Shuffle0123, Shuffle2301) - -// Single lane: no change -template , HWY_IF_LANES_D(D, 1)> -HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { - return v; -} - -// 32-bit x2: shuffle -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec64 Reverse(D /* tag */, Vec64 v) { - return Vec64{Shuffle2301(Vec128{v.raw}).raw}; -} - -// 16-bit x4: shuffle -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec64 Reverse(D /* tag */, Vec64 v) { - const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1, - 14, 15, 12, 13, 10, 11, 8, 9}; - return Vec64{vec_perm(v.raw, v.raw, kShuffle)}; -} - -// 16-bit x2: rotate bytes -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec32 Reverse(D d, Vec32 v) { - const RepartitionToWide> du32; - return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); -} - -// ------------------------------- ReverseLaneBytes - -#if HWY_PPC_HAVE_9 && \ - (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400) - -// Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes. -#ifdef HWY_NATIVE_REVERSE_LANE_BYTES -#undef HWY_NATIVE_REVERSE_LANE_BYTES -#else -#define HWY_NATIVE_REVERSE_LANE_BYTES -#endif - -template -HWY_API V ReverseLaneBytes(V v) { - return V{vec_revb(v.raw)}; -} - -// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
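// The three overloads below rely on ReverseLaneBytes of a wider lane type:
// byte-reversing each u16 swaps adjacent u8 lanes, which is exactly Reverse2
// for 8-bit elements; u32 and u64 likewise yield Reverse4 and Reverse8.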
-#ifdef HWY_NATIVE_REVERSE2_8 -#undef HWY_NATIVE_REVERSE2_8 -#else -#define HWY_NATIVE_REVERSE2_8 -#endif - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API VFromD Reverse2(D d, VFromD v) { - const Repartition du16; - return BitCast(d, ReverseLaneBytes(BitCast(du16, v))); -} - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API VFromD Reverse4(D d, VFromD v) { - const Repartition du32; - return BitCast(d, ReverseLaneBytes(BitCast(du32, v))); -} - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API VFromD Reverse8(D d, VFromD v) { - const Repartition du64; - return BitCast(d, ReverseLaneBytes(BitCast(du64, v))); -} - -#endif // HWY_PPC_HAVE_9 - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec16 Reverse(D d, Vec16 v) { - return Reverse2(d, v); -} - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec32 Reverse(D d, Vec32 v) { - return Reverse4(d, v); -} - -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec64 Reverse(D d, Vec64 v) { - return Reverse8(d, v); -} - -// ------------------------------ Reverse2 - -// Single lane: no change -template , HWY_IF_LANES_D(D, 1)> -HWY_API Vec128 Reverse2(D /* tag */, Vec128 v) { - return v; -} - -template , HWY_IF_T_SIZE(T, 2)> -HWY_API VFromD Reverse2(D d, VFromD v) { - const Repartition du32; - return BitCast(d, RotateRight<16>(BitCast(du32, v))); -} - -template , HWY_IF_T_SIZE(T, 4)> -HWY_API VFromD Reverse2(D d, VFromD v) { - const Repartition du64; - return BitCast(d, RotateRight<32>(BitCast(du64, v))); -} - -template , HWY_IF_T_SIZE(T, 8)> -HWY_API VFromD Reverse2(D /* tag */, VFromD v) { - return Shuffle01(v); -} - -// ------------------------------ Reverse4 - -template -HWY_API VFromD Reverse4(D /*d*/, VFromD v) { - const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1, - 14, 15, 12, 13, 10, 11, 8, 9}; - return VFromD{vec_perm(v.raw, v.raw, kShuffle)}; -} - -template -HWY_API VFromD Reverse4(D d, VFromD v) { - return Reverse(d, v); -} - -template -HWY_API VFromD Reverse4(D /* tag */, VFromD /* v */) { - HWY_ASSERT(0); // don't have 4 u64 lanes -} - -// ------------------------------ Reverse8 - -template -HWY_API VFromD Reverse8(D d, VFromD v) { - return Reverse(d, v); -} - -template -HWY_API VFromD Reverse8(D /* tag */, VFromD /* v */) { - HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit -} - -// ------------------------------ InterleaveLower - -// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides -// the least-significant lane) and "b". To concatenate two half-width integers -// into one, use ZipLower/Upper instead (also works with scalar). - -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - return Vec128{vec_mergeh(a.raw, b.raw)}; -} - -// Additional overload for the optional tag -template -HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { - return InterleaveLower(a, b); -} - -// ------------------------------ InterleaveUpper (UpperHalf) - -// Full -template > -HWY_API Vec128 InterleaveUpper(D /* tag */, Vec128 a, Vec128 b) { - return Vec128{vec_mergel(a.raw, b.raw)}; -} - -// Partial -template -HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { - const Half d2; - return InterleaveLower(d, VFromD{UpperHalf(d2, a).raw}, - VFromD{UpperHalf(d2, b).raw}); -} - -// ------------------------------ ZipLower/ZipUpper (InterleaveLower) - -// Same as Interleave*, except that the return lanes are double-width integers; -// this is necessary because the single-lane scalar cannot return two values. 
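// Scalar picture of ZipLower for u32 -> u64 lanes (a sketch; assumes the
// usual little-endian lane numbering): a's lane supplies the lower half of
// the double-width result and b's lane the upper half.
#include <cstdint>
static inline uint64_t ZipLowerLaneRef(uint32_t a_lane, uint32_t b_lane) {
  return static_cast<uint64_t>(a_lane) | (static_cast<uint64_t>(b_lane) << 32);
}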
-template >> -HWY_API VFromD ZipLower(V a, V b) { - return BitCast(DW(), InterleaveLower(a, b)); -} -template , class DW = RepartitionToWide> -HWY_API VFromD ZipLower(DW dw, V a, V b) { - return BitCast(dw, InterleaveLower(D(), a, b)); -} - -template , class DW = RepartitionToWide> -HWY_API VFromD ZipUpper(DW dw, V a, V b) { - return BitCast(dw, InterleaveUpper(D(), a, b)); -} - -// ------------------------------ Per4LaneBlkShufDupSet4xU32 - -// Used by hwy/ops/generic_ops-inl.h to implement Per4LaneBlockShuffle -namespace detail { - -#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#else -#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#endif - -template -HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, - const uint32_t x2, - const uint32_t x1, - const uint32_t x0) { - const __vector unsigned int raw = {x0, x1, x2, x3}; - return ResizeBitCast(d, Vec128{raw}); -} - -} // namespace detail - -// ------------------------------ SlideUpLanes - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { - const Repartition du8; - using VU8 = VFromD; - const auto v_shift_amt = - BitCast(Full128(), - Set(Full128(), - static_cast(amt * sizeof(TFromD) * 8))); - -#if HWY_IS_LITTLE_ENDIAN - return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)}); -#else - return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)}); -#endif -} - -// ------------------------------ SlideDownLanes - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { - using TU = UnsignedFromSize; - const Repartition du; - const auto v_shift_amt = - Set(du, static_cast(amt * sizeof(TFromD) * 8)); - -#if HWY_IS_LITTLE_ENDIAN - return BitCast(d, BitCast(du, v) >> v_shift_amt); -#else - return BitCast(d, BitCast(du, v) << v_shift_amt); -#endif -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { - const Repartition du8; - using VU8 = VFromD; - const auto v_shift_amt = - BitCast(Full128(), - Set(Full128(), - static_cast(amt * sizeof(TFromD) * 8))); - -#if HWY_IS_LITTLE_ENDIAN - return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)}); -#else - return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)}); -#endif -} - -// ================================================== COMBINE - -// ------------------------------ Combine (InterleaveLower) - -// N = N/2 + N/2 (upper half undefined) -template >> -HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { - const Half dh; - // Treat half-width input as one lane, and expand to two lanes. 
- using VU = Vec128, 2>; - using Raw = typename detail::Raw128>::type; - const VU lo{reinterpret_cast(lo_half.raw)}; - const VU hi{reinterpret_cast(hi_half.raw)}; - return BitCast(d, InterleaveLower(lo, hi)); -} - -// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) - -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { - const Half dh; - return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD{lo.raw}); -} - -// ------------------------------ Concat full (InterleaveLower) - -// hiH,hiL loH,loL |-> hiL,loL (= lower halves) -template > -HWY_API Vec128 ConcatLowerLower(D d, Vec128 hi, Vec128 lo) { - const Repartition d64; - return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); -} - -// hiH,hiL loH,loL |-> hiH,loH (= upper halves) -template > -HWY_API Vec128 ConcatUpperUpper(D d, Vec128 hi, Vec128 lo) { - const Repartition d64; - return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); -} - -// hiH,hiL loH,loL |-> hiL,loH (= inner halves) -template > -HWY_API Vec128 ConcatLowerUpper(D d, Vec128 hi, Vec128 lo) { - return CombineShiftRightBytes<8>(d, hi, lo); -} - -// hiH,hiL loH,loL |-> hiH,loL (= outer halves) -template > -HWY_API Vec128 ConcatUpperLower(D /*d*/, Vec128 hi, Vec128 lo) { - const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7, - 24, 25, 26, 27, 28, 29, 30, 31}; - return Vec128{vec_perm(lo.raw, hi.raw, kShuffle)}; -} - -// ------------------------------ Concat partial (Combine, LowerHalf) - -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); -} - -// ------------------------------ TruncateTo - -template = sizeof(TFromD) * 2)>* = nullptr, - HWY_IF_LANES_D(D, 1)> -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - using Raw = typename detail::Raw128>::type; -#if HWY_IS_LITTLE_ENDIAN - return VFromD{reinterpret_cast(v.raw)}; -#else - return VFromD{reinterpret_cast( - vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD)))}; -#endif -} - -namespace detail { - -template ) * 2), HWY_IF_LANES_GT_D(D, 1)> -HWY_API VFromD Truncate2To( - D /* tag */, Vec128().MaxLanes()> lo, - Vec128().MaxLanes()> hi) { - return VFromD{vec_pack(lo.raw, hi.raw)}; -} - -} // namespace detail - -template ) * 2), HWY_IF_LANES_GT_D(D, 1)> -HWY_API VFromD TruncateTo(D /* d */, - Vec128().MaxLanes()> v) { - return VFromD{vec_pack(v.raw, v.raw)}; -} - -template = sizeof(TFromD) * 4)>* = nullptr, - HWY_IF_LANES_GT_D(D, 1)> -HWY_API VFromD TruncateTo(D d, - Vec128().MaxLanes()> v) { - const Rebind, decltype(d)> d2; - return TruncateTo(d, TruncateTo(d2, v)); -} - -// ------------------------------ ConcatOdd (TruncateTo) - -// 8-bit full -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { - const Repartition dw; - const RebindToUnsigned du; -#if HWY_IS_LITTLE_ENDIAN - // Right-shift 8 bits per u16 so we can pack. 
- const Vec128 uH = ShiftRight<8>(BitCast(dw, hi)); - const Vec128 uL = ShiftRight<8>(BitCast(dw, lo)); -#else - const Vec128 uH = BitCast(dw, hi); - const Vec128 uL = BitCast(dw, lo); -#endif - return BitCast(d, detail::Truncate2To(du, uL, uH)); -} - -// 8-bit x8 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec64 ConcatOdd(D /*d*/, Vec64 hi, Vec64 lo) { - // Don't care about upper half, no need to zero. - const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23}; - return Vec64{vec_perm(lo.raw, hi.raw, kCompactOddU8)}; -} - -// 8-bit x4 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec32 ConcatOdd(D /*d*/, Vec32 hi, Vec32 lo) { - // Don't care about upper half, no need to zero. - const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19}; - return Vec32{vec_perm(lo.raw, hi.raw, kCompactOddU8)}; -} - -// 16-bit full -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { - const Repartition dw; - const RebindToUnsigned du; -#if HWY_IS_LITTLE_ENDIAN - const Vec128 uH = ShiftRight<16>(BitCast(dw, hi)); - const Vec128 uL = ShiftRight<16>(BitCast(dw, lo)); -#else - const Vec128 uH = BitCast(dw, hi); - const Vec128 uL = BitCast(dw, lo); -#endif - return BitCast(d, detail::Truncate2To(du, uL, uH)); -} - -// 16-bit x4 -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec64 ConcatOdd(D /*d*/, Vec64 hi, Vec64 lo) { - // Don't care about upper half, no need to zero. - const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23}; - return Vec64{vec_perm(lo.raw, hi.raw, kCompactOddU16)}; -} - -// 32-bit full -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { -#if HWY_IS_LITTLE_ENDIAN - (void)d; - const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15, - 20, 21, 22, 23, 28, 29, 30, 31}; - return Vec128{vec_perm(lo.raw, hi.raw, kShuffle)}; -#else - const RebindToUnsigned du; - const Repartition dw; - return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi))); -#endif -} - -// Any type x2 -template , HWY_IF_LANES_D(D, 2)> -HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { - return InterleaveUpper(d, lo, hi); -} - -// ------------------------------ ConcatEven (TruncateTo) - -// 8-bit full -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { - const Repartition dw; - const RebindToUnsigned du; -#if HWY_IS_LITTLE_ENDIAN - const Vec128 uH = BitCast(dw, hi); - const Vec128 uL = BitCast(dw, lo); -#else - // Right-shift 8 bits per u16 so we can pack. - const Vec128 uH = ShiftRight<8>(BitCast(dw, hi)); - const Vec128 uL = ShiftRight<8>(BitCast(dw, lo)); -#endif - return BitCast(d, detail::Truncate2To(du, uL, uH)); -} - -// 8-bit x8 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec64 ConcatEven(D /*d*/, Vec64 hi, Vec64 lo) { - // Don't care about upper half, no need to zero. - const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22}; - return Vec64{vec_perm(lo.raw, hi.raw, kCompactEvenU8)}; -} - -// 8-bit x4 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec32 ConcatEven(D /*d*/, Vec32 hi, Vec32 lo) { - // Don't care about upper half, no need to zero. - const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18}; - return Vec32{vec_perm(lo.raw, hi.raw, kCompactEvenU8)}; -} - -// 16-bit full -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { - // Isolate lower 16 bits per u32 so we can pack. 
- const Repartition dw; - const RebindToUnsigned du; -#if HWY_IS_LITTLE_ENDIAN - const Vec128 uH = BitCast(dw, hi); - const Vec128 uL = BitCast(dw, lo); -#else - const Vec128 uH = ShiftRight<16>(BitCast(dw, hi)); - const Vec128 uL = ShiftRight<16>(BitCast(dw, lo)); -#endif - return BitCast(d, detail::Truncate2To(du, uL, uH)); -} - -// 16-bit x4 -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec64 ConcatEven(D /*d*/, Vec64 hi, Vec64 lo) { - // Don't care about upper half, no need to zero. - const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21}; - return Vec64{vec_perm(lo.raw, hi.raw, kCompactEvenU16)}; -} - -// 32-bit full -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { -#if HWY_IS_LITTLE_ENDIAN - const Repartition dw; - const RebindToUnsigned du; - return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi))); -#else - (void)d; - constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11, - 16, 17, 18, 19, 24, 25, 26, 27}; - return Vec128{vec_perm(lo.raw, hi.raw, kShuffle)}; -#endif -} - -// Any T x2 -template , HWY_IF_LANES_D(D, 2)> -HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { - return InterleaveLower(d, lo, hi); -} - -// ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd) -#ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#else -#define HWY_NATIVE_ORDERED_TRUNCATE_2_TO -#endif - -template ) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedTruncate2To(D d, V a, V b) { -#if HWY_IS_LITTLE_ENDIAN - return ConcatEven(d, BitCast(d, b), BitCast(d, a)); -#else - return ConcatOdd(d, BitCast(d, b), BitCast(d, a)); -#endif -} - -// ------------------------------ DupEven (InterleaveLower) - -template -HWY_API Vec128 DupEven(Vec128 v) { - return v; -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - return InterleaveLower(DFromV(), v, v); -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - const DFromV d; - const Repartition du8; - constexpr __vector unsigned char kShuffle = {0, 0, 2, 2, 4, 4, 6, 6, - 8, 8, 10, 10, 12, 12, 14, 14}; - return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - const DFromV d; - const Repartition du8; - constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4, 5, 4, 5, - 8, 9, 8, 9, 12, 13, 12, 13}; - return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - return Vec128{vec_mergee(v.raw, v.raw)}; -} - -// ------------------------------ DupOdd (InterleaveUpper) - -template -HWY_API Vec128 DupOdd(Vec128 v) { - const DFromV d; - const Repartition du8; - constexpr __vector unsigned char kShuffle = {1, 1, 3, 3, 5, 5, 7, 7, - 9, 9, 11, 11, 13, 13, 15, 15}; - return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - const DFromV d; - const Repartition du8; - constexpr __vector unsigned char kShuffle = {2, 3, 2, 3, 6, 7, 6, 7, - 10, 11, 10, 11, 14, 15, 14, 15}; - return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return Vec128{vec_mergeo(v.raw, v.raw)}; -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return InterleaveUpper(DFromV(), v, v); -} - -// ------------------------------ OddEven (IfThenElse) - -template -HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { - const DFromV d; - const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, - 0xFF, 0, 0xFF, 0, 0xFF, 0, 
0xFF, 0}; - return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); -} - -template -HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { - const DFromV d; - const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, - 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; - return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); -} - -template -HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { - const DFromV d; - const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, - 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0}; - return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); -} - -template -HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { - // Same as ConcatUpperLower for full vectors; do not call that because this - // is more efficient for 64x1 vectors. - const DFromV d; - const __vector unsigned char mask = { - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0}; - return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); -} - -// ------------------------------ OddEvenBlocks -template -HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { - return even; -} - -// ------------------------------ SwapAdjacentBlocks - -template -HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { - return v; -} - -// ------------------------------ Shl - -namespace detail { -template -HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, - Vec128 bits) { - return Vec128{vec_sl(v.raw, bits.raw)}; -} - -// Signed left shift is the same as unsigned. -template -HWY_API Vec128 Shl(hwy::SignedTag /*tag*/, Vec128 v, - Vec128 bits) { - const DFromV di; - const RebindToUnsigned du; - return BitCast(di, - Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); -} - -} // namespace detail - -template -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return detail::Shl(hwy::TypeTag(), v, bits); -} - -// ------------------------------ Shr - -namespace detail { -template -HWY_API Vec128 Shr(hwy::UnsignedTag /*tag*/, Vec128 v, - Vec128 bits) { - return Vec128{vec_sr(v.raw, bits.raw)}; -} - -template -HWY_API Vec128 Shr(hwy::SignedTag /*tag*/, Vec128 v, - Vec128 bits) { - const DFromV di; - const RebindToUnsigned du; - return Vec128{vec_sra(v.raw, BitCast(du, bits).raw)}; -} - -} // namespace detail - -template -HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { - return detail::Shr(hwy::TypeTag(), v, bits); -} - -// ------------------------------ MulEven/Odd 64x64 (UpperHalf) - -HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) { -#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) - using VU64 = __vector unsigned long long; - const VU64 mul128_result = reinterpret_cast(vec_mule(a.raw, b.raw)); -#if HWY_IS_LITTLE_ENDIAN - return Vec128{mul128_result}; -#else - // Need to swap the two halves of mul128_result on big-endian targets as - // the upper 64 bits of the product are in lane 0 of mul128_result and - // the lower 64 bits of the product are in lane 1 of mul128_result - return Vec128{vec_sld(mul128_result, mul128_result, 8)}; -#endif -#else - alignas(16) uint64_t mul[2]; - mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); - return Load(Full128(), mul); -#endif -} - -HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) { -#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) - using VU64 = __vector unsigned long long; - const VU64 mul128_result = reinterpret_cast(vec_mulo(a.raw, b.raw)); -#if HWY_IS_LITTLE_ENDIAN - return Vec128{mul128_result}; -#else - // Need to swap the two halves of mul128_result on big-endian targets as - // the upper 64 bits of the product are in lane 0 of mul128_result and - // the lower 64 bits of 
the product are in lane 1 of mul128_result - return Vec128{vec_sld(mul128_result, mul128_result, 8)}; -#endif -#else - alignas(16) uint64_t mul[2]; - const Full64 d2; - mul[0] = - Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); - return Load(Full128(), mul); -#endif -} - -// ------------------------------ WidenMulPairwiseAdd - -template >> -HWY_API VFromD WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { - const RebindToUnsigned du32; - // Lane order within sum0/1 is undefined, hence we can avoid the - // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip - // leads to the odd/even order that RearrangeToOddPlusEven prefers. - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), - Mul(BitCast(df32, ao), BitCast(df32, bo))); -} - -// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe. -template >> -HWY_API VFromD WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) { - return VFromD{vec_msum(a.raw, b.raw, Zero(d32).raw)}; -} - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) - -template >> -HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, - VFromD sum0, - VFromD& sum1) { - const RebindToUnsigned du32; - // Lane order within sum0/1 is undefined, hence we can avoid the - // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip - // leads to the odd/even order that RearrangeToOddPlusEven prefers. - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); -} - -// Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe. 
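// The shift/and trick above relies on bfloat16 being exactly the upper 16 bits
// of an IEEE-754 binary32, so "promotion" is just a 16-bit left shift of the
// bit pattern. A scalar restatement (illustrative helpers, not part of this
// header):
#include <cstddef>
#include <cstdint>
#include <cstring>

inline float BF16BitsToF32(uint16_t bits) {
  const uint32_t f32_bits = static_cast<uint32_t>(bits) << 16;
  float f;
  std::memcpy(&f, &f32_bits, sizeof(f));
  return f;
}

// WidenMulPairwiseAdd for bf16: out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1].
// The order of the two products within a pair does not affect the sum, which
// is why the even/odd split above need not match memory order.
inline void WidenMulPairwiseAddScalarBF16(const uint16_t* a, const uint16_t* b,
                                          size_t num_pairs, float* out) {
  for (size_t i = 0; i < num_pairs; ++i) {
    out[i] = BF16BitsToF32(a[2 * i]) * BF16BitsToF32(b[2 * i]) +
             BF16BitsToF32(a[2 * i + 1]) * BF16BitsToF32(b[2 * i + 1]);
  }
}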
-template >> -HWY_API VFromD ReorderWidenMulAccumulate(D32 /* tag */, V16 a, V16 b, - VFromD sum0, - VFromD& /*sum1*/) { - return VFromD{vec_msum(a.raw, b.raw, sum0.raw)}; -} - -// ------------------------------ RearrangeToOddPlusEven -template -HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, - Vec128 /*sum1*/) { - return sum0; // invariant already holds -} - -template -HWY_API Vec128 RearrangeToOddPlusEven( - Vec128 sum0, Vec128 /*sum1*/) { - return sum0; // invariant already holds -} - -template -HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { - return Add(sum0, sum1); -} - -// ------------------------------ SumOfMulQuadAccumulate -#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#endif -template -HWY_API VFromD SumOfMulQuadAccumulate( - DU32 /*du32*/, VFromD> a, - VFromD> b, VFromD sum) { - return VFromD{vec_msum(a.raw, b.raw, sum.raw)}; -} - -#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate( - DI32 /*di32*/, VFromD> a_u, - VFromD> b_i, VFromD sum) { - return VFromD{vec_msum(b_i.raw, a_u.raw, sum.raw)}; -} - -#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#endif -template -HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, - VFromD> a, - VFromD> b, - VFromD sum) { - const Repartition du8; - - const auto result_sum_0 = - SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum); - const auto result_sum_1 = ShiftLeft<8>(detail::AltivecVsum4sbs( - di32, And(b, BroadcastSignBit(a)).raw, Zero(di32).raw)); - return result_sum_0 - result_sum_1; -} - -// ================================================== CONVERT - -// ------------------------------ Promotions (part w/ narrow lanes -> full) - -// Unsigned to signed/unsigned: zero-extend. -template -HWY_API VFromD PromoteTo(D /* d */, - Vec128().MaxLanes()> v) { - // First pretend the input has twice the lanes - the upper half will be - // ignored by ZipLower. - const Rebind> d2; - const VFromD twice{v.raw}; - // Then cast to narrow as expected by ZipLower, in case the sign of FromT - // differs from that of D. - const RepartitionToNarrow dn; - -#if HWY_IS_LITTLE_ENDIAN - return ZipLower(BitCast(dn, twice), Zero(dn)); -#else - return ZipLower(Zero(dn), BitCast(dn, twice)); -#endif -} - -// Signed: replicate sign bit. -template -HWY_API VFromD PromoteTo(D /* d */, - Vec128().MaxLanes()> v) { - using Raw = typename detail::Raw128>::type; - return VFromD{reinterpret_cast(vec_unpackh(v.raw))}; -} - -// 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit. -template -HWY_API VFromD PromoteTo(D d32, - Vec128().MaxLanes()> v) { - const DFromV d8; - const Rebind, decltype(d8)> d16; - return PromoteTo(d32, PromoteTo(d16, v)); -} - -// 8-bit or 16-bit to 64-bit: First, promote to MakeWide, and then -// convert to 64-bit. -template -HWY_API VFromD PromoteTo(D d64, - Vec128().MaxLanes()> v) { - const Rebind, decltype(d64)> dw; - return PromoteTo(d64, PromoteTo(dw, v)); -} - -#if HWY_PPC_HAVE_9 - -// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. 
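// The signed-i8 overload above rests on a reinterpretation identity: casting a
// negative int8 to uint8 adds 256 to its value, so
//   sum(i8(a)*b) == sum(u8(a)*b) - 256 * sum(b where a < 0),
// and the ShiftLeft<8> supplies the factor of 256. A scalar check of that
// identity for one quad (illustrative helpers, not part of this header):
#include <cstdint>

inline int32_t DotQuadSigned(const int8_t a[4], const int8_t b[4]) {
  int32_t sum = 0;
  for (int k = 0; k < 4; ++k) sum += int32_t(a[k]) * int32_t(b[k]);
  return sum;
}

inline int32_t DotQuadViaUnsigned(const int8_t a[4], const int8_t b[4]) {
  int32_t unsigned_dot = 0;  // what the u8 x i8 multiply-sum computes
  int32_t correction = 0;    // sum of b lanes whose a lane is negative
  for (int k = 0; k < 4; ++k) {
    unsigned_dot += int32_t(uint8_t(a[k])) * int32_t(b[k]);
    if (a[k] < 0) correction += int32_t(b[k]);
  }
  return unsigned_dot - correction * 256;  // agrees with DotQuadSigned
}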
-#ifdef HWY_NATIVE_F16C -#undef HWY_NATIVE_F16C -#else -#define HWY_NATIVE_F16C -#endif - -template -HWY_INLINE VFromD PromoteTo(D /*tag*/, VFromD> v) { - return VFromD{vec_extract_fp32_from_shorth(v.raw)}; -} - -#endif // HWY_PPC_HAVE_9 - -template -HWY_API VFromD PromoteTo(D df32, VFromD> v) { - const Rebind du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - const __vector float raw_v = InterleaveLower(v, v).raw; -#if HWY_IS_LITTLE_ENDIAN - return VFromD{vec_doubleo(raw_v)}; -#else - return VFromD{vec_doublee(raw_v)}; -#endif -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - const __vector signed int raw_v = InterleaveLower(v, v).raw; -#if HWY_IS_LITTLE_ENDIAN - return VFromD{vec_doubleo(raw_v)}; -#else - return VFromD{vec_doublee(raw_v)}; -#endif -} - -// ------------------------------ Demotions (full -> part w/ narrow lanes) - -template ) * 2)> -HWY_API VFromD DemoteTo(D /* tag */, - Vec128().MaxLanes()> v) { - return VFromD{vec_packsu(v.raw, v.raw)}; -} - -template ) * 2)> -HWY_API VFromD DemoteTo(D /* tag */, - Vec128().MaxLanes()> v) { - return VFromD{vec_packs(v.raw, v.raw)}; -} - -template ) * 2)> -HWY_API VFromD DemoteTo(D /* tag */, - Vec128().MaxLanes()> v) { - return VFromD{vec_packs(v.raw, v.raw)}; -} - -template = sizeof(TFromD) * 4)>* = nullptr> -HWY_API VFromD DemoteTo(D d, - Vec128().MaxLanes()> v) { - const Rebind, D> d2; - return DemoteTo(d, DemoteTo(d2, v)); -} - -template = sizeof(TFromD) * 4)>* = nullptr> -HWY_API VFromD DemoteTo(D d, - Vec128().MaxLanes()> v) { - const Rebind, D> d2; - return DemoteTo(d, DemoteTo(d2, v)); -} - -template = sizeof(TFromD) * 4)>* = nullptr> -HWY_API VFromD DemoteTo(D d, - Vec128().MaxLanes()> v) { - const Rebind>, D> d2; - return DemoteTo(d, DemoteTo(d2, v)); -} - -#if HWY_PPC_HAVE_9 && \ - (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)) - -// We already toggled HWY_NATIVE_F16C above. - -template -HWY_API VFromD DemoteTo(D df16, VFromD> v) { -// Avoid vec_pack_to_short_fp32 on Clang because its implementation is buggy. 
-#if HWY_COMPILER_GCC_ACTUAL - (void)df16; - return VFromD{vec_pack_to_short_fp32(v.raw, v.raw)}; -#elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp) - // Work around bug in the clang implementation of vec_pack_to_short_fp32 - // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets - // if the __builtin_vsx_xvcvsphp intrinsic is available - const RebindToUnsigned du16; - const Rebind du; - const VFromD bits16{ - reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))}; - return BitCast(df16, TruncateTo(du16, bits16)); -#else -#error "Only define the function if we have a native implementation" -#endif -} - -#endif // HWY_PPC_HAVE_9 - -template -HWY_API VFromD DemoteTo(D dbf16, VFromD> v) { - const Rebind du32; // for logical shift right - const Rebind du16; - const auto bits_in_32 = ShiftRight<16>(BitCast(du32, v)); - return BitCast(dbf16, TruncateTo(du16, bits_in_32)); -} - -template >> -HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) { - const RebindToUnsigned du16; - const Repartition du32; -#if HWY_IS_LITTLE_ENDIAN - const auto a_in_odd = a; - const auto b_in_even = ShiftRight<16>(BitCast(du32, b)); -#else - const auto a_in_odd = ShiftRight<16>(BitCast(du32, a)); - const auto b_in_even = b; -#endif - return BitCast(dbf16, - OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even))); -} - -// Specializations for partial vectors because vec_packs sets lanes above 2*N. -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const Twice dn_full; - const Repartition du32_full; - - const VFromD v_full{vec_packs(a.raw, b.raw)}; - const auto vu32_full = BitCast(du32_full, v_full); - return LowerHalf( - BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); -} -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN /*dn*/, V a, V b) { - return VFromD{vec_packs(a.raw, b.raw)}; -} - -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const Twice dn_full; - const Repartition du32_full; - - const VFromD v_full{vec_packsu(a.raw, b.raw)}; - const auto vu32_full = BitCast(du32_full, v_full); - return LowerHalf( - BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); -} -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN /*dn*/, V a, V b) { - return VFromD{vec_packsu(a.raw, b.raw)}; -} - -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const Twice dn_full; - const Repartition du32_full; - - const VFromD v_full{vec_packs(a.raw, b.raw)}; - const auto vu32_full = BitCast(du32_full, v_full); - return LowerHalf( - BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); -} -template ) * 2)> -HWY_API VFromD ReorderDemote2To(DN /*dn*/, V a, V b) { - return VFromD{vec_packs(a.raw, b.raw)}; -} - -template ), class V, - HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - return ReorderDemote2To(d, a, b); -} - -template >> -HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) { - const RebindToUnsigned du16; -#if 
HWY_IS_LITTLE_ENDIAN - return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); -#else - return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a))); -#endif -} - -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { - return Vec32{vec_floate(v.raw)}; -} - -template -HWY_API Vec64 DemoteTo(D d, Vec128 v) { -#if HWY_IS_LITTLE_ENDIAN - const Vec128 f64_to_f32{vec_floate(v.raw)}; -#else - const Vec128 f64_to_f32{vec_floato(v.raw)}; -#endif - - const RebindToUnsigned du; - const Rebind du64; - return Vec64{ - BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw}; -} - -template -HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { - return Vec32{vec_signede(v.raw)}; -} - -template -HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { -#if HWY_IS_LITTLE_ENDIAN - const Vec128 f64_to_i32{vec_signede(v.raw)}; -#else - const Vec128 f64_to_i32{vec_signedo(v.raw)}; -#endif - - const Rebind di64; - const Vec128 vi64 = BitCast(di64, f64_to_i32); - return Vec64{vec_pack(vi64.raw, vi64.raw)}; -} - -// For already range-limited input [0, 255]. -template -HWY_API Vec128 U8FromU32(Vec128 v) { - const Rebind> du16; - const Rebind du8; - return TruncateTo(du8, TruncateTo(du16, v)); -} -// ------------------------------ Integer <=> fp (ShiftRight, OddEven) - -// Note: altivec.h vec_ct* currently contain C casts which triggers -// -Wdeprecate-lax-vec-conv-all warnings, so disable them. - -template -HWY_API VFromD ConvertTo(D /* tag */, - Vec128().MaxLanes()> v) { - HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_CLANG - HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") -#endif - return VFromD{vec_ctf(v.raw, 0)}; - HWY_DIAGNOSTICS(pop) -} - -template -HWY_API VFromD ConvertTo(D /* tag */, - Vec128().MaxLanes()> v) { - return VFromD{vec_double(v.raw)}; -} - -// Truncates (rounds toward zero). 
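// The narrowing ops above come in two flavors: DemoteTo (vec_packs/vec_packsu)
// saturates to the target range, whereas TruncateTo and U8FromU32 simply keep
// the low bits, which is exact only when the input is already in range. A
// scalar contrast (illustrative helpers, not part of this header):
#include <cstdint>

inline uint8_t DemoteI32ToU8(int32_t v) {
  if (v < 0) return 0;  // unsigned saturation, as in vec_packsu
  if (v > 255) return 255;
  return static_cast<uint8_t>(v);
}

inline uint8_t TruncateU32ToU8(uint32_t v) {
  return static_cast<uint8_t>(v & 0xFFu);  // modulo 256, as in TruncateTo
}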
-template -HWY_API VFromD ConvertTo(D /* tag */, - Vec128().MaxLanes()> v) { - HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_CLANG - HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") -#endif - return VFromD{vec_cts(v.raw, 0)}; - HWY_DIAGNOSTICS(pop) -} - -template -HWY_API VFromD ConvertTo(D /* tag */, - Vec128().MaxLanes()> v) { - HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_CLANG - HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") -#endif - return VFromD{vec_ctu(v.raw, 0)}; - HWY_DIAGNOSTICS(pop) -} - -template -HWY_API Vec128 NearestInt(Vec128 v) { - HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_CLANG - HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") -#endif - return Vec128{vec_cts(vec_round(v.raw), 0)}; - HWY_DIAGNOSTICS(pop) -} - -// ------------------------------ Floating-point rounding (ConvertTo) - -// Toward nearest integer, ties to even -template -HWY_API Vec128 Round(Vec128 v) { - return Vec128{vec_round(v.raw)}; -} - -template -HWY_API Vec128 Round(Vec128 v) { - return Vec128{vec_rint(v.raw)}; -} - -// Toward zero, aka truncate -template -HWY_API Vec128 Trunc(Vec128 v) { - return Vec128{vec_trunc(v.raw)}; -} - -// Toward +infinity, aka ceiling -template -HWY_API Vec128 Ceil(Vec128 v) { - return Vec128{vec_ceil(v.raw)}; -} - -// Toward -infinity, aka floor -template -HWY_API Vec128 Floor(Vec128 v) { - return Vec128{vec_floor(v.raw)}; -} - -// ------------------------------ Floating-point classification - -template -HWY_API Mask128 IsNaN(Vec128 v) { - static_assert(IsFloat(), "Only for float"); - return v != v; -} - -template -HWY_API Mask128 IsInf(Vec128 v) { - static_assert(IsFloat(), "Only for float"); - using TU = MakeUnsigned; - const DFromV d; - const RebindToUnsigned du; - const VFromD vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask( - d, - Eq(Add(vu, vu), Set(du, static_cast(hwy::MaxExponentTimes2())))); -} - -// Returns whether normal/subnormal/zero. -template -HWY_API Mask128 IsFinite(Vec128 v) { - static_assert(IsFloat(), "Only for float"); - using TU = MakeUnsigned; - const DFromV d; - const RebindToUnsigned du; - const VFromD vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, check for exponent(hwy::MaxExponentTimes2())))); -} - -// ================================================== CRYPTO - -#if !defined(HWY_DISABLE_PPC8_CRYPTO) - -// Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
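// The classification ops above avoid a separate sign-bit mask by doubling the
// bit pattern: u + u discards the sign bit, leaving only the biased exponent
// and mantissa to compare against MaxExponentTimes2 (0xFF000000u for
// binary32). Scalar restatement (illustrative helpers, not part of this
// header):
#include <cstdint>
#include <cstring>

inline bool IsInfScalar(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return (u + u) == 0xFF000000u;  // exponent all-ones and mantissa zero
}

inline bool IsFiniteScalar(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return (u + u) < 0xFF000000u;  // exponent below all-ones: normal/subnormal/zero
}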
-#ifdef HWY_NATIVE_AES -#undef HWY_NATIVE_AES -#else -#define HWY_NATIVE_AES -#endif - -namespace detail { -#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600 -using CipherTag = Full128; -#else -using CipherTag = Full128; -#endif // !HWY_COMPILER_CLANG -using CipherVec = VFromD; -} // namespace detail - -HWY_API Vec128 AESRound(Vec128 state, - Vec128 round_key) { - const detail::CipherTag dc; - const Full128 du8; -#if HWY_IS_LITTLE_ENDIAN - return Reverse(du8, - BitCast(du8, detail::CipherVec{vec_cipher_be( - BitCast(dc, Reverse(du8, state)).raw, - BitCast(dc, Reverse(du8, round_key)).raw)})); -#else - return BitCast(du8, detail::CipherVec{vec_cipher_be( - BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); -#endif -} - -HWY_API Vec128 AESLastRound(Vec128 state, - Vec128 round_key) { - const detail::CipherTag dc; - const Full128 du8; -#if HWY_IS_LITTLE_ENDIAN - return Reverse(du8, - BitCast(du8, detail::CipherVec{vec_cipherlast_be( - BitCast(dc, Reverse(du8, state)).raw, - BitCast(dc, Reverse(du8, round_key)).raw)})); -#else - return BitCast(du8, detail::CipherVec{vec_cipherlast_be( - BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); -#endif -} - -HWY_API Vec128 AESRoundInv(Vec128 state, - Vec128 round_key) { - const detail::CipherTag dc; - const Full128 du8; -#if HWY_IS_LITTLE_ENDIAN - return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be( - BitCast(dc, Reverse(du8, state)).raw, - Zero(dc).raw)})), - round_key); -#else - return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be( - BitCast(dc, state).raw, Zero(dc).raw)}), - round_key); -#endif -} - -HWY_API Vec128 AESLastRoundInv(Vec128 state, - Vec128 round_key) { - const detail::CipherTag dc; - const Full128 du8; -#if HWY_IS_LITTLE_ENDIAN - return Reverse(du8, - BitCast(du8, detail::CipherVec{vec_ncipherlast_be( - BitCast(dc, Reverse(du8, state)).raw, - BitCast(dc, Reverse(du8, round_key)).raw)})); -#else - return BitCast(du8, detail::CipherVec{vec_ncipherlast_be( - BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); -#endif -} - -HWY_API Vec128 AESInvMixColumns(Vec128 state) { - const Full128 du8; - const auto zero = Zero(du8); - - // PPC8/PPC9/PPC10 does not have a single instruction for the AES - // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do. - - // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10 - // by doing an AESLastRound operation with a zero round_key followed by an - // AESRoundInv operation with a zero round_key. - return AESRoundInv(AESLastRound(state, zero), zero); -} - -template -HWY_API Vec128 AESKeyGenAssist(Vec128 v) { - constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0, - 0, 0, 0, 0, kRcon, 0, 0, 0}; - constexpr __vector unsigned char kRotWordShuffle = { - 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12}; - const detail::CipherTag dc; - const Full128 du8; - const auto sub_word_result = - BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)}); - const auto rot_word_result = - TableLookupBytes(sub_word_result, Vec128{kRotWordShuffle}); - return Xor(rot_word_result, Vec128{kRconXorMask}); -} - -template -HWY_API Vec128 CLMulLower(Vec128 a, - Vec128 b) { - // NOTE: Lane 1 of both a and b need to be zeroed out for the - // vec_pmsum_be operation below as the vec_pmsum_be operation - // does a carryless multiplication of each 64-bit half and then - // adds the two halves using an bitwise XOR operation. 
- - const DFromV d; - const auto zero = Zero(d); - - using VU64 = __vector unsigned long long; - const VU64 pmsum_result = reinterpret_cast( - vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw)); - -#if HWY_IS_LITTLE_ENDIAN - return Vec128{pmsum_result}; -#else - // Need to swap the two halves of pmsum_result on big-endian targets as - // the upper 64 bits of the carryless multiplication result are in lane 0 of - // pmsum_result and the lower 64 bits of the carryless multiplication result - // are in lane 1 of mul128_result - return Vec128{vec_sld(pmsum_result, pmsum_result, 8)}; -#endif -} - -template -HWY_API Vec128 CLMulUpper(Vec128 a, - Vec128 b) { - // NOTE: Lane 0 of both a and b need to be zeroed out for the - // vec_pmsum_be operation below as the vec_pmsum_be operation - // does a carryless multiplication of each 64-bit half and then - // adds the two halves using an bitwise XOR operation. - - const DFromV d; - const auto zero = Zero(d); - - using VU64 = __vector unsigned long long; - const VU64 pmsum_result = reinterpret_cast( - vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw))); - -#if HWY_IS_LITTLE_ENDIAN - return Vec128{pmsum_result}; -#else - // Need to swap the two halves of pmsum_result on big-endian targets as - // the upper 64 bits of the carryless multiplication result are in lane 0 of - // pmsum_result and the lower 64 bits of the carryless multiplication result - // are in lane 1 of mul128_result - return Vec128{vec_sld(pmsum_result, pmsum_result, 8)}; -#endif -} - -#endif // !defined(HWY_DISABLE_PPC8_CRYPTO) - -// ================================================== MISC - -// ------------------------------ LoadMaskBits (TestBit) - -namespace detail { - -template -HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) { -#if HWY_PPC_HAVE_10 - const Vec128 mask_vec{vec_genbm(mask_bits)}; - -#if HWY_IS_LITTLE_ENDIAN - return MFromD{MaskFromVec(mask_vec).raw}; -#else - return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw}; -#endif // HWY_IS_LITTLE_ENDIAN - -#else // PPC9 or earlier - const Full128 du8; - const Full128 du16; - const Vec128 vbits = - BitCast(du8, Set(du16, static_cast(mask_bits))); - - // Replicate bytes 8x such that each byte contains the bit that governs it. 
-#if HWY_IS_LITTLE_ENDIAN - const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1}; -#else - const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0}; -#endif // HWY_IS_LITTLE_ENDIAN - - const Vec128 rep8{vec_perm(vbits.raw, vbits.raw, kRep8)}; - const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128, - 1, 2, 4, 8, 16, 32, 64, 128}; - return MFromD{TestBit(rep8, Vec128{kBit}).raw}; -#endif // HWY_PPC_HAVE_10 -} - -template -HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) { -#if HWY_PPC_HAVE_10 - const Vec128 mask_vec{vec_genhm(mask_bits)}; - -#if HWY_IS_LITTLE_ENDIAN - return MFromD{MaskFromVec(mask_vec).raw}; -#else - return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw}; -#endif // HWY_IS_LITTLE_ENDIAN - -#else // PPC9 or earlier - const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128}; - const auto vmask_bits = - Set(Full128(), static_cast(mask_bits)); - return MFromD{TestBit(vmask_bits, Vec128{kBit}).raw}; -#endif // HWY_PPC_HAVE_10 -} - -template -HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) { -#if HWY_PPC_HAVE_10 - const Vec128 mask_vec{vec_genwm(mask_bits)}; - -#if HWY_IS_LITTLE_ENDIAN - return MFromD{MaskFromVec(mask_vec).raw}; -#else - return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw}; -#endif // HWY_IS_LITTLE_ENDIAN - -#else // PPC9 or earlier - const __vector unsigned int kBit = {1, 2, 4, 8}; - const auto vmask_bits = - Set(Full128(), static_cast(mask_bits)); - return MFromD{TestBit(vmask_bits, Vec128{kBit}).raw}; -#endif // HWY_PPC_HAVE_10 -} - -template -HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) { -#if HWY_PPC_HAVE_10 - const Vec128 mask_vec{vec_gendm(mask_bits)}; - -#if HWY_IS_LITTLE_ENDIAN - return MFromD{MaskFromVec(mask_vec).raw}; -#else - return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw}; -#endif // HWY_IS_LITTLE_ENDIAN - -#else // PPC9 or earlier - const __vector unsigned long long kBit = {1, 2}; - const auto vmask_bits = - Set(Full128(), static_cast(mask_bits)); - return MFromD{TestBit(vmask_bits, Vec128{kBit}).raw}; -#endif // HWY_PPC_HAVE_10 -} - -} // namespace detail - -// `p` points to at least 8 readable bytes, not all of which need be valid. -template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - // If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t - uint64_t mask_bits = bits[0]; - - constexpr size_t kN = MaxLanes(d); - if (kN < 8) mask_bits &= (1u << kN) - 1; - - return detail::LoadMaskBits128(d, mask_bits); -} - -template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - // First, copy the mask bits to a uint16_t as there as there are at most - // 16 lanes in a vector. - - // Copying the mask bits to a uint16_t first will also ensure that the - // mask bits are loaded into the lower 16 bits on big-endian PPC targets. - uint16_t u16_mask_bits; - CopyBytes(bits, &u16_mask_bits); - -#if HWY_IS_LITTLE_ENDIAN - return detail::LoadMaskBits128(d, u16_mask_bits); -#else - // On big-endian targets, u16_mask_bits need to be byte swapped as bits - // contains the mask bits in little-endian byte order - - // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a - // single lhbrx instruction on big-endian PPC targets when optimizations - // are enabled. 
-#if HWY_HAS_BUILTIN(__builtin_bswap16) - return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits)); -#else - return detail::LoadMaskBits128( - d, static_cast((u16_mask_bits << 8) | (u16_mask_bits >> 8))); -#endif -#endif -} - -template -struct CompressIsPartition { - // generic_ops-inl does not guarantee IsPartition for 8-bit. - enum { value = (sizeof(T) != 1) }; -}; - -// ------------------------------ StoreMaskBits - -namespace detail { - -#if !HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN -// fallback for missing vec_extractm -template -HWY_INLINE uint64_t ExtractSignBits(Vec128 sign_bits, - __vector unsigned char bit_shuffle) { - // clang POWER8 and 9 targets appear to differ in their return type of - // vec_vbpermq: unsigned or signed, so cast to avoid a warning. - using VU64 = detail::Raw128::type; - const Vec128 extracted{ - reinterpret_cast(vec_vbpermq(sign_bits.raw, bit_shuffle))}; - return extracted.raw[HWY_IS_LITTLE_ENDIAN]; -} - -#endif // !HWY_PPC_HAVE_10 - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { - const DFromM d; - const Repartition du8; - const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); -#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - return static_cast(vec_extractm(sign_bits.raw)); -#else - const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64, - 56, 48, 40, 32, 24, 16, 8, 0}; - return ExtractSignBits(sign_bits, kBitShuffle); -#endif // HWY_PPC_HAVE_10 -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { - const DFromM d; - const Repartition du8; - const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); - -#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - const RebindToUnsigned du; - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); -#else -#if HWY_IS_LITTLE_ENDIAN - const __vector unsigned char kBitShuffle = { - 112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128}; -#else - const __vector unsigned char kBitShuffle = { - 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0}; -#endif - return ExtractSignBits(sign_bits, kBitShuffle); -#endif // HWY_PPC_HAVE_10 -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { - const DFromM d; - const Repartition du8; - const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); -#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - const RebindToUnsigned du; - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); -#else -#if HWY_IS_LITTLE_ENDIAN - const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128, - 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128}; -#else - const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, - 96, 64, 32, 0}; -#endif - return ExtractSignBits(sign_bits, kBitShuffle); -#endif // HWY_PPC_HAVE_10 -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { - const DFromM d; - const Repartition du8; - const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); -#if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN - const RebindToUnsigned du; - return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); -#else -#if HWY_IS_LITTLE_ENDIAN - const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128}; -#else - const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, - 128, 128, 64, 0}; -#endif - return 
ExtractSignBits(sign_bits, kBitShuffle); -#endif // HWY_PPC_HAVE_10 -} - -// Returns the lowest N of the mask bits. -template -constexpr uint64_t OnlyActive(uint64_t mask_bits) { - return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); -} - -template -HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -} // namespace detail - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { - // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask - // to an uint8_t and store the result in bits[0]. - bits[0] = static_cast(detail::BitsFromMask(mask)); - return sizeof(uint8_t); -} - -template -HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { - const auto mask_bits = detail::BitsFromMask(mask); - - // First convert mask_bits to a uint16_t as we only want to store - // the lower 16 bits of mask_bits as there are 16 lanes in mask. - - // Converting mask_bits to a uint16_t first will also ensure that - // the lower 16 bits of mask_bits are stored instead of the upper 16 bits - // of mask_bits on big-endian PPC targets. -#if HWY_IS_LITTLE_ENDIAN - const uint16_t u16_mask_bits = static_cast(mask_bits); -#else - // On big-endian targets, the bytes of mask_bits need to be swapped - // as StoreMaskBits expects the mask bits to be stored in little-endian - // byte order. - - // GCC will also optimize the byte swap and CopyBytes operations below - // to a single sthbrx instruction when optimizations are enabled on - // big-endian PPC targets -#if HWY_HAS_BUILTIN(__builtin_bswap16) - const uint16_t u16_mask_bits = - __builtin_bswap16(static_cast(mask_bits)); -#else - const uint16_t u16_mask_bits = static_cast( - (mask_bits << 8) | (static_cast(mask_bits) >> 8)); -#endif -#endif - - CopyBytes(&u16_mask_bits, bits); - return sizeof(uint16_t); -} - -// ------------------------------ Mask testing - -template -HWY_API bool AllFalse(D d, MFromD mask) { - const RebindToUnsigned du; - return static_cast(vec_all_eq(RebindMask(du, mask).raw, Zero(du).raw)); -} - -template -HWY_API bool AllTrue(D d, MFromD mask) { - const RebindToUnsigned du; - using TU = TFromD; - return static_cast( - vec_all_eq(RebindMask(du, mask).raw, Set(du, hwy::LimitsMax()).raw)); -} - -template -HWY_API bool AllFalse(D d, MFromD mask) { - const Full128> d_full; - constexpr size_t kN = MaxLanes(d); - return AllFalse(d_full, MFromD{ - vec_and(mask.raw, FirstN(d_full, kN).raw)}); -} - -template -HWY_API bool AllTrue(D d, MFromD mask) { - const Full128> d_full; - constexpr size_t kN = MaxLanes(d); - return AllTrue(d_full, MFromD{ - vec_or(mask.raw, Not(FirstN(d_full, kN)).raw)}); -} - -template -HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); -} - -#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) -namespace detail { - -template -static HWY_INLINE size_t VsxCntlzLsbb(V v) { -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \ - HWY_IS_LITTLE_ENDIAN - // Use inline assembly to work around bug in GCC 11 and earlier on - // little-endian PPC9 - int idx; - __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw)); - return static_cast(idx); -#else - return static_cast(vec_cntlz_lsbb(v.raw)); -#endif -} - -template -static HWY_INLINE size_t VsxCnttzLsbb(V v) { -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \ - HWY_IS_LITTLE_ENDIAN - // Use inline assembly to work around bug in GCC 
11 and earlier on - // little-endian PPC9 - int idx; - __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw)); - return static_cast(idx); -#else - return static_cast(vec_cnttz_lsbb(v.raw)); -#endif -} - -} // namespace detail -#endif - -template > -HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { -// For little-endian PPC10, BitsFromMask is already efficient. -#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - if (detail::IsFull(d)) { - const Repartition d8; - const auto bytes = BitCast(d8, VecFromMask(d, mask)); - return detail::VsxCntlzLsbb(bytes) / sizeof(T); - } -#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask)); -} - -template > -HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { -// For little-endian PPC10, BitsFromMask is already efficient. -#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - constexpr size_t kN = 16 / sizeof(T); - if (detail::IsFull(d)) { - const Repartition d8; - const auto bytes = BitCast(d8, VecFromMask(d, mask)); - const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T); - return idx == kN ? -1 : static_cast(idx); - } -#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - const uint64_t mask_bits = detail::BitsFromMask(mask); - return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; -} - -template > -HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { -// For little-endian PPC10, BitsFromMask is already efficient. -#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - if (detail::IsFull(d)) { - const Repartition d8; - const auto bytes = BitCast(d8, VecFromMask(d, mask)); - const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T); - return 16 / sizeof(T) - 1 - idx; - } -#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask)); -} - -template > -HWY_API intptr_t FindLastTrue(D d, MFromD mask) { -// For little-endian PPC10, BitsFromMask is already efficient. -#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - constexpr size_t kN = 16 / sizeof(T); - if (detail::IsFull(d)) { - const Repartition d8; - const auto bytes = BitCast(d8, VecFromMask(d, mask)); - const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T); - return idx == kN ? -1 : static_cast(kN - 1 - idx); - } -#endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) - (void)d; - const uint64_t mask_bits = detail::BitsFromMask(mask); - return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits)) - : -1; -} - -// ------------------------------ Compress, CompressBits - -namespace detail { - -// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. -template -HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Rebind d8; - const Twice d8t; - const RebindToUnsigned du; - - // To reduce cache footprint, store lane indices and convert to byte indices - // (2*lane + 0..1), with the doubling baked into the table. It's not clear - // that the additional cost of unpacking nibbles is worthwhile. 
- alignas(16) static constexpr uint8_t table[2048] = { - // PrintCompress16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // - 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // - 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // - 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // - 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // - 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // - 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // - 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // - 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // - 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // - 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // - 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // - 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // - 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // - 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // - 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // - 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // - 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // - 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // - 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // - 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // - 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // - 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // - 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // - 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // - 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // - 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // - 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // - 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // - 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // - 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // - 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // - 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // - 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // - 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // - 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // - 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // - 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // - 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // - 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // - 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // - 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // - 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // - 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // - 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // - 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // - 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // - 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // - 4, 6, 10, 
12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // - 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // - 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // - 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // - 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // - 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // - 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // - 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // - 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // - 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // - 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // - 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // - 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // - 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // - 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // - 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // - 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // - 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // - 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // - 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // - 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // - 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // - 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // - 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // - 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // - 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // - 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // - 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // - 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // - 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // - 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // - 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // - 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // - 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // - 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // - 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // - 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // - 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // - 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // - 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // - 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // - 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // - 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // - 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // - 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // - 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // - 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // - 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // - 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // - 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // - 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // - 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // - 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // - 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // - 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 
12, 14, 4, 10, // - 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // - 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // - 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // - 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // - 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // - 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // - 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // - 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // - 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // - 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // - 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // - 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // - 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // - 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // - 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // - 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // - 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // - 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; - const VFromD pairs = ZipLower(byte_idx, byte_idx); - constexpr uint16_t kPairIndexIncrement = - HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001; - - return BitCast(d, pairs + Set(du, kPairIndexIncrement)); -} - -template -HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Rebind d8; - const Twice d8t; - const RebindToUnsigned du; - - // To reduce cache footprint, store lane indices and convert to byte indices - // (2*lane + 0..1), with the doubling baked into the table. It's not clear - // that the additional cost of unpacking nibbles is worthwhile. 
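A minimal scalar sketch of the pair-expansion step used by these 16-bit index tables, assuming little-endian byte order as in the kPairIndexIncrement comment above; ExpandLaneToByteIndices is a hypothetical helper, not part of Highway. The tables store the byte offset of each selected 16-bit lane (2*lane); ZipLower of the row with itself duplicates each byte into a u16 pair, and adding 0x0100 turns the pair into the two consecutive byte indices {2*lane, 2*lane + 1} consumed by TableLookupBytes.

#include <array>
#include <cstdint>
#include <cstdio>

// Expands 8 per-lane byte offsets (already doubled, i.e. 2*lane) into 16
// byte indices, mimicking ZipLower(byte_idx, byte_idx) + 0x0100 on
// little-endian: each u16 pair becomes {2*lane, 2*lane + 1}.
std::array<uint8_t, 16> ExpandLaneToByteIndices(
    const std::array<uint8_t, 8>& lane_idx) {
  std::array<uint8_t, 16> byte_idx{};
  for (size_t i = 0; i < 8; ++i) {
    byte_idx[2 * i + 0] = lane_idx[i];                             // low byte
    byte_idx[2 * i + 1] = static_cast<uint8_t>(lane_idx[i] + 1);   // high byte
  }
  return byte_idx;
}

int main() {
  // A row like the "2, 4, 6, 8, 10, 12, 14, 0" entry above (all lanes except
  // lane 0 selected) expands to byte indices 2..15 followed by 0, 1.
  const std::array<uint8_t, 8> row = {2, 4, 6, 8, 10, 12, 14, 0};
  for (uint8_t b : ExpandLaneToByteIndices(row)) printf("%d ", b);
  printf("\n");  // 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 1
}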
- alignas(16) static constexpr uint8_t table[2048] = { - // PrintCompressNot16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // - 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // - 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // - 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // - 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // - 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // - 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // - 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // - 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // - 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // - 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // - 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // - 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // - 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // - 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // - 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // - 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // - 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // - 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // - 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // - 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // - 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // - 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // - 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // - 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // - 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // - 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // - 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // - 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // - 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // - 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // - 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // - 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // - 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // - 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // - 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // - 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // - 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // - 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // - 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // - 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // - 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // - 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // - 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // - 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // - 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // - 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // - 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // - 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // - 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // - 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // - 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // - 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // - 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // - 0, 2, 8, 
14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // - 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // - 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // - 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // - 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // - 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // - 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // - 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // - 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // - 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // - 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // - 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // - 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // - 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // - 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // - 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // - 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // - 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // - 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // - 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // - 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // - 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // - 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // - 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // - 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // - 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // - 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // - 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // - 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // - 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // - 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // - 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // - 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // - 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // - 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // - 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // - 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // - 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // - 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // - 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // - 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // - 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // - 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // - 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // - 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // - 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // - 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // - 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // - 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // - 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // - 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // - 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // - 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // - 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 
2, 6, 8, 12, 14, // - 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // - 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // - 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // - 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // - 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // - 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // - 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // - 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // - 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // - 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // - 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // - 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // - 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; - const VFromD pairs = ZipLower(byte_idx, byte_idx); - constexpr uint16_t kPairIndexIncrement = - HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001; - - return BitCast(d, pairs + Set(du, kPairIndexIncrement)); -} - -template -HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[256] = { - // PrintCompress32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // - 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // - 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // - 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // - 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. 
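For reference, a sketch of how one 8-entry row of these 16-bit compress tables can be derived from the mask bits, which is presumably what the PrintCompress16x8Tables / PrintCompressNot16x8Tables generators named above emit; CompressRow is an illustrative helper, not part of Highway. Selected lanes come first in order, then the unselected lanes, each index doubled because a 16-bit lane starts at byte offset 2*lane.

#include <cstdint>
#include <cstdio>
#include <vector>

// Byte offsets (2*lane) of the selected lanes in order, followed by the
// offsets of the unselected lanes. For the "Not" tables, invert the mask.
std::vector<uint8_t> CompressRow(uint64_t mask_bits) {
  std::vector<uint8_t> row;
  for (int i = 0; i < 8; ++i)
    if (mask_bits & (1u << i)) row.push_back(static_cast<uint8_t>(2 * i));
  for (int i = 0; i < 8; ++i)
    if (!(mask_bits & (1u << i))) row.push_back(static_cast<uint8_t>(2 * i));
  return row;
}

int main() {
  for (uint8_t b : CompressRow(0x05)) printf("%d ", b);  // lanes 0 and 2 first
  printf("\n");  // 0 4 2 6 8 10 12 14
}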
- alignas(16) static constexpr uint8_t u8_indices[256] = { - // PrintCompressNot32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, - 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[64] = { - // PrintCompress64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[64] = { - // PrintCompressNot64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_API Vec128 CompressBits(Vec128 v, uint64_t mask_bits) { - const DFromV d; - const RebindToUnsigned du; - - HWY_DASSERT(mask_bits < (1ull << N)); - const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); -} - -template -HWY_API Vec128 CompressNotBits(Vec128 v, uint64_t mask_bits) { - const DFromV d; - const RebindToUnsigned du; - - HWY_DASSERT(mask_bits < (1ull << N)); - const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); - return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); -} - -} // namespace detail - -// Single lane: no-op -template -HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
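A scalar model of the two-lane Compress that follows, to make the swap condition explicit; Compress2 is an illustrative stand-in, not the Highway API. Only the mask {0, 1} (lane 1 selected, lane 0 not) changes the order, which is exactly what AndNot(maskL, maskH) computes in the vector code below.

#include <array>
#include <cstdio>

// The only case that changes the order is mask = {false, true}, which moves
// lane 1 to the front: swap = ~maskL & maskH.
template <typename T>
std::array<T, 2> Compress2(std::array<T, 2> v, bool m0, bool m1) {
  const bool swap = !m0 && m1;
  return swap ? std::array<T, 2>{v[1], v[0]} : v;
}

int main() {
  auto r = Compress2<int>({10, 20}, /*m0=*/false, /*m1=*/true);
  printf("%d %d\n", r[0], r[1]);  // 20 10
}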
- const Full128 d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskL, maskH); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -// General case, 2 or 4 bytes -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, detail::BitsFromMask(mask)); -} - -// ------------------------------ CompressNot - -// Single lane: no-op -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. - const Full128 d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskH, maskL); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -// General case, 2 or 4 bytes -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // For partial vectors, we cannot pull the Not() into the table because - // BitsFromMask clears the upper bits. - if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); - } - return detail::CompressNotBits(v, detail::BitsFromMask(mask)); -} - -// ------------------------------ CompressBlocksNot -HWY_API Vec128 CompressBlocksNot(Vec128 v, - Mask128 /* m */) { - return v; -} - -template -HWY_API Vec128 CompressBits(Vec128 v, - const uint8_t* HWY_RESTRICT bits) { - // As there are at most 8 lanes in v if sizeof(TFromD) > 1, simply - // convert bits[0] to a uint64_t - uint64_t mask_bits = bits[0]; - if (N < 8) { - mask_bits &= (1ull << N) - 1; - } - - return detail::CompressBits(v, mask_bits); -} - -// ------------------------------ CompressStore, CompressBitsStore - -template -HWY_API size_t CompressStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - - const uint64_t mask_bits = detail::BitsFromMask(m); - HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); - const size_t count = PopCount(mask_bits); - - const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); - StoreU(compressed, d, unaligned); - return count; -} - -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - - const uint64_t mask_bits = detail::BitsFromMask(m); - HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); - const size_t count = PopCount(mask_bits); - - const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); - BlendedStore(compressed, FirstN(d, count), d, unaligned); - return count; -} - -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - - // As there are at most 8 lanes in v if sizeof(TFromD) > 1, simply - // convert bits[0] to a uint64_t - uint64_t mask_bits = bits[0]; - constexpr size_t kN = MaxLanes(d); - if (kN < 8) { - mask_bits &= (1ull << kN) - 1; - } - const size_t count = PopCount(mask_bits); - - const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); - StoreU(compressed, d, unaligned); - - return count; -} - -// ------------------------------ 
StoreInterleaved2/3/4 - -// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in -// generic_ops-inl.h. - -// ------------------------------ Additional mask logical operations -namespace detail { - -#if HWY_IS_LITTLE_ENDIAN -template -HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { - return v; -} -template -HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) { - return v; -} -#else -template -HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { - const DFromV d; - return Reverse8(d, v); -} -template -HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { - const DFromV d; - return Reverse4(d, v); -} -template -HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { - const DFromV d; - return Reverse2(d, v); -} -template -HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { - return v; -} -template -HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) { - const DFromV d; - return Reverse(d, v); -} -#endif - -template -HWY_INLINE V I128Subtract(V a, V b) { -#if defined(__SIZEOF_INT128__) - using VU128 = __vector unsigned __int128; - const V diff_i128{reinterpret_cast>::type>( - vec_sub(reinterpret_cast(a.raw), reinterpret_cast(b.raw)))}; -#else - const DFromV d; - const Repartition du64; - - const auto u64_a = BitCast(du64, a); - const auto u64_b = BitCast(du64, b); - - const auto diff_u64 = u64_a - u64_b; - const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b); - -#if HWY_IS_LITTLE_ENDIAN - const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64); -#else - const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64); -#endif - - const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted); -#endif - - return diff_i128; -} - -} // namespace detail - -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const FixedTag d; - const auto vmask = VecFromMask(d, mask); - return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Simd d; - const Full64 d_full64; - - const auto vmask = VecFromMask(d, mask); - const auto vmask_le64 = - BitCast(Full64(), - detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask))); - const auto neg_vmask_le64 = Neg(vmask_le64); - const auto neg_vmask = ResizeBitCast( - d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64))); - - return MaskFromVec(Or(vmask, neg_vmask)); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Full128 d; - auto vmask = VecFromMask(d, mask); - - const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask); - const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128); - const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128); - - return MaskFromVec(BitCast(d, Or(vmask, neg_vmask))); -} - -template -HWY_API Mask128 SetBeforeFirst(Mask128 mask) { - return Not(SetAtOrAfterFirst(mask)); -} - -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const FixedTag d; - const RebindToSigned di; - - const auto vmask = BitCast(di, VecFromMask(d, mask)); - const auto zero = Zero(di); - const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); - return MaskFromVec(BitCast(d, And(vmask, vmask2))); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Simd d; - const Full64 d_full64; - const RebindToSigned di; - - const auto vmask = VecFromMask(d, mask); - const auto vmask_le64 = - BitCast(Full64(), - 
detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask))); - const auto neg_vmask_le64 = Neg(vmask_le64); - const auto neg_vmask = ResizeBitCast( - d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64))); - - const auto first_vmask = BitCast(di, And(vmask, neg_vmask)); - return MaskFromVec(BitCast(d, Or(first_vmask, Neg(first_vmask)))); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Full128 d; - const RebindToSigned di; - - const auto vmask = VecFromMask(d, mask); - const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask); - const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128); - const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128); - - return MaskFromVec(BitCast(d, Neg(BitCast(di, And(vmask, neg_vmask))))); -} - -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { - const FixedTag d; - const RebindToSigned di; - using TI = MakeSigned; - - return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); -} -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { - const Simd d; - return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); -} - -// ------------------------------ Reductions - -namespace detail { - -// N=1 for any T: no-op -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v) { - return v; -} - -// u32/i32/f32: - -// N=2 -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v10) { - // NOTE: AltivecVsum2sws cannot be used here as AltivecVsum2sws - // computes the signed saturated sum of the lanes. - return v10 + Shuffle2301(v10); -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v10) { - return Min(v10, Shuffle2301(v10)); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v10) { - return Max(v10, Shuffle2301(v10)); -} - -// N=4 (full) -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v3210) { - // NOTE: AltivecVsumsws cannot be used here as AltivecVsumsws - // computes the signed saturated sum of the lanes. - const Vec128 v1032 = Shuffle1032(v3210); - const Vec128 v31_20_31_20 = v3210 + v1032; - const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); - return v20_31_20_31 + v31_20_31_20; -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v3210) { - const Vec128 v1032 = Shuffle1032(v3210); - const Vec128 v31_20_31_20 = Min(v3210, v1032); - const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); - return Min(v20_31_20_31, v31_20_31_20); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v3210) { - const Vec128 v1032 = Shuffle1032(v3210); - const Vec128 v31_20_31_20 = Max(v3210, v1032); - const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); - return Max(v20_31_20_31, v31_20_31_20); -} - -// u64/i64/f64: - -// N=2 (full) -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v10) { - const Vec128 v01 = Shuffle01(v10); - return v10 + v01; -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v10) { - const Vec128 v01 = Shuffle01(v10); - return Min(v10, v01); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v10) { - const Vec128 v01 = Shuffle01(v10); - return Max(v10, v01); -} - -// Casts nominally int32_t result to D. 
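A scalar model of the 4-lane SumOfLanes pattern above, to show why the total ends up in every lane; SumOfLanes4 is an illustrative helper, not the Highway op. The real code rotates (Shuffle0321) rather than swapping adjacent lanes in the second step, but because the first step already duplicates the partial sums, the result is the same.

#include <array>
#include <cstdio>

// Two swap+add steps leave the total in every lane.
std::array<int, 4> SumOfLanes4(std::array<int, 4> v) {
  std::array<int, 4> s1;  // add the lane 2 positions away (Shuffle1032 step)
  for (int i = 0; i < 4; ++i) s1[i] = v[i] + v[i ^ 2];
  std::array<int, 4> s2;  // add the neighboring partial sum
  for (int i = 0; i < 4; ++i) s2[i] = s1[i] + s1[i ^ 1];
  return s2;  // every lane now holds v0+v1+v2+v3
}

int main() {
  auto r = SumOfLanes4({1, 2, 3, 4});
  printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);  // 10 10 10 10
}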
-template -HWY_INLINE VFromD AltivecVsum4shs(D d, __vector signed short a, - __vector signed int b) { - const Repartition di32; -#ifdef __OPTIMIZE__ - if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { - const int64_t sum0 = static_cast(a[0]) + - static_cast(a[1]) + - static_cast(b[0]); - const int64_t sum1 = static_cast(a[2]) + - static_cast(a[3]) + - static_cast(b[1]); - const int64_t sum2 = static_cast(a[4]) + - static_cast(a[5]) + - static_cast(b[2]); - const int64_t sum3 = static_cast(a[6]) + - static_cast(a[7]) + - static_cast(b[3]); - const int32_t sign0 = static_cast(sum0 >> 63); - const int32_t sign1 = static_cast(sum1 >> 63); - const int32_t sign2 = static_cast(sum2 >> 63); - const int32_t sign3 = static_cast(sum3 >> 63); - using Raw = typename detail::Raw128::type; - return BitCast( - d, - VFromD{Raw{ - (sign0 == (sum0 >> 31)) ? static_cast(sum0) - : static_cast(sign0 ^ 0x7FFFFFFF), - (sign1 == (sum1 >> 31)) ? static_cast(sum1) - : static_cast(sign1 ^ 0x7FFFFFFF), - (sign2 == (sum2 >> 31)) ? static_cast(sum2) - : static_cast(sign2 ^ 0x7FFFFFFF), - (sign3 == (sum3 >> 31)) - ? static_cast(sum3) - : static_cast(sign3 ^ 0x7FFFFFFF)}}); - } else // NOLINT -#endif - { - return BitCast(d, VFromD{vec_vsum4shs(a, b)}); - } -} - -// Casts nominally int32_t result to D. -template -HWY_INLINE VFromD AltivecVsumsws(D d, __vector signed int a, - __vector signed int b) { - const Repartition di32; -#ifdef __OPTIMIZE__ - constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3; - if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) { - const int64_t sum = - static_cast(a[0]) + static_cast(a[1]) + - static_cast(a[2]) + static_cast(a[3]) + - static_cast(b[kDestLaneOffset]); - const int32_t sign = static_cast(sum >> 63); -#if HWY_IS_LITTLE_ENDIAN - return BitCast( - d, VFromD{(__vector signed int){ - (sign == (sum >> 31)) ? static_cast(sum) - : static_cast(sign ^ 0x7FFFFFFF), - 0, 0, 0}}); -#else - return BitCast(d, VFromD{(__vector signed int){ - 0, 0, 0, - (sign == (sum >> 31)) - ? static_cast(sum) - : static_cast(sign ^ 0x7FFFFFFF)}}); -#endif - } else // NOLINT -#endif - { - __vector signed int sum; - - // Inline assembly is used for vsumsws to avoid unnecessary shuffling - // on little-endian PowerPC targets as the result of the vsumsws - // instruction will already be in the correct lanes on little-endian - // PowerPC targets. - __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b)); - - return BitCast(d, VFromD{sum}); - } -} - -template -HWY_INLINE Vec128 AltivecU16SumsOf2(Vec128 v) { - const RebindToSigned> di16; - const RepartitionToWide di32; - return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw, - Set(di32, 65536).raw); -} - -HWY_API Vec32 SumOfLanes(Vec32 v) { - constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN; - DFromV du16; - return Broadcast(BitCast(du16, AltivecU16SumsOf2(v))); -} - -HWY_API Vec64 SumOfLanes(Vec64 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; - const Full64 du16; - const auto zero = Zero(Full128()); - return Broadcast( - AltivecVsum2sws(du16, AltivecU16SumsOf2(v).raw, zero.raw)); -} - -HWY_API Vec128 SumOfLanes(Vec128 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 
0 : 7; - const Full128 du16; - const auto zero = Zero(Full128()); - return Broadcast( - AltivecVsumsws(du16, AltivecU16SumsOf2(v).raw, zero.raw)); -} - -HWY_API Vec32 SumOfLanes(Vec32 v) { - constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN; - const Full32 di16; - const auto zero = Zero(Full128()); - return Broadcast(AltivecVsum4shs(di16, v.raw, zero.raw)); -} - -HWY_API Vec64 SumOfLanes(Vec64 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; - const Full128 di32; - const Full64 di16; - const auto zero = Zero(di32); - return Broadcast(AltivecVsum2sws( - di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw)); -} - -HWY_API Vec128 SumOfLanes(Vec128 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; - const Full128 di16; - const Full128 di32; - const auto zero = Zero(di32); - return Broadcast(AltivecVsumsws( - di16, AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw)); -} - -// u8, N=2, N=4, N=8, N=16: -HWY_API Vec16 SumOfLanes(Vec16 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; - const Full16 du8; - const Full16 du16; - const Twice dt_u8; - const Twice dt_u16; - const Full128 du32; - return LowerHalf(Broadcast(AltivecVsum4ubs( - dt_u8, BitCast(dt_u8, Combine(dt_u16, Zero(du16), BitCast(du16, v))).raw, - Zero(du32).raw))); -} - -HWY_API Vec32 SumOfLanes(Vec32 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; - const Full128 du32; - const Full32 du8; - return Broadcast(AltivecVsum4ubs(du8, v.raw, Zero(du32).raw)); -} - -HWY_API Vec64 SumOfLanes(Vec64 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; - const Full64 du8; - return Broadcast(BitCast(du8, SumsOf8(v))); -} - -HWY_API Vec128 SumOfLanes(Vec128 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15; - - const Full128 du32; - const RebindToSigned di32; - const Full128 du8; - const Vec128 zero = Zero(du32); - return Broadcast( - AltivecVsumsws(du8, AltivecVsum4ubs(di32, v.raw, zero.raw).raw, - BitCast(di32, zero).raw)); -} - -HWY_API Vec16 SumOfLanes(Vec16 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; - - const Full128 du16; - const Repartition di32; - const Repartition di8; - const Vec128 zzvv = BitCast( - di8, InterleaveLower(BitCast(du16, Vec128{v.raw}), Zero(du16))); - return Vec16{ - Broadcast(AltivecVsum4sbs(di8, zzvv.raw, Zero(di32).raw)) - .raw}; -} - -HWY_API Vec32 SumOfLanes(Vec32 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; - const Full32 di8; - const Vec128 zero = Zero(Full128()); - return Broadcast(AltivecVsum4sbs(di8, v.raw, zero.raw)); -} - -HWY_API Vec64 SumOfLanes(Vec64 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; - const Full128 di32; - const Vec128 zero = Zero(di32); - const Full64 di8; - return Broadcast(AltivecVsum2sws( - di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw)); -} - -HWY_API Vec128 SumOfLanes(Vec128 v) { - constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 
0 : 15; - const Full128 di8; - const Full128 di32; - const Vec128 zero = Zero(di32); - return Broadcast(AltivecVsumsws( - di8, AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw)); -} - -template -HWY_API Vec128 MaxOfLanes(Vec128 v) { - const DFromV d; - const RepartitionToWide d16; - const RepartitionToWide d32; - Vec128 vm = Max(v, Reverse2(d, v)); - vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); - vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); - if (N > 8) { - const RepartitionToWide d64; - vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); - } - return vm; -} - -template -HWY_API Vec128 MinOfLanes(Vec128 v) { - const DFromV d; - const RepartitionToWide d16; - const RepartitionToWide d32; - Vec128 vm = Min(v, Reverse2(d, v)); - vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); - vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); - if (N > 8) { - const RepartitionToWide d64; - vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); - } - return vm; -} - -template -HWY_API Vec128 MaxOfLanes(Vec128 v) { - const DFromV d; - const RepartitionToWide d16; - const RepartitionToWide d32; - Vec128 vm = Max(v, Reverse2(d, v)); - vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); - vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); - if (N > 8) { - const RepartitionToWide d64; - vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); - } - return vm; -} - -template -HWY_API Vec128 MinOfLanes(Vec128 v) { - const DFromV d; - const RepartitionToWide d16; - const RepartitionToWide d32; - Vec128 vm = Min(v, Reverse2(d, v)); - vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); - vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); - if (N > 8) { - const RepartitionToWide d64; - vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); - } - return vm; -} - -template -HWY_API Vec128 MinOfLanes(Vec128 v) { - const Simd d; - const RepartitionToWide d32; -#if HWY_IS_LITTLE_ENDIAN - const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); - const auto odd = ShiftRight<16>(BitCast(d32, v)); -#else - const auto even = ShiftRight<16>(BitCast(d32, v)); - const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF)); -#endif - const auto min = MinOfLanes(Min(even, odd)); - // Also broadcast into odd lanes on little-endian and into even lanes - // on big-endian - return Vec128{vec_pack(min.raw, min.raw)}; -} -template -HWY_API Vec128 MinOfLanes(Vec128 v) { - const Simd d; - const RepartitionToWide d32; - // Sign-extend -#if HWY_IS_LITTLE_ENDIAN - const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); - const auto odd = ShiftRight<16>(BitCast(d32, v)); -#else - const auto even = ShiftRight<16>(BitCast(d32, v)); - const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); -#endif - const auto min = MinOfLanes(Min(even, odd)); - // Also broadcast into odd lanes on little-endian and into even lanes - // on big-endian - return Vec128{vec_pack(min.raw, min.raw)}; -} - -template -HWY_API Vec128 MaxOfLanes(Vec128 v) { - const Simd d; - const RepartitionToWide d32; -#if HWY_IS_LITTLE_ENDIAN - const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); - const auto odd = ShiftRight<16>(BitCast(d32, v)); -#else - const auto even = ShiftRight<16>(BitCast(d32, v)); - const auto odd = And(BitCast(d32, v), Set(d32, 0xFFFF)); -#endif - const auto max = MaxOfLanes(Max(even, odd)); - // Also broadcast into odd lanes. 
- return Vec128{vec_pack(max.raw, max.raw)}; -} -template -HWY_API Vec128 MaxOfLanes(Vec128 v) { - const Simd d; - const RepartitionToWide d32; - // Sign-extend -#if HWY_IS_LITTLE_ENDIAN - const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); - const auto odd = ShiftRight<16>(BitCast(d32, v)); -#else - const auto even = ShiftRight<16>(BitCast(d32, v)); - const auto odd = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); -#endif - const auto max = MaxOfLanes(Max(even, odd)); - // Also broadcast into odd lanes on little-endian and into even lanes - // on big-endian - return Vec128{vec_pack(max.raw, max.raw)}; -} - -} // namespace detail - -// Supported for u/i/f 32/64. Returns the same value in each lane. -template -HWY_API VFromD SumOfLanes(D /* tag */, VFromD v) { - return detail::SumOfLanes(v); -} -template -HWY_API TFromD ReduceSum(D /* tag */, VFromD v) { - return GetLane(detail::SumOfLanes(v)); -} -template -HWY_API VFromD MinOfLanes(D /* tag */, VFromD v) { - return detail::MinOfLanes(v); -} -template -HWY_API VFromD MaxOfLanes(D /* tag */, VFromD v) { - return detail::MaxOfLanes(v); -} - -// ------------------------------ Lt128 - -namespace detail { - -// Returns vector-mask for Lt128. -template > -HWY_INLINE V Lt128Vec(D d, V a, V b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); -#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) - (void)d; - using VU64 = __vector unsigned long long; - using VU128 = __vector unsigned __int128; -#if HWY_IS_LITTLE_ENDIAN - const VU128 a_u128 = reinterpret_cast(a.raw); - const VU128 b_u128 = reinterpret_cast(b.raw); -#else - // NOTE: Need to swap the halves of both a and b on big-endian targets - // as the upper 64 bits of a and b are in lane 1 and the lower 64 bits - // of a and b are in lane 0 whereas the vec_cmplt operation below expects - // the upper 64 bits in lane 0 and the lower 64 bits in lane 1 on - // big-endian PPC targets. - const VU128 a_u128 = reinterpret_cast(vec_sld(a.raw, a.raw, 8)); - const VU128 b_u128 = reinterpret_cast(vec_sld(b.raw, b.raw, 8)); -#endif - return V{reinterpret_cast(vec_cmplt(a_u128, b_u128))}; -#else // !HWY_PPC_HAVE_10 - // Truth table of Eq and Lt for Hi and Lo u64. - // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) - // =H =L cH cL | out = cH | (=H & cL) - // 0 0 0 0 | 0 - // 0 0 0 1 | 0 - // 0 0 1 0 | 1 - // 0 0 1 1 | 1 - // 0 1 0 0 | 0 - // 0 1 0 1 | 0 - // 0 1 1 0 | 1 - // 1 0 0 0 | 0 - // 1 0 0 1 | 1 - // 1 1 0 0 | 0 - const auto eqHL = Eq(a, b); - const V ltHL = VecFromMask(d, Lt(a, b)); - const V ltLX = ShiftLeftLanes<1>(ltHL); - const V vecHx = IfThenElse(eqHL, ltLX, ltHL); - return InterleaveUpper(d, vecHx, vecHx); -#endif -} - -// Returns vector-mask for Eq128. 
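The truth table in the Lt128Vec fallback above reduces to out = cH | (=H & cL). A scalar check of that formula, with Lt128Scalar as an illustrative stand-in for the per-block vector computation:

#include <cstdint>
#include <cstdio>

// 128-bit unsigned "a < b" from 64-bit halves: ltH | (eqH & ltL).
bool Lt128Scalar(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
  const bool eqH = aH == bH;
  const bool ltH = aH < bH;
  const bool ltL = aL < bL;
  return ltH || (eqH && ltL);
}

int main() {
  printf("%d\n", Lt128Scalar(1, 0, 1, 5));      // 1: high halves equal, 0 < 5
  printf("%d\n", Lt128Scalar(2, 0, 1, ~0ull));  // 0: high half already greater
}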
-template > -HWY_INLINE V Eq128Vec(D d, V a, V b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); -#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) - (void)d; - using VU64 = __vector unsigned long long; - using VU128 = __vector unsigned __int128; - return V{reinterpret_cast(vec_cmpeq(reinterpret_cast(a.raw), - reinterpret_cast(b.raw)))}; -#else - const auto eqHL = VecFromMask(d, Eq(a, b)); - const auto eqLH = Reverse2(d, eqHL); - return And(eqHL, eqLH); -#endif -} - -template > -HWY_INLINE V Ne128Vec(D d, V a, V b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); -#if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) - (void)d; - using VU64 = __vector unsigned long long; - using VU128 = __vector unsigned __int128; - return V{reinterpret_cast(vec_cmpne(reinterpret_cast(a.raw), - reinterpret_cast(b.raw)))}; -#else - const auto neHL = VecFromMask(d, Ne(a, b)); - const auto neLH = Reverse2(d, neHL); - return Or(neHL, neLH); -#endif -} - -template > -HWY_INLINE V Lt128UpperVec(D d, V a, V b) { - const V ltHL = VecFromMask(d, Lt(a, b)); - return InterleaveUpper(d, ltHL, ltHL); -} - -template > -HWY_INLINE V Eq128UpperVec(D d, V a, V b) { - const V eqHL = VecFromMask(d, Eq(a, b)); - return InterleaveUpper(d, eqHL, eqHL); -} - -template > -HWY_INLINE V Ne128UpperVec(D d, V a, V b) { - const V neHL = VecFromMask(d, Ne(a, b)); - return InterleaveUpper(d, neHL, neHL); -} - -} // namespace detail - -template > -HWY_API MFromD Lt128(D d, V a, V b) { - return MaskFromVec(detail::Lt128Vec(d, a, b)); -} - -template > -HWY_API MFromD Eq128(D d, V a, V b) { - return MaskFromVec(detail::Eq128Vec(d, a, b)); -} - -template > -HWY_API MFromD Ne128(D d, V a, V b) { - return MaskFromVec(detail::Ne128Vec(d, a, b)); -} - -template > -HWY_API MFromD Lt128Upper(D d, V a, V b) { - return MaskFromVec(detail::Lt128UpperVec(d, a, b)); -} - -template > -HWY_API MFromD Eq128Upper(D d, V a, V b) { - return MaskFromVec(detail::Eq128UpperVec(d, a, b)); -} - -template > -HWY_API MFromD Ne128Upper(D d, V a, V b) { - return MaskFromVec(detail::Ne128UpperVec(d, a, b)); -} - -// ------------------------------ Min128, Max128 (Lt128) - -// Avoids the extra MaskFromVec in Lt128. 
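A scalar model of the Min128 that follows: the 128-bit comparison picks one operand as a whole, and IfVecThenElse then copies both 64-bit halves of the winner. U128 and Min128Scalar below are illustrative stand-ins, not the Highway types.

#include <cstdint>
#include <cstdio>
#include <utility>

using U128 = std::pair<uint64_t, uint64_t>;  // {high, low}

// Select the smaller 128-bit value as a whole block.
U128 Min128Scalar(U128 a, U128 b) {
  const bool a_lt_b =
      a.first < b.first || (a.first == b.first && a.second < b.second);
  return a_lt_b ? a : b;
}

int main() {
  const U128 m = Min128Scalar({1, 7}, {1, 3});
  printf("%llu %llu\n", (unsigned long long)m.first,
         (unsigned long long)m.second);  // 1 3
}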
-template > -HWY_API V Min128(D d, const V a, const V b) { - return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); -} - -template > -HWY_API V Max128(D d, const V a, const V b) { - return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); -} - -template > -HWY_API V Min128Upper(D d, const V a, const V b) { - return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b); -} - -template > -HWY_API V Max128Upper(D d, const V a, const V b) { - return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b); -} - -// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex - -#ifdef HWY_NATIVE_LEADING_ZERO_COUNT -#undef HWY_NATIVE_LEADING_ZERO_COUNT -#else -#define HWY_NATIVE_LEADING_ZERO_COUNT -#endif - -template -HWY_API V LeadingZeroCount(V v) { - return V{vec_cntlz(v.raw)}; -} - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - using T = TFromD; - return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); -} - -#if HWY_PPC_HAVE_9 -template -HWY_API V TrailingZeroCount(V v) { -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 - return V{vec_vctz(v.raw)}; -#else - return V{vec_cnttz(v.raw)}; -#endif -} -#else -template -HWY_API V TrailingZeroCount(V v) { - const DFromV d; - const RebindToSigned di; - using TI = TFromD; - - const auto vi = BitCast(di, v); - const auto lowest_bit = And(vi, Neg(vi)); - constexpr TI kNumOfBitsInT{sizeof(TI) * 8}; - const auto bit_idx = HighestSetBitIndex(lowest_bit); - return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)), - Set(di, kNumOfBitsInT), bit_idx)); -} -#endif - -#undef HWY_PPC_HAVE_9 -#undef HWY_PPC_HAVE_10 - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/rvv-inl.h b/deps/highway/include/hwy/ops/rvv-inl.h deleted file mode 100644 index c5b76db8..00000000 --- a/deps/highway/include/hwy/ops/rvv-inl.h +++ /dev/null @@ -1,4887 +0,0 @@ -// Copyright 2021 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// RISC-V V vectors (length not known at compile time). -// External include guard in highway.h - see comment there. - -#include - -#include "hwy/ops/shared-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -// Support for vfloat16m*_t and PromoteTo/DemoteTo. -#ifdef __riscv_zvfhmin -#define HWY_RVV_HAVE_F16C 1 -#else -#define HWY_RVV_HAVE_F16C 0 -#endif - -template -struct DFromV_t {}; // specialized in macros -template -using DFromV = typename DFromV_t>::type; - -template -using TFromV = TFromD>; - -template -constexpr size_t MLenFromD(Simd /* tag */) { - // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower - // argument enables fractional LMUL < 1. Limit to 64 because that is the - // largest value for which vbool##_t are defined. 
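A worked check of the MLEN formula in the return statement below, under the assumption that detail::ScaleByPower(8, kPow2) evaluates to 8 * 2^kPow2 (i.e. LMUL times 8), which the surrounding comment implies; MLen is an illustrative stand-in for MLenFromD.

#include <cstdint>

// MLEN = SEW / LMUL, capped at 64. lmul_times_8 stands in for
// detail::ScaleByPower(8, kPow2).
constexpr size_t MLen(size_t sizeof_t, size_t lmul_times_8) {
  return (sizeof_t * 8 * 8 / lmul_times_8) < 64
             ? sizeof_t * 8 * 8 / lmul_times_8
             : 64;
}

static_assert(MLen(sizeof(uint8_t), 8) == 8, "SEW=8, LMUL=1 -> vbool8_t");
static_assert(MLen(sizeof(uint16_t), 4) == 32, "SEW=16, LMUL=1/2 -> vbool32_t");
static_assert(MLen(sizeof(uint64_t), 64) == 8, "SEW=64, LMUL=8 -> vbool8_t");
static_assert(MLen(sizeof(uint16_t), 1) == 64,
              "SEW=16 at 'virtual' LMUL=1/8: 128 capped to 64");

int main() { return 0; }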
- return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2)); -} - -namespace detail { - -template -class AdjustSimdTagToMinVecPow2_t {}; - -template -class AdjustSimdTagToMinVecPow2_t> { - private: - using D = Simd; - static constexpr int kMinVecPow2 = - -3 + static_cast(FloorLog2(sizeof(T))); - static constexpr size_t kNumMaxLanes = HWY_MAX_LANES_D(D); - static constexpr int kNewPow2 = HWY_MAX(kPow2, kMinVecPow2); - static constexpr size_t kNewN = D::template NewN(); - - public: - using type = Simd; -}; - -template -using AdjustSimdTagToMinVecPow2 = - typename AdjustSimdTagToMinVecPow2_t>::type; - -} // namespace detail - -// ================================================== MACROS - -// Generate specializations and function definitions using X macros. Although -// harder to read and debug, writing everything manually is too bulky. - -namespace detail { // for code folding - -// For all mask sizes MLEN: (1/Nth of a register, one bit per lane) -// The first three arguments are arbitrary SEW, LMUL, SHIFT such that -// SEW >> SHIFT = MLEN. -#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \ - X_MACRO(64, 0, 64, NAME, OP) \ - X_MACRO(32, 0, 32, NAME, OP) \ - X_MACRO(16, 0, 16, NAME, OP) \ - X_MACRO(8, 0, 8, NAME, OP) \ - X_MACRO(8, 1, 4, NAME, OP) \ - X_MACRO(8, 2, 2, NAME, OP) \ - X_MACRO(8, 3, 1, NAME, OP) - -// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows -// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or -// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix. -// -// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same -// reason, also pass the double-width and half SEW and LMUL (suffixed D and H, -// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8). 
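A toy example of the X-macro technique that the HWY_RVV_FOREACH_* machinery below relies on, reduced to a few type overloads; MY_FOREACH and MY_DEFINE_TWICE are made-up names, not Highway macros. One list of entries stamps out an overload set, so each operation only has to be written once.

#include <cstdio>

// The list of (type, char) entries; extra arguments ride along like
// BASE/CHAR/SEW do in HWY_RVV_FOREACH.
#define MY_FOREACH(X_MACRO, NAME) \
  X_MACRO(int, i, NAME)           \
  X_MACRO(float, f, NAME)         \
  X_MACRO(double, d, NAME)

#define MY_DEFINE_TWICE(TYPE, CHAR, NAME) \
  TYPE NAME(TYPE v) { return v + v; }

MY_FOREACH(MY_DEFINE_TWICE, Twice)  // defines Twice(int/float/double)

int main() {
  printf("%d %.1f\n", Twice(21), Twice(1.5));  // 42 3.0
}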
-// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP - -// LMULS = _TRUNC: truncatable (not the smallest LMUL) -#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP) - -#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP) - -#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP) - -#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) - -// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH. 
-#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP) - -#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP) - -#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP) - -#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) - -// LMULS = _LE2: <= 2 -#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) - -#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) - -#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) - -#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) - -// LMULS = _EXT: not the largest LMUL -#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) - -#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, 
m2, 2, /*MLEN=*/4, NAME, OP) - -#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) - -#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) - -// LMULS = _ALL (2^MinPow2() <= LMUL <= 8) -#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP) - -#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP) - -#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP) - -#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP) - -// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least -// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even -// though RISC-V LMUL must be at least SEW/64 (notice that this rules out -// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to -// one less than should be supported, with all other parameters (vector type -// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes() -// returns half of what it usually would. -// -// Notice that we can only add overloads whenever there is a D argument: those -// are unique with respect to non-virtual-LMUL overloads because their kPow2 -// template argument differs. Otherwise, there is no actual vuint64mf2_t, and -// defining another overload with the same LMUL would be an error. Thus we have -// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is -// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most -// functions that take a D. 
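A rough sketch of the 'virtual LMUL' lane count described above, only for u64 and assuming the vector length VLEN is known; the real implementation queries vlenb and uses ScaleByPower, so VirtualLanesU64 is purely illustrative. There is no vuint64mf2_t, so the kPow2 = -1 overloads reuse vuint64m1_t but report half the lanes.

#include <cstddef>
#include <cstdio>

// Lanes for a u64 tag: start from the LMUL=1 count and halve once per step
// below the hardware minimum kPow2 of 0.
size_t VirtualLanesU64(size_t vlen_bits, int kPow2) {
  size_t lanes = vlen_bits / 64;               // lanes of vuint64m1_t
  for (int p = 0; p > kPow2; --p) lanes /= 2;  // 'virtual' halving
  return lanes;
}

int main() {
  printf("%zu\n", VirtualLanesU64(128, 0));   // 2: real LMUL=1
  printf("%zu\n", VirtualLanesU64(128, -1));  // 1: virtual LMUL=1/2
}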
- -#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP) - -#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP) - -#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP) - -// ALL + VIRT -#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -// LE2 + VIRT -#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -// EXT + VIRT -#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -// DEMOTE + VIRT -#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) - -#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \ - HWY_RVV_FOREACH_64_VIRT(X_MACRO, 
BASE, CHAR, NAME, OP) - -// SEW for unsigned: -#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP) -#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP) -#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP) -#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP) - -// SEW for signed: -#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP) -#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP) -#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP) -#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP) - -// SEW for float: - -// Used for conversion instructions if HWY_RVV_HAVE_F16C. -#define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP) - -#if HWY_HAVE_FLOAT16 -// Full support for f16 in all ops -#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS) -#else -#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) -#endif -#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP) -#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \ - HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP) - -// Commonly used type/SEW groups: -#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) - -// For all combinations of SEW: -#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) 
\ - HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) - -// Commonly used type categories: -#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) - -#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \ - HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) - -// Assemble types for use in x-macros -#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t -#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd -#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t -#define HWY_RVV_TUP(BASE, SEW, LMUL, TUP) v##BASE##SEW##LMUL##x##TUP##_t -#define HWY_RVV_M(MLEN) vbool##MLEN##_t - -} // namespace detail - -// Until we have full intrinsic support for fractional LMUL, mixed-precision -// code can use LMUL 1..8 (adequate unless they need many registers). -#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template <> \ - struct DFromV_t { \ - using Lane = HWY_RVV_T(BASE, SEW); \ - using type = ScalableTag; \ - }; - -HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL) -#undef HWY_SPECIALIZE - -// ------------------------------ Lanes - -// WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL! -#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \ - constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \ - constexpr size_t kCap = MaxLanes(d); \ - /* If no cap, avoid generating a constant by using VLMAX. */ \ - return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \ - : __riscv_vsetvl_e##SEW##LMUL(kCap); \ - } - -#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \ - constexpr size_t kCap = MaxLanes(d); \ - /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */ \ - /* vsetvl may or may not be correct, so do it ourselves. */ \ - const size_t actual = \ - detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT); \ - return HWY_MIN(actual, kCap); \ - } - -HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL) -HWY_RVV_FOREACH(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT) -// If not already defined via HWY_RVV_FOREACH, define the overloads because -// they do not require any new instruction. -#if !HWY_HAVE_FLOAT16 -HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL) -HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_LANES_VIRT, Lanes, lenb, _VIRT) -#endif -#undef HWY_RVV_LANES -#undef HWY_RVV_LANES_VIRT - -template -HWY_API size_t Lanes(Simd /* tag*/) { - return Lanes(Simd()); -} - -// ------------------------------ Common x-macros - -// Last argument to most intrinsics. Use when the op has no d arg of its own, -// which means there is no user-specified cap. -#define HWY_RVV_AVL(SEW, SHIFT) \ - Lanes(ScalableTag()) - -// vector = f(vector), e.g. 
Not -#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \ - } - -// vector = f(vector, scalar), e.g. detail::AddS -#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \ - } - -// vector = f(vector, vector), e.g. Add -#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(a, b, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -// mask = f(mask) -#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \ - return __riscv_vm##OP##_m_b##MLEN(m, ~0ull); \ - } - -// ================================================== INIT - -// ------------------------------ Set - -#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \ - } - -HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT) -HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT) -#undef HWY_RVV_SET - -// Treat bfloat16_t as int16_t (using the previously defined Set overloads); -// required for Zero and VFromD. -template -decltype(Set(Simd(), 0)) Set(Simd d, - bfloat16_t arg) { - return Set(RebindToSigned(), arg.bits); -} -#if !HWY_HAVE_FLOAT16 // Otherwise already defined above. -// WARNING: returns a different type than emulated bfloat16_t so that we can -// implement PromoteTo overloads for both bfloat16_t and float16_t, and also -// provide a Neg(float16_t) overload that coexists with Neg(int16_t). -template -decltype(Set(Simd(), 0)) Set(Simd d, - float16_t arg) { - uint16_t bits; - CopySameSize(&arg, &bits); - return Set(RebindToUnsigned(), bits); -} -#endif - -template -using VFromD = decltype(Set(D(), TFromD())); - -// ------------------------------ Zero - -template -HWY_API VFromD Zero(D d) { - // Cast to support bfloat16_t. - const RebindToUnsigned du; - return BitCast(d, Set(du, 0)); -} - -// ------------------------------ Undefined - -// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized -// by it gives unpredictable results. It should only be used for maskoff, so -// keep it internal. For the Highway op, just use Zero (single instruction). -namespace detail { -#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \ - } - -HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL) -#undef HWY_RVV_UNDEFINED -} // namespace detail - -template -HWY_API VFromD Undefined(D d) { - return Zero(d); -} - -// ------------------------------ BitCast - -namespace detail { - -// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.) 
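For readers new to LMUL register grouping, here is a minimal sketch of what the Trunc/Ext helpers defined next expand to for one concrete type (u8, LMUL 2 <-> 1). The intrinsic names follow the __riscv_vlmul_* pattern that the x-macros concatenate; the wrapper function names are illustrative only and not part of this header.

#include <riscv_vector.h>

// Illustrative only: detail::Trunc keeps the lower half of a register group,
// detail::Ext reinterprets a group as the lower half of a doubled group
// (whose upper half is unspecified). Neither takes an AVL.
vuint8m1_t TruncExampleU8(vuint8m2_t v) {
  return __riscv_vlmul_trunc_v_u8m2_u8m1(v);
}
vuint8m2_t ExtExampleU8(vuint8m1_t v) {
  return __riscv_vlmul_ext_v_u8m1_u8m2(v);
}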
-#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \ - v); /* no AVL */ \ - } -HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC) -#undef HWY_RVV_TRUNC - -// Doubles LMUL to `d2` (the arg is only necessary for _VIRT). -#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMULD) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \ - HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD( \ - v); /* no AVL */ \ - } -HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT) -#undef HWY_RVV_EXT - -// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is -// the same as the actual input type. -#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \ - HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return v; \ - } -HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT) -#undef HWY_RVV_EXT_VIRT - -#if !HWY_HAVE_FLOAT16 -template -VFromD Ext(D d, VFromD> v) { - const RebindToUnsigned du; - const Half duh; - return BitCast(d, Ext(du, BitCast(duh, v))); -} -#endif - -template -VFromD Ext(D d, VFromD> v) { - const RebindToUnsigned du; - const Half duh; - return BitCast(d, Ext(du, BitCast(duh, v))); -} - -// For BitCastToByte, the D arg is only to prevent duplicate definitions caused -// by _ALL_VIRT. - -// There is no reinterpret from u8 <-> u8, so just return. -#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ - vuint8##LMUL##_t v) { \ - return v; \ - } \ - template \ - HWY_API vuint8##LMUL##_t BitCastFromByte( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ - return v; \ - } - -// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two). -#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ - vint8##LMUL##_t v) { \ - return __riscv_vreinterpret_v_i8##LMUL##_u8##LMUL(v); \ - } \ - template \ - HWY_API vint8##LMUL##_t BitCastFromByte( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ - return __riscv_vreinterpret_v_u8##LMUL##_i8##LMUL(v); \ - } - -// Separate u/i because clang only provides signed <-> unsigned reinterpret for -// the same SEW. 
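As a concrete illustration of that two-step path, here is a minimal sketch of what HWY_RVV_CAST_IF (defined next) generates for int16 with LMUL 1; the wrapper name is illustrative only, not part of the header.

#include <riscv_vector.h>

// Illustrative only: i16 -> u16 changes signedness at the same SEW, then
// u16 -> u8 changes SEW at the same LMUL; no AVL is involved.
vuint8m1_t BytesFromI16Example(vint16m1_t v) {
  const vuint16m1_t as_u16 = __riscv_vreinterpret_v_i16m1_u16m1(v);
  return __riscv_vreinterpret_v_u16m1_u8m1(as_u16);
}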
-#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ - HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ - return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \ - } - -// Signed/Float: first cast to/from unsigned -#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API vuint8##LMUL##_t BitCastToByte(Simd /* d */, \ - HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ - __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \ - return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \ - __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \ - } - -// Additional versions for virtual LMUL using LMULH for byte vectors. -#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API vuint8##LMULH##_t BitCastToByte(Simd /* d */, \ - HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return detail::Trunc(__riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \ - HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \ - const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \ - return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \ - } - -// Signed/Float: first cast to/from unsigned -#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API vuint8##LMULH##_t BitCastToByte(Simd /* d */, \ - HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return detail::Trunc(__riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \ - __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \ - HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \ - const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \ - return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \ - __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \ - } - -HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL) -HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL) -HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL) -HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL) -HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT) -HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT) -HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT) -#if HWY_HAVE_FLOAT16 // HWY_RVV_FOREACH_F already covered float16_ -#elif HWY_RVV_HAVE_F16C // zvfhmin provides reinterpret* intrinsics: -HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_IF, _, reinterpret, _ALL) -HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT) -#else -template -HWY_INLINE VFromD> BitCastFromByte( - Simd /* d */, VFromD> v) { - return BitCastFromByte(Simd(), v); -} -#endif - -#undef HWY_RVV_CAST_U8 -#undef HWY_RVV_CAST_I8 -#undef HWY_RVV_CAST_U -#undef HWY_RVV_CAST_IF 
-#undef HWY_RVV_CAST_VIRT_U -#undef HWY_RVV_CAST_VIRT_IF - -template -HWY_INLINE VFromD> BitCastFromByte( - Simd /* d */, VFromD> v) { - return BitCastFromByte(Simd(), v); -} - -} // namespace detail - -template -HWY_API VFromD BitCast(D d, FromV v) { - return detail::BitCastFromByte(d, detail::BitCastToByte(d, v)); -} - -// ------------------------------ Iota - -namespace detail { - -#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \ - } - -// For i8 lanes, this may well wrap around. Unsigned only is less error-prone. -HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT) -#undef HWY_RVV_IOTA - -// Used by Expand. -#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) mask) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL(mask, Lanes(d)); \ - } - -HWY_RVV_FOREACH_U(HWY_RVV_MASKED_IOTA, MaskedIota, iota_m, _ALL_VIRT) -#undef HWY_RVV_MASKED_IOTA - -} // namespace detail - -// ================================================== LOGICAL - -// ------------------------------ Not - -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL) - -template -HWY_API V Not(const V v) { - using DF = DFromV; - using DU = RebindToUnsigned; - return BitCast(DF(), Not(BitCast(DU(), v))); -} - -// ------------------------------ And - -// Non-vector version (ideally immediate) for use with Iota0 -namespace detail { -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL) -} // namespace detail - -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL) - -template -HWY_API V And(const V a, const V b) { - using DF = DFromV; - using DU = RebindToUnsigned; - return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b))); -} - -// ------------------------------ Or - -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL) - -template -HWY_API V Or(const V a, const V b) { - using DF = DFromV; - using DU = RebindToUnsigned; - return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b))); -} - -// ------------------------------ Xor - -// Non-vector version (ideally immediate) for use with Iota0 -namespace detail { -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL) -} // namespace detail - -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL) - -template -HWY_API V Xor(const V a, const V b) { - using DF = DFromV; - using DU = RebindToUnsigned; - return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b))); -} - -// ------------------------------ AndNot -template -HWY_API V AndNot(const V not_a, const V b) { - return And(Not(not_a), b); -} - -// ------------------------------ Xor3 -template -HWY_API V Xor3(V x1, V x2, V x3) { - return Xor(x1, Xor(x2, x3)); -} - -// ------------------------------ Or3 -template -HWY_API V Or3(V o1, V o2, V o3) { - return Or(o1, Or(o2, o3)); -} - -// ------------------------------ OrAnd -template -HWY_API V OrAnd(const V o, const V a1, const V a2) { - return Or(o, And(a1, a2)); -} - -// ------------------------------ CopySign - -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL) - -template -HWY_API V CopySignToAbs(const V abs, const V sign) { - // RVV can also handle abs < 0, so no extra action needed. 
- return CopySign(abs, sign); -} - -// ================================================== ARITHMETIC - -// Per-target flags to prevent generic_ops-inl.h defining Add etc. -#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS -#undef HWY_NATIVE_OPERATOR_REPLACEMENTS -#else -#define HWY_NATIVE_OPERATOR_REPLACEMENTS -#endif - -// ------------------------------ Add - -namespace detail { -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL) -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL) -} // namespace detail - -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL) - -// ------------------------------ Sub -namespace detail { -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL) -} // namespace detail - -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL) - -// ------------------------------ SaturatedAdd - -#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB -#undef HWY_NATIVE_I32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I32_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB -#undef HWY_NATIVE_U32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U32_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB -#undef HWY_NATIVE_I64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I64_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB -#undef HWY_NATIVE_U64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_U64_SATURATED_ADDSUB -#endif - -HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL) - -// ------------------------------ SaturatedSub - -HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL) - -// ------------------------------ AverageRound - -// Define this to opt-out of the default behavior, which is AVOID on certain -// compiler versions. You can define only this to use VXRM, or define both this -// and HWY_RVV_AVOID_VXRM to always avoid VXRM. -#ifndef HWY_RVV_CHOOSE_VXRM - -// Assume that GCC-13 defaults to 'avoid VXRM'. Tested with GCC 13.1.0. -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400 -#define HWY_RVV_AVOID_VXRM -// Clang 16 with __riscv_v_intrinsic == 11000 may either require VXRM or avoid. -// Assume earlier versions avoid. -#elif HWY_COMPILER_CLANG && \ - (HWY_COMPILER_CLANG < 1600 || __riscv_v_intrinsic < 11000) -#define HWY_RVV_AVOID_VXRM -#endif - -#endif // HWY_RVV_CHOOSE_VXRM - -// Adding __RISCV_VXRM_* was a backwards-incompatible change and it is not clear -// how to detect whether it is supported or required. #ifdef __RISCV_VXRM_RDN -// does not work because it seems to be a compiler built-in, but neither does -// __has_builtin(__RISCV_VXRM_RDN). The intrinsics version was also not updated, -// so we require a macro to opt out of the new intrinsics. -#ifdef HWY_RVV_AVOID_VXRM -#define HWY_RVV_INSERT_VXRM(vxrm, avl) avl -#define __RISCV_VXRM_RNU -#define __RISCV_VXRM_RDN -#else // default: use new vxrm arguments -#define HWY_RVV_INSERT_VXRM(vxrm, avl) vxrm, avl -#endif - -// Extra rounding mode = up argument. 
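To make the two possible expansions concrete, here is a sketch (kept in comment form, since which variant compiles depends on the intrinsics version detected above) of the AverageRound body below for u8 with LMUL 1:

// Newer intrinsics: explicit fixed-point rounding-mode argument.
//   __riscv_vaaddu_vv_u8m1(a, b, __RISCV_VXRM_RNU, avl);
//
// Older intrinsics (HWY_RVV_AVOID_VXRM): HWY_RVV_INSERT_VXRM drops the
// rounding-mode argument and passes only the AVL.
//   __riscv_vaaddu_vv_u8m1(a, b, avl);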
-#define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \ - a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \ - } - -HWY_RVV_FOREACH_U08(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL) -HWY_RVV_FOREACH_U16(HWY_RVV_RETV_AVERAGE, AverageRound, aaddu, _ALL) - -#undef HWY_RVV_RETV_AVERAGE - -// ------------------------------ ShiftLeft[Same] - -// Intrinsics do not define .vi forms, so use .vx instead. -#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \ - return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast(bits), \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL) - -// ------------------------------ ShiftRight[Same] - -HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL) - -#undef HWY_RVV_SHIFT - -// ------------------------------ SumsOf8 (ShiftRight, Add) -template -HWY_API VFromD>> SumsOf8(const VU8 v) { - const DFromV du8; - const RepartitionToWide du16; - const RepartitionToWide du32; - const RepartitionToWide du64; - using VU16 = VFromD; - - const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); - const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF); - const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); - - const VU16 szz_FE_zz_BA_zz_76_zz_32 = - BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); - const VU16 sxx_FC_xx_B8_xx_74_xx_30 = - Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); - const VU16 szz_zz_xx_FC_zz_zz_xx_74 = - BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); - const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = - Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); - return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull); -} - -// ------------------------------ RotateRight -template -HWY_API V RotateRight(const V v) { - constexpr size_t kSizeInBits = sizeof(TFromV) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -// ------------------------------ Shl -#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, bits, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL) - -#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \ - const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, BitCast(du, bits), \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL) - -// ------------------------------ Shr - -HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, 
Shr, srl, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL) - -#undef HWY_RVV_SHIFT_II -#undef HWY_RVV_SHIFT_VV - -// ------------------------------ Min - -namespace detail { - -HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MinS, minu_vx, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MinS, min_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MinS, fmin_vf, _ALL) - -} // namespace detail - -HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL) - -// ------------------------------ Max - -namespace detail { - -HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL) - -} // namespace detail - -HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL) - -// ------------------------------ Mul - -// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. -#ifdef HWY_NATIVE_MUL_8 -#undef HWY_NATIVE_MUL_8 -#else -#define HWY_NATIVE_MUL_8 -#endif -#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - -HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL) - -// ------------------------------ MulHigh - -// Only for internal use (Highway only promises MulHigh for 16-bit inputs). -// Used by MulEven; vwmul does not work for m8. -namespace detail { -HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL) -HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL) -} // namespace detail - -HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL) -HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL) - -// ------------------------------ MulFixedPoint15 - -// Extra rounding mode = up argument. -#define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \ - a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \ - } - -HWY_RVV_FOREACH_I16(HWY_RVV_MUL15, MulFixedPoint15, smul, _ALL) - -#undef HWY_RVV_MUL15 - -// ------------------------------ Div -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL) - -// ------------------------------ ApproximateReciprocal -#ifdef HWY_NATIVE_F64_APPROX_RECIP -#undef HWY_NATIVE_F64_APPROX_RECIP -#else -#define HWY_NATIVE_F64_APPROX_RECIP -#endif - -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL) - -// ------------------------------ Sqrt -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL) - -// ------------------------------ ApproximateReciprocalSqrt -#ifdef HWY_NATIVE_F64_APPROX_RSQRT -#undef HWY_NATIVE_F64_APPROX_RSQRT -#else -#define HWY_NATIVE_F64_APPROX_RSQRT -#endif - -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL) - -// ------------------------------ MulAdd - -// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd. -#ifdef HWY_NATIVE_INT_FMA -#undef HWY_NATIVE_INT_FMA -#else -#define HWY_NATIVE_INT_FMA -#endif - -// Note: op is still named vv, not vvv. 
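A minimal sketch of one expansion of the FMA macro below (f32, LMUL 1), to make the operand order explicit: vfmacc computes vd + vs1 * vs2, so passing add as vd yields MulAdd(mul, x, add) == mul * x + add. The wrapper name and the vsetvlmax call are illustrative only.

#include <riscv_vector.h>

vfloat32m1_t MulAddExampleF32(vfloat32m1_t mul, vfloat32m1_t x,
                              vfloat32m1_t add) {
  const size_t avl = __riscv_vsetvlmax_e32m1();
  // vd = add accumulates the product, matching the (add, mul, x) argument
  // order used in the macro below.
  return __riscv_vfmacc_vv_f32m1(add, mul, x, avl);
}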
-#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \ - HWY_RVV_V(BASE, SEW, LMUL) add) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_UI(HWY_RVV_FMA, MulAdd, macc, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL) - -// ------------------------------ NegMulAdd -HWY_RVV_FOREACH_UI(HWY_RVV_FMA, NegMulAdd, nmsac, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL) - -// ------------------------------ MulSub -HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL) - -// ------------------------------ NegMulSub -HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL) - -#undef HWY_RVV_FMA - -// ================================================== COMPARE - -// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in -// vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th -// of all bits; SEW=8 / LMUL=4 = half of all bits. - -// SFINAE for mapping Simd<> to MLEN (up to 64). -#define HWY_RVV_IF_MLEN_D(D, MLEN) \ - hwy::EnableIf* = nullptr - -// Specialized for RVV instead of the generic test_util-inl.h implementation -// because more efficient, and helps implement MFromD. - -#define HWY_RVV_MASK_FALSE(SEW, SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_M(MLEN) NAME(D d) { \ - return __riscv_vm##OP##_m_b##MLEN(Lanes(d)); \ - } - -HWY_RVV_FOREACH_B(HWY_RVV_MASK_FALSE, MaskFalse, clr) -#undef HWY_RVV_MASK_FALSE -#undef HWY_RVV_IF_MLEN_D - -template -using MFromD = decltype(MaskFalse(D())); - -// mask = f(vector, vector) -#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_M(MLEN) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN( \ - a, b, HWY_RVV_AVL(SEW, SHIFT)); \ - } - -// mask = f(vector, scalar) -#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_M(MLEN) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL##_b##MLEN( \ - a, b, HWY_RVV_AVL(SEW, SHIFT)); \ - } - -// ------------------------------ Eq -HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL) - -namespace detail { -HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL) -} // namespace detail - -// ------------------------------ Ne -HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL) - -namespace detail { -HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL) -} // namespace detail - -// ------------------------------ Lt -HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL) - -namespace detail { -HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL) -HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL) -} // namespace detail - -// ------------------------------ Le -HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Le, msleu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Le, 
msle, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL) - -#undef HWY_RVV_RETM_ARGVV -#undef HWY_RVV_RETM_ARGVS - -// ------------------------------ Gt/Ge - -template -HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) { - return Le(b, a); -} - -template -HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) { - return Lt(b, a); -} - -// ------------------------------ TestBit -template -HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) { - return detail::NeS(And(a, bit), 0); -} - -// ------------------------------ Not -// NOLINTNEXTLINE -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not ) - -// ------------------------------ And - -// mask = f(mask_a, mask_b) (note arg2,arg1 order!) -#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \ - return __riscv_vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and) - -// ------------------------------ AndNot -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn) - -// ------------------------------ Or -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or) - -// ------------------------------ Xor -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor) - -// ------------------------------ ExclusiveNeither -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor) - -#undef HWY_RVV_RETM_ARGMM - -// ------------------------------ IfThenElse - -#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \ - HWY_RVV_V(BASE, SEW, LMUL) no) { \ - return __riscv_v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL) - -#undef HWY_RVV_IF_THEN_ELSE - -// ------------------------------ IfThenElseZero -template -HWY_API V IfThenElseZero(const M mask, const V yes) { - return IfThenElse(mask, yes, Zero(DFromV())); -} - -// ------------------------------ IfThenZeroElse - -#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ - LMULH, SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL(no, 0, m, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL) - -#undef HWY_RVV_IF_THEN_ZERO_ELSE - -// ------------------------------ MaskFromVec -template -HWY_API MFromD> MaskFromVec(const V v) { - return detail::NeS(v, 0); -} - -// ------------------------------ RebindMask -template -HWY_API MFromD RebindMask(const D /*d*/, const MFrom mask) { - // No need to check lane size/LMUL are the same: if not, casting MFrom to - // MFromD would fail. - return mask; -} - -// ------------------------------ VecFromMask - -// Returns mask ? ~0 : 0. No longer use sub.vx(Zero(), 1, mask) because per the -// default mask-agnostic policy, the result of inactive lanes may also be ~0. 
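A minimal sketch of one expansion of the VecFromMask macro below (i32, LMUL 1): merging -1 into the active lanes of a zero vector guarantees that inactive lanes are zero, which the masked-subtract approach did not. The wrapper name is illustrative only.

#include <riscv_vector.h>

vint32m1_t VecFromMaskExampleI32(vbool32_t m, size_t lanes) {
  const vint32m1_t zero = __riscv_vmv_v_x_i32m1(0, lanes);
  return __riscv_vmerge_vxm_i32m1(zero, -1, m, lanes);
}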
-#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \ - const RebindToSigned di; \ - using TI = TFromD; \ - return BitCast( \ - d, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, Lanes(d))); \ - } - -HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, VecFromMask, merge_vxm, _ALL_VIRT) - -#undef HWY_RVV_VEC_FROM_MASK - -template -HWY_API VFromD VecFromMask(const D d, MFromD mask) { - return BitCast(d, VecFromMask(RebindToUnsigned(), mask)); -} - -// ------------------------------ IfVecThenElse (MaskFromVec) -template -HWY_API V IfVecThenElse(const V mask, const V yes, const V no) { - return IfThenElse(MaskFromVec(mask), yes, no); -} - -// ------------------------------ ZeroIfNegative -template -HWY_API V ZeroIfNegative(const V v) { - return IfThenZeroElse(detail::LtS(v, 0), v); -} - -// ------------------------------ BroadcastSignBit -template -HWY_API V BroadcastSignBit(const V v) { - return ShiftRight) * 8 - 1>(v); -} - -// ------------------------------ IfNegativeThenElse (BroadcastSignBit) -template -HWY_API V IfNegativeThenElse(V v, V yes, V no) { - static_assert(IsSigned>(), "Only works for signed/float"); - const DFromV d; - const RebindToSigned di; - - MFromD m = detail::LtS(BitCast(di, v), 0); - return IfThenElse(m, yes, no); -} - -// ------------------------------ FindFirstTrue - -#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \ - static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ - return __riscv_vfirst_m_b##MLEN(m, Lanes(d)); \ - } \ - template \ - HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \ - static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ - return static_cast(__riscv_vfirst_m_b##MLEN(m, Lanes(d))); \ - } - -HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _) -#undef HWY_RVV_FIND_FIRST_TRUE - -// ------------------------------ AllFalse -template -HWY_API bool AllFalse(D d, MFromD m) { - return FindFirstTrue(d, m) < 0; -} - -// ------------------------------ AllTrue - -#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \ - static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ - return AllFalse(d, __riscv_vmnot_m_b##MLEN(m, Lanes(d))); \ - } - -HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _) -#undef HWY_RVV_ALL_TRUE - -// ------------------------------ CountTrue - -#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \ - static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \ - return __riscv_vcpop_m_b##MLEN(m, Lanes(d)); \ - } - -HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _) -#undef HWY_RVV_COUNT_TRUE - -// ================================================== MEMORY - -// ------------------------------ Load - -#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT) -#undef HWY_RVV_LOAD - -// There is no native BF16, treat as uint16_t. 
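A short usage sketch of the consequence, assuming hwy's public ScalableTag/Load/Store API: bf16 data round-trips through 16-bit integer lanes via the overloads defined next, so a plain copy of one vector's worth of elements needs no native bf16 support. The function name is illustrative only.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Copies the first Lanes(d) bf16 elements; illustrative only.
void CopyOneVectorBF16(const hwy::bfloat16_t* HWY_RESTRICT from,
                       hwy::bfloat16_t* HWY_RESTRICT to) {
  const hn::ScalableTag<hwy::bfloat16_t> d;
  hn::Store(hn::Load(d, from), d, to);
}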
-template -HWY_API VFromD> Load(Simd d, - const bfloat16_t* HWY_RESTRICT p) { - return Load(RebindToSigned(), - reinterpret_cast(p)); -} - -template -HWY_API void Store(VFromD> v, - Simd d, bfloat16_t* HWY_RESTRICT p) { - Store(v, RebindToSigned(), - reinterpret_cast(p)); -} - -#if !HWY_HAVE_FLOAT16 // Otherwise already defined above. - -// NOTE: different type for float16_t than bfloat16_t, see Set(). -template -HWY_API VFromD> Load(Simd d, - const float16_t* HWY_RESTRICT p) { - return Load(RebindToUnsigned(), - reinterpret_cast(p)); -} - -template -HWY_API void Store(VFromD> v, - Simd d, float16_t* HWY_RESTRICT p) { - Store(v, RebindToUnsigned(), - reinterpret_cast(p)); -} - -#endif // !HWY_HAVE_FLOAT16 - -// ------------------------------ LoadU -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - // RVV only requires element alignment, not vector alignment. - return Load(d, p); -} - -// ------------------------------ MaskedLoad - -#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu(m, Zero(d), p, \ - Lanes(d)); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu(m, v, p, Lanes(d)); \ - } - -HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT) -#undef HWY_RVV_MASKED_LOAD - -// ------------------------------ Store - -#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT) -#undef HWY_RVV_STORE - -// ------------------------------ BlendedStore - -#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT) -#undef HWY_RVV_BLENDED_STORE - -// ------------------------------ StoreN - -namespace detail { - -#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count); \ - } -HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT) -#undef HWY_RVV_STOREN - -} // namespace detail - -#ifdef HWY_NATIVE_STORE_N -#undef HWY_NATIVE_STORE_N -#else -#define HWY_NATIVE_STORE_N -#endif - -template , - hwy::EnableIf>>()>* = nullptr> -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - // NOTE: Need to call Lanes(d) and clamp max_lanes_to_store to Lanes(d), even - // if MaxLanes(d) >= MaxLanes(DFromV>()) is true, as it is possible - 
// for detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than - // Lanes(DFromV>()) lanes to p if - // max_lanes_to_store > Lanes(DFromV>()) and - // max_lanes_to_store < 2 * Lanes(DFromV>()) are both true. - - // Also need to make sure that no more than Lanes(d) lanes are stored to p - // if Lanes(d) < Lanes(DFromV>()) is true, which is possible if - // MaxLanes(d) < MaxLanes(DFromV>()) or - // d.Pow2() < DFromV>().Pow2() is true. - const size_t N = Lanes(d); - detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d, p); -} - -// StoreN for BF16/F16 vectors -template , - hwy::EnableIf>>()>* = nullptr, - HWY_IF_SPECIAL_FLOAT(T)> -HWY_API void StoreN(VFromD v, D /*d*/, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - using TStore = TFromV>; - const Rebind d_store; - const size_t N = Lanes(d_store); - detail::StoreN(HWY_MIN(max_lanes_to_store, N), v, d_store, - reinterpret_cast(p)); -} - -// ------------------------------ StoreU -template -HWY_API void StoreU(const V v, D d, TFromD* HWY_RESTRICT p) { - // RVV only requires element alignment, not vector alignment. - Store(v, d, p); -} - -// ------------------------------ Stream -template -HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) { - Store(v, d, aligned); -} - -// ------------------------------ ScatterOffset - -#ifdef HWY_NATIVE_SCATTER -#undef HWY_NATIVE_SCATTER -#else -#define HWY_NATIVE_SCATTER -#endif - -#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ - HWY_RVV_V(int, SEW, LMUL) offset) { \ - const RebindToUnsigned du; \ - return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ - base, BitCast(du, offset), v, Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT) -#undef HWY_RVV_SCATTER - -// ------------------------------ ScatterIndex -template -HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, - VFromD> indices) { - constexpr size_t kBits = CeilLog2(sizeof(TFromD)); - return ScatterOffset(v, d, base, ShiftLeft(indices)); -} - -// ------------------------------ MaskedScatterIndex - -#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ - LMULH, SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ - HWY_RVV_V(int, SEW, LMUL) indices) { \ - const RebindToUnsigned du; \ - constexpr size_t kBits = CeilLog2(sizeof(TFromD)); \ - return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \ - m, base, ShiftLeft(BitCast(du, indices)), v, Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_MASKED_SCATTER, MaskedScatterIndex, sux, _ALL_VIRT) -#undef HWY_RVV_MASKED_SCATTER - -// ------------------------------ GatherOffset - -#ifdef HWY_NATIVE_GATHER -#undef HWY_NATIVE_GATHER -#else -#define HWY_NATIVE_GATHER -#endif - -#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ - HWY_RVV_V(int, SEW, LMUL) offset) { \ - const RebindToUnsigned du; \ - return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \ - base, BitCast(du, offset), Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT) -#undef HWY_RVV_GATHER - -// 
------------------------------ GatherIndex - -template -HWY_API VFromD GatherIndex(D d, const TFromD* HWY_RESTRICT base, - const VFromD> index) { - constexpr size_t kBits = CeilLog2(sizeof(TFromD)); - return GatherOffset(d, base, ShiftLeft(index)); -} - -// ------------------------------ MaskedGatherIndex - -#define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \ - HWY_RVV_V(int, SEW, LMUL) indices) { \ - const RebindToUnsigned du; \ - constexpr size_t kBits = CeilLog2(SEW / 8); \ - return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \ - m, Zero(d), base, ShiftLeft(BitCast(du, indices)), Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_MASKED_GATHER, MaskedGatherIndex, lux, _ALL_VIRT) -#undef HWY_RVV_MASKED_GATHER - -// ================================================== CONVERT - -// ------------------------------ PromoteTo - -// SEW is for the input. -#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \ - HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##CHAR##SEWD##LMULD(v, Lanes(d)); \ - } - -HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT) -HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT) -HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT) -HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT) -HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT) -HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT) -HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_, _EXT_VIRT) - -#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C - -HWY_RVV_FOREACH_F16_UNCONDITIONAL(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_, - _EXT_VIRT) - -// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. -#ifdef HWY_NATIVE_F16C -#undef HWY_NATIVE_F16C -#else -#define HWY_NATIVE_F16C -#endif -#endif // HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C - -#undef HWY_RVV_PROMOTE - -// The above X-macro cannot handle 4x promotion nor type switching. -// TODO(janwas): use BASE2 arg to allow the latter. 
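Before the 4x/8x cases, a minimal sketch of one expansion of the 2x PromoteTo above (u8 -> u16, LMUL 1 -> 2): the result uses the doubled LMUL, and the AVL is the destination's lane count. The wrapper name and the explicit lanes parameter are illustrative only.

#include <riscv_vector.h>

vuint16m2_t PromoteU8ToU16Example(vuint8m1_t v, size_t lanes) {
  // zext_vf2 zero-extends each u8 lane to u16; the macro above passes
  // Lanes(d) of the wider tag as the AVL.
  return __riscv_vzext_vf2_u16m2(v, lanes);
}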
-#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \ - SHIFT, ADD) \ - template \ - HWY_API HWY_RVV_V(BASE, BITS, LMUL) \ - PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \ - HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \ - return __riscv_v##OP##CHAR##BITS##LMUL(v, Lanes(d)); \ - } - -#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1) - -#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2) - -#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \ - HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) - -#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf8, -3, 3) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf4, -2, 3) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, mf2, -1, 3) \ - HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m1, 0, 3) - -HWY_RVV_PROMOTE_X8(zext_vf8_, uint, u, 64, uint, 8) -HWY_RVV_PROMOTE_X8(sext_vf8_, int, i, 64, int, 8) - -HWY_RVV_PROMOTE_X4_FROM_U8(zext_vf4_, uint, u, 32, uint, 8) -HWY_RVV_PROMOTE_X4_FROM_U8(sext_vf4_, int, i, 32, int, 8) -HWY_RVV_PROMOTE_X4(zext_vf4_, uint, u, 64, uint, 16) -HWY_RVV_PROMOTE_X4(sext_vf4_, int, i, 64, int, 16) - -// i32 to f64 -HWY_RVV_PROMOTE_X2(fwcvt_f_x_v_, float, f, 64, int, 32) - -#undef HWY_RVV_PROMOTE_X8 -#undef HWY_RVV_PROMOTE_X4_FROM_U8 -#undef HWY_RVV_PROMOTE_X4 -#undef HWY_RVV_PROMOTE_X2 -#undef HWY_RVV_PROMOTE - -// I16->I64 or U16->U64 PromoteTo with virtual LMUL -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return PromoteTo(ScalableTag(), v); -} - -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return PromoteTo(ScalableTag(), v); -} - -// Unsigned to signed: cast for unsigned promote. 
-template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return BitCast(d, PromoteTo(RebindToUnsigned(), v)); -} - -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return BitCast(d, PromoteTo(RebindToUnsigned(), v)); -} - -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return BitCast(d, PromoteTo(RebindToUnsigned(), v)); -} - -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return BitCast(d, PromoteTo(RebindToUnsigned(), v)); -} - -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return BitCast(d, PromoteTo(RebindToUnsigned(), v)); -} - -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - return BitCast(d, PromoteTo(RebindToUnsigned(), v)); -} - -template -HWY_API auto PromoteTo(Simd d, - VFromD> v) - -> VFromD { - const RebindToSigned di32; - const Rebind du16; - return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); -} - -// ------------------------------ DemoteTo U - -// SEW is for the source so we can use _DEMOTE_VIRT. -#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ - HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##CHAR##SEWH##LMULH( \ - v, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \ - } - -// Unsigned -> unsigned -HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT) -HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT) -HWY_RVV_FOREACH_U64(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT) - -// SEW is for the source so we can use _DEMOTE_VIRT. -#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \ - HWY_RVV_D(uint, SEWH, N, SHIFT - 1) dn, HWY_RVV_V(int, SEW, LMUL) v) { \ - const HWY_RVV_D(uint, SEW, N, SHIFT) du; \ - /* First clamp negative numbers to zero to match x86 packus. 
*/ \ - return DemoteTo(dn, BitCast(du, detail::MaxS(v, 0))); \ - } -HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) -HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) -HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT) -#undef HWY_RVV_DEMOTE_I_TO_U - -template -HWY_API vuint8mf8_t DemoteTo(Simd d, const vint32mf2_t v) { - return __riscv_vnclipu_wx_u8mf8( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8mf4_t DemoteTo(Simd d, const vint32m1_t v) { - return __riscv_vnclipu_wx_u8mf4( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8mf2_t DemoteTo(Simd d, const vint32m2_t v) { - return __riscv_vnclipu_wx_u8mf2( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8m1_t DemoteTo(Simd d, const vint32m4_t v) { - return __riscv_vnclipu_wx_u8m1( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8m2_t DemoteTo(Simd d, const vint32m8_t v) { - return __riscv_vnclipu_wx_u8m2( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} - -template -HWY_API vuint8mf8_t DemoteTo(Simd d, const vuint32mf2_t v) { - return __riscv_vnclipu_wx_u8mf8( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8mf4_t DemoteTo(Simd d, const vuint32m1_t v) { - return __riscv_vnclipu_wx_u8mf4( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8mf2_t DemoteTo(Simd d, const vuint32m2_t v) { - return __riscv_vnclipu_wx_u8mf2( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8m1_t DemoteTo(Simd d, const vuint32m4_t v) { - return __riscv_vnclipu_wx_u8m1( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} -template -HWY_API vuint8m2_t DemoteTo(Simd d, const vuint32m8_t v) { - return __riscv_vnclipu_wx_u8m2( - DemoteTo(Simd(), v), 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); -} - -template -HWY_API VFromD> DemoteTo( - Simd d, VFromD> v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} - -template -HWY_API VFromD> DemoteTo( - Simd d, VFromD> v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} - -template -HWY_API VFromD> DemoteTo( - Simd d, VFromD> v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} - -template -HWY_API VFromD> DemoteTo( - Simd d, VFromD> v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} - -HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) { - const size_t avl = Lanes(ScalableTag()); - return __riscv_vnclipu_wx_u8mf8( - __riscv_vnclipu_wx_u16mf4(v, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)), - 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} -HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) { - const size_t avl = Lanes(ScalableTag()); - return __riscv_vnclipu_wx_u8mf4( - __riscv_vnclipu_wx_u16mf2(v, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)), - 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} -HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) { - const size_t avl = Lanes(ScalableTag()); - return __riscv_vnclipu_wx_u8mf2( - __riscv_vnclipu_wx_u16m1(v, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)), - 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} -HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) { - const size_t avl = Lanes(ScalableTag()); - return __riscv_vnclipu_wx_u8m1( - 
__riscv_vnclipu_wx_u16m2(v, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)), - 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} -HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) { - const size_t avl = Lanes(ScalableTag()); - return __riscv_vnclipu_wx_u8m2( - __riscv_vnclipu_wx_u16m4(v, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)), - 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -// ------------------------------ Truncations - -template -HWY_API vuint8mf8_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m1_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - const vuint16mf4_t v3 = __riscv_vnclipu_wx_u16mf4( - v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8mf8(v3, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m2_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - const vuint16mf2_t v3 = __riscv_vnclipu_wx_u16mf2( - v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8mf4(v3, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m4_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - const vuint16m1_t v3 = __riscv_vnclipu_wx_u16m1( - v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8mf2(v3, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8m1_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m8_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - const vuint16m2_t v3 = __riscv_vnclipu_wx_u16m2( - v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8m1(v3, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16mf4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl); - const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u16mf4(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16mf2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m2_t v1 = __riscv_vand(v, 0xFFFF, avl); - const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u16mf2(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16m1_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m4_t v1 = __riscv_vand(v, 0xFFFF, avl); - const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u16m1(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16m2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m8_t v1 = __riscv_vand(v, 0xFFFF, avl); - const vuint32m4_t v2 = 
__riscv_vnclipu_wx_u32m4( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u16m2(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint32mf2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl); - return __riscv_vnclipu_wx_u32mf2(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint32m1_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m2_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl); - return __riscv_vnclipu_wx_u32m1(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint32m2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m4_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl); - return __riscv_vnclipu_wx_u32m2(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint32m4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint64m8_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl); - return __riscv_vnclipu_wx_u32m4(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf8_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32mf2_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint16mf4_t v2 = __riscv_vnclipu_wx_u16mf4( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8mf8(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32m1_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint16mf2_t v2 = __riscv_vnclipu_wx_u16mf2( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8mf4(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32m2_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint16m1_t v2 = __riscv_vnclipu_wx_u16m1( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8mf2(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8m1_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32m4_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint16m2_t v2 = __riscv_vnclipu_wx_u16m2( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8m1(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8m2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32m8_t v1 = __riscv_vand(v, 0xFF, avl); - const vuint16m4_t v2 = __riscv_vnclipu_wx_u16m4( - v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); - return __riscv_vnclipu_wx_u8m2(v2, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16mf4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl); - return __riscv_vnclipu_wx_u16mf4(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16mf2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32m1_t v1 = __riscv_vand(v, 0xFFFF, avl); - return __riscv_vnclipu_wx_u16mf2(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16m1_t TruncateTo(Simd d, - const VFromD> v) { - const size_t 
avl = Lanes(d); - const vuint32m2_t v1 = __riscv_vand(v, 0xFFFF, avl); - return __riscv_vnclipu_wx_u16m1(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16m2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32m4_t v1 = __riscv_vand(v, 0xFFFF, avl); - return __riscv_vnclipu_wx_u16m2(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint16m4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint32m8_t v1 = __riscv_vand(v, 0xFFFF, avl); - return __riscv_vnclipu_wx_u16m4(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf8_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint16mf4_t v1 = __riscv_vand(v, 0xFF, avl); - return __riscv_vnclipu_wx_u8mf8(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint16mf2_t v1 = __riscv_vand(v, 0xFF, avl); - return __riscv_vnclipu_wx_u8mf4(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8mf2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint16m1_t v1 = __riscv_vand(v, 0xFF, avl); - return __riscv_vnclipu_wx_u8mf2(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8m1_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint16m2_t v1 = __riscv_vand(v, 0xFF, avl); - return __riscv_vnclipu_wx_u8m1(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8m2_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint16m4_t v1 = __riscv_vand(v, 0xFF, avl); - return __riscv_vnclipu_wx_u8m2(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -template -HWY_API vuint8m4_t TruncateTo(Simd d, - const VFromD> v) { - const size_t avl = Lanes(d); - const vuint16m8_t v1 = __riscv_vand(v, 0xFF, avl); - return __riscv_vnclipu_wx_u8m4(v1, 0, - HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)); -} - -// ------------------------------ DemoteTo I - -HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) -HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) -HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT) - -template -HWY_API vint8mf8_t DemoteTo(Simd d, const vint32mf2_t v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} -template -HWY_API vint8mf4_t DemoteTo(Simd d, const vint32m1_t v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} -template -HWY_API vint8mf2_t DemoteTo(Simd d, const vint32m2_t v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} -template -HWY_API vint8m1_t DemoteTo(Simd d, const vint32m4_t v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} -template -HWY_API vint8m2_t DemoteTo(Simd d, const vint32m8_t v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} - -template -HWY_API VFromD> DemoteTo( - Simd d, VFromD> v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} - -template -HWY_API VFromD> DemoteTo( - Simd d, VFromD> v) { - return DemoteTo(d, DemoteTo(Simd(), v)); -} - -#undef HWY_RVV_DEMOTE - -// ------------------------------ DemoteTo F - -// SEW is for the source so we can use _DEMOTE_VIRT. 
-#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ - HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##SEWH##LMULH(v, Lanes(d)); \ - } - -#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C -HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT) -#endif -HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT) -#undef HWY_RVV_DEMOTE_F - -// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F. -template -HWY_API vint32mf2_t DemoteTo(Simd d, const vfloat64m1_t v) { - return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); -} -template -HWY_API vint32mf2_t DemoteTo(Simd d, const vfloat64m1_t v) { - return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d)); -} -template -HWY_API vint32m1_t DemoteTo(Simd d, const vfloat64m2_t v) { - return __riscv_vfncvt_rtz_x_f_w_i32m1(v, Lanes(d)); -} -template -HWY_API vint32m2_t DemoteTo(Simd d, const vfloat64m4_t v) { - return __riscv_vfncvt_rtz_x_f_w_i32m2(v, Lanes(d)); -} -template -HWY_API vint32m4_t DemoteTo(Simd d, const vfloat64m8_t v) { - return __riscv_vfncvt_rtz_x_f_w_i32m4(v, Lanes(d)); -} - -// SEW is for the source so we can use _DEMOTE_VIRT. -#define HWY_RVV_DEMOTE_TO_SHR_16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ - LMULH, SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \ - HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##CHAR##SEWH##LMULH( \ - v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \ - } -namespace detail { -HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_TO_SHR_16, DemoteToShr16, nclipu_wx_, - _DEMOTE_VIRT) -} -#undef HWY_RVV_DEMOTE_TO_SHR_16 - -template -HWY_API VFromD> DemoteTo( - Simd d, VFromD> v) { - const RebindToUnsigned du16; - const Rebind du32; - return BitCast(d, detail::DemoteToShr16(du16, BitCast(du32, v))); -} - -// ------------------------------ ConvertTo F - -#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \ - return __riscv_vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) { \ - return __riscv_vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \ - } \ - /* Truncates (rounds toward zero). */ \ - template \ - HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \ - HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \ - } \ -// API only requires f32 but we provide f64 for internal use. -HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT) -#undef HWY_RVV_CONVERT - -// Uses default rounding mode. Must be separate because there is no D arg. 
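// Illustrative contrast between the two conversions (a sketch using the public
// Highway API; ScalableTag/Set are assumed from the core headers):
//
//   const ScalableTag<float> df;
//   const RebindToSigned<decltype(df)> di;
//   const auto v = Set(df, -1.5f);
//   const auto t = ConvertTo(di, v);  // -1: truncates toward zero, takes d
//   const auto n = NearestInt(v);     // -2: ties-to-even, AVL from the type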
-#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \ - } -HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL) -#undef HWY_RVV_NEAREST - -// ================================================== COMBINE - -namespace detail { - -// For x86-compatible behaviour mandated by Highway API: TableLookupBytes -// offsets are implicitly relative to the start of their 128-bit block. -template -HWY_INLINE size_t LanesPerBlock(Simd d) { - // kMinVecBytes is the minimum size of VFromD in bytes - constexpr size_t kMinVecBytes = - ScaleByPower(16, HWY_MAX(HWY_MIN(kPow2, 3), -3)); - // kMinVecLanes is the minimum number of lanes in VFromD - constexpr size_t kMinVecLanes = (kMinVecBytes + sizeof(T) - 1) / sizeof(T); - // kMaxLpb is the maximum number of lanes per block - constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), MaxLanes(d)); - - // If kMaxLpb <= kMinVecLanes is true, then kMaxLpb <= Lanes(d) is true - if (kMaxLpb <= kMinVecLanes) return kMaxLpb; - - // Fractional LMUL: Lanes(d) may be smaller than kMaxLpb, so honor that. - const size_t lanes_per_vec = Lanes(d); - return HWY_MIN(lanes_per_vec, kMaxLpb); -} - -template -HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) { - using T = MakeUnsigned>; - return AndS(iota0, static_cast(~(LanesPerBlock(d) - 1))); -} - -template -HWY_INLINE MFromD FirstNPerBlock(D /* tag */) { - const RebindToUnsigned du; - const RebindToSigned di; - using TU = TFromD; - const auto idx_mod = AndS(Iota0(du), static_cast(LanesPerBlock(du) - 1)); - return LtS(BitCast(di, idx_mod), static_cast>(kLanes)); -} - -#define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \ - size_t lanes) { \ - return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -#define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) src, size_t lanes) { \ - return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(src, lanes, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH(HWY_RVV_SLIDE_UP, SlideUp, slideup, _ALL) -HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL) - -#undef HWY_RVV_SLIDE_UP -#undef HWY_RVV_SLIDE_DOWN - -} // namespace detail - -// ------------------------------ SlideUpLanes -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { - return detail::SlideUp(Zero(d), v, amt); -} - -// ------------------------------ SlideDownLanes -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { - v = detail::SlideDown(v, amt); - // Zero out upper lanes if v is a partial vector - if (MaxLanes(d) < MaxLanes(DFromV())) { - v = IfThenElseZero(FirstN(d, Lanes(d) - amt), v); - } - return v; -} - -// ------------------------------ ConcatUpperLower -template -HWY_API V ConcatUpperLower(D d, const V hi, const V lo) { - return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); -} - -// ------------------------------ ConcatLowerLower -template -HWY_API V ConcatLowerLower(D d, const V hi, const V lo) { - return detail::SlideUp(lo, hi, Lanes(d) / 2); -} - -// ------------------------------ ConcatUpperUpper -template -HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) { - 
// Move upper half into lower - const auto lo_down = detail::SlideDown(lo, Lanes(d) / 2); - return ConcatUpperLower(d, hi, lo_down); -} - -// ------------------------------ ConcatLowerUpper -template -HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) { - // Move half of both inputs to the other half - const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2); - const auto lo_down = detail::SlideDown(lo, Lanes(d) / 2); - return ConcatUpperLower(d, hi_up, lo_down); -} - -// ------------------------------ Combine -template -HWY_API VFromD Combine(D2 d2, const V hi, const V lo) { - return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi), - Lanes(d2) / 2); -} - -// ------------------------------ ZeroExtendVector -template -HWY_API VFromD ZeroExtendVector(D2 d2, const V lo) { - return Combine(d2, Xor(lo, lo), lo); -} - -// ------------------------------ Lower/UpperHalf - -namespace detail { - -// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note -// that SEW = sizeof(T)*8 and LMUL = 1 << d.Pow2(). Add 3 to Pow2 to avoid -// negative shift counts. -template -constexpr bool IsSupportedLMUL(D d) { - return (size_t{1} << (d.Pow2() + 3)) >= sizeof(TFromD); -} - -} // namespace detail - -// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL. -template * = nullptr> -HWY_API VFromD LowerHalf(const DH /* tag */, const VFromD> v) { - return detail::Trunc(v); -} - -// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and -// the hardware may set "vill" if we attempt such an LMUL. However, the V -// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it -// still makes sense to have half of an SEW=64 vector. We instead just return -// the vector, and rely on the kPow2 in DH to halve the return value of Lanes(). -template * = nullptr> -HWY_API V LowerHalf(const DH /* tag */, const V v) { - return v; -} - -// Same, but without D arg -template -HWY_API VFromD>> LowerHalf(const V v) { - return LowerHalf(Half>(), v); -} - -template -HWY_API VFromD UpperHalf(const DH d2, const VFromD> v) { - return LowerHalf(d2, detail::SlideDown(v, Lanes(d2))); -} - -// ================================================== SWIZZLE - -namespace detail { -// Special instruction for 1 lane is presumably faster? 
-#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL) -HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL) -#undef HWY_RVV_SLIDE1 -} // namespace detail - -// ------------------------------ Slide1Up and Slide1Down -#ifdef HWY_NATIVE_SLIDE1_UP_DOWN -#undef HWY_NATIVE_SLIDE1_UP_DOWN -#else -#define HWY_NATIVE_SLIDE1_UP_DOWN -#endif - -template -HWY_API VFromD Slide1Up(D /*d*/, VFromD v) { - return detail::Slide1Up(v); -} - -template -HWY_API VFromD Slide1Down(D d, VFromD v) { - v = detail::Slide1Down(v); - // Zero out upper lanes if v is a partial vector - if (MaxLanes(d) < MaxLanes(DFromV())) { - v = IfThenElseZero(FirstN(d, Lanes(d) - 1), v); - } - return v; -} - -// ------------------------------ GetLane - -#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \ - } - -HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL) -#undef HWY_RVV_GET_LANE - -// ------------------------------ ExtractLane -template -HWY_API TFromV ExtractLane(const V v, size_t i) { - return GetLane(detail::SlideDown(v, i)); -} - -// ------------------------------ Additional mask logical operations - -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof) -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetBeforeFirst, sbf) -HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetAtOrBeforeFirst, sif) - -#define HWY_RVV_SET_AT_OR_AFTER_FIRST(SEW, SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_M(MLEN) SetAtOrAfterFirst(HWY_RVV_M(MLEN) m) { \ - return Not(SetBeforeFirst(m)); \ - } - -HWY_RVV_FOREACH_B(HWY_RVV_SET_AT_OR_AFTER_FIRST, _, _) -#undef HWY_RVV_SET_AT_OR_AFTER_FIRST - -// ------------------------------ InsertLane - -template -HWY_API V InsertLane(const V v, size_t i, TFromV t) { - const DFromV d; - const RebindToUnsigned du; // Iota0 is unsigned only - using TU = TFromD; - const auto is_i = detail::EqS(detail::Iota0(du), static_cast(i)); - return IfThenElse(RebindMask(d, is_i), Set(d, t), v); -} - -// For 8-bit lanes, Iota0 might overflow. -template -HWY_API V InsertLane(const V v, size_t i, TFromV t) { - const DFromV d; - const auto zero = Zero(d); - const auto one = Set(d, 1); - const auto ge_i = Eq(detail::SlideUp(zero, one, i), one); - const auto is_i = SetOnlyFirst(ge_i); - return IfThenElse(RebindMask(d, is_i), Set(d, t), v); -} - -// ------------------------------ OddEven - -namespace detail { - -// Faster version using a wide constant instead of Iota0 + AndS. -template -HWY_INLINE MFromD IsEven(D d) { - const RebindToUnsigned du; - const RepartitionToWide duw; - return RebindMask(d, detail::NeS(BitCast(du, Set(duw, 1)), 0u)); -} - -template -HWY_INLINE MFromD IsEven(D d) { - const RebindToUnsigned du; // Iota0 is unsigned only - return detail::EqS(detail::AndS(detail::Iota0(du), 1), 0); -} - -// Also provide the negated form because there is no native CompressNot. 
-template -HWY_INLINE MFromD IsOdd(D d) { - const RebindToUnsigned du; - const RepartitionToWide duw; - return RebindMask(d, detail::EqS(BitCast(du, Set(duw, 1)), 0u)); -} - -template -HWY_INLINE MFromD IsOdd(D d) { - const RebindToUnsigned du; // Iota0 is unsigned only - return detail::NeS(detail::AndS(detail::Iota0(du), 1), 0); -} - -} // namespace detail - -template -HWY_API V OddEven(const V a, const V b) { - return IfThenElse(detail::IsEven(DFromV()), b, a); -} - -// ------------------------------ DupEven (OddEven) -template -HWY_API V DupEven(const V v) { - const V up = detail::Slide1Up(v); - return OddEven(up, v); -} - -// ------------------------------ DupOdd (OddEven) -template -HWY_API V DupOdd(const V v) { - const V down = detail::Slide1Down(v); - return OddEven(v, down); -} - -// ------------------------------ OddEvenBlocks -template -HWY_API V OddEvenBlocks(const V a, const V b) { - const RebindToUnsigned> du; // Iota0 is unsigned only - constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV)); - const auto idx_block = ShiftRight(detail::Iota0(du)); - const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0); - return IfThenElse(is_even, b, a); -} - -// ------------------------------ SwapAdjacentBlocks -template -HWY_API V SwapAdjacentBlocks(const V v) { - const DFromV d; - const size_t lpb = detail::LanesPerBlock(d); - const V down = detail::SlideDown(v, lpb); - const V up = detail::SlideUp(v, v, lpb); - return OddEvenBlocks(up, down); -} - -// ------------------------------ TableLookupLanes - -template -HWY_API VFromD> IndicesFromVec(D d, VI vec) { - static_assert(sizeof(TFromD) == sizeof(TFromV), "Index != lane"); - const RebindToUnsigned du; // instead of : avoids unused d. - const auto indices = BitCast(du, vec); -#if HWY_IS_DEBUG_BUILD - using TU = TFromD; - const size_t twice_num_of_lanes = Lanes(d) * 2; - HWY_DASSERT(AllTrue( - du, Eq(indices, - detail::AndS(indices, static_cast(twice_num_of_lanes - 1))))); -#endif - return indices; -} - -template -HWY_API VFromD> SetTableIndices(D d, const TI* idx) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); - return IndicesFromVec(d, LoadU(Rebind(), idx)); -} - -// TODO(janwas): avoid using this for 8-bit; wrap in detail namespace. -// For large 8-bit vectors, index overflow will lead to incorrect results. -// Reverse already uses TableLookupLanes16 to prevent this. -#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL) -#undef HWY_RVV_TABLE - -namespace detail { - -// Used by I8/U8 Reverse -#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_UI08(HWY_RVV_TABLE16, TableLookupLanes16, rgatherei16, _EXT) -#undef HWY_RVV_TABLE16 - -// Used by Expand. 
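// For example (illustrative): with v = a b c d and mask = 0 1 0 1, Expand
// (defined further below) returns 0 a 0 b. MaskedIota yields gather indices
// 0 and 1 at the two active lanes, and the masked gather copies v[idx] only
// where the mask is set, leaving the Zero maskedoff value elsewhere.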
-#define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \ - HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH(HWY_RVV_MASKED_TABLE, MaskedTableLookupLanes, rgather, _ALL) -#undef HWY_RVV_MASKED_TABLE - -#define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ - LMULH, SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \ - HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_UI08(HWY_RVV_MASKED_TABLE16, MaskedTableLookupLanes16, - rgatherei16, _EXT) -#undef HWY_RVV_MASKED_TABLE16 - -} // namespace detail - -// ------------------------------ Reverse (TableLookupLanes) -template -HWY_API VFromD Reverse(D d, VFromD v) { - const Rebind du16; - const size_t N = Lanes(d); - const auto idx = - detail::ReverseSubS(detail::Iota0(du16), static_cast(N - 1)); - return detail::TableLookupLanes16(v, idx); -} - -template -HWY_API VFromD Reverse(D d, VFromD v) { - const Half dh; - const Rebind du16; - const size_t half_n = Lanes(dh); - const auto idx = detail::ReverseSubS(detail::Iota0(du16), - static_cast(half_n - 1)); - const auto reversed_lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx); - const auto reversed_hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx); - return Combine(d, reversed_lo, reversed_hi); -} - -template -HWY_API VFromD Reverse(D /* tag */, VFromD v) { - const RebindToUnsigned du; - using TU = TFromD; - const size_t N = Lanes(du); - const auto idx = - detail::ReverseSubS(detail::Iota0(du), static_cast(N - 1)); - return TableLookupLanes(v, idx); -} - -// ------------------------------ Reverse2 (RotateRight, OddEven) - -// Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. -#ifdef HWY_NATIVE_REVERSE2_8 -#undef HWY_NATIVE_REVERSE2_8 -#else -#define HWY_NATIVE_REVERSE2_8 -#endif - -// Shifting and adding requires fewer instructions than blending, but casting to -// u32 only works for LMUL in [1/2, 8]. - -template -HWY_API VFromD Reverse2(D d, const VFromD v) { - const detail::AdjustSimdTagToMinVecPow2> du16; - return ResizeBitCast(d, RotateRight<8>(ResizeBitCast(du16, v))); -} - -template -HWY_API VFromD Reverse2(D d, const VFromD v) { - const detail::AdjustSimdTagToMinVecPow2> du32; - return ResizeBitCast(d, RotateRight<16>(ResizeBitCast(du32, v))); -} - -// Shifting and adding requires fewer instructions than blending, but casting to -// u64 does not work for LMUL < 1. 
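// For example (illustrative): for u32 lanes 0 1 2 3, Reverse2 yields 1 0 3 2,
// which is exactly RotateRight<32> applied to the u64 view of the vector —
// hence the ResizeBitCast to the widened tag below.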
-template -HWY_API VFromD Reverse2(D d, const VFromD v) { - const detail::AdjustSimdTagToMinVecPow2> du64; - return ResizeBitCast(d, RotateRight<32>(ResizeBitCast(du64, v))); -} - -template , HWY_IF_T_SIZE_D(D, 8)> -HWY_API V Reverse2(D /* tag */, const V v) { - const V up = detail::Slide1Up(v); - const V down = detail::Slide1Down(v); - return OddEven(up, down); -} - -// ------------------------------ Reverse4 (TableLookupLanes) - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - const detail::AdjustSimdTagToMinVecPow2> du16; - return ResizeBitCast(d, Reverse2(du16, ResizeBitCast(du16, Reverse2(d, v)))); -} - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - const RebindToUnsigned du; - const auto idx = detail::XorS(detail::Iota0(du), 3); - return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); -} - -// ------------------------------ Reverse8 (TableLookupLanes) - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - const detail::AdjustSimdTagToMinVecPow2> du32; - return ResizeBitCast(d, Reverse2(du32, ResizeBitCast(du32, Reverse4(d, v)))); -} - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - const RebindToUnsigned du; - const auto idx = detail::XorS(detail::Iota0(du), 7); - return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); -} - -// ------------------------------ ReverseBlocks (Reverse, Shuffle01) -template > -HWY_API V ReverseBlocks(D d, V v) { - const detail::AdjustSimdTagToMinVecPow2> du64; - const size_t N = Lanes(du64); - const auto rev = - detail::ReverseSubS(detail::Iota0(du64), static_cast(N - 1)); - // Swap lo/hi u64 within each block - const auto idx = detail::XorS(rev, 1); - return ResizeBitCast(d, TableLookupLanes(ResizeBitCast(du64, v), idx)); -} - -// ------------------------------ Compress - -// RVV supports all lane types natively. -#ifdef HWY_NATIVE_COMPRESS8 -#undef HWY_NATIVE_COMPRESS8 -#else -#define HWY_NATIVE_COMPRESS8 -#endif - -template -struct CompressIsPartition { - enum { value = 0 }; -}; - -#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \ - return __riscv_v##OP##_vm_##CHAR##SEW##LMUL(v, mask, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH(HWY_RVV_COMPRESS, Compress, compress, _ALL) -#undef HWY_RVV_COMPRESS - -// ------------------------------ Expand - -#ifdef HWY_NATIVE_EXPAND -#undef HWY_NATIVE_EXPAND -#else -#define HWY_NATIVE_EXPAND -#endif - -// >= 2-byte lanes: idx lanes will not overflow. -template -HWY_API V Expand(V v, const M mask) { - const DFromV d; - const RebindToUnsigned du; - const auto idx = detail::MaskedIota(du, RebindMask(du, mask)); - const V zero = Zero(d); - return detail::MaskedTableLookupLanes(mask, zero, v, idx); -} - -// 1-byte lanes, LMUL < 8: promote idx to u16. -template , - HWY_IF_POW2_LE_D(D, 2)> -HWY_API V Expand(V v, const M mask) { - const D d; - const Rebind du16; - const auto idx = detail::MaskedIota(du16, RebindMask(du16, mask)); - const V zero = Zero(d); - return detail::MaskedTableLookupLanes16(mask, zero, v, idx); -} - -// 1-byte lanes, max LMUL: unroll 2x. -template , - HWY_IF_POW2_GT_D(DFromV, 2)> -HWY_API V Expand(V v, const M mask) { - const D d; - const Half dh; - const auto v0 = LowerHalf(dh, v); - // TODO(janwas): skip vec<->mask if we can cast masks. 
- const V vmask = VecFromMask(d, mask); - const auto m0 = MaskFromVec(LowerHalf(dh, vmask)); - - // Cannot just use UpperHalf, must shift by the number of inputs consumed. - const size_t count = CountTrue(dh, m0); - const auto v1 = detail::Trunc(detail::SlideDown(v, count)); - const auto m1 = MaskFromVec(UpperHalf(dh, vmask)); - return Combine(d, Expand(v1, m1), Expand(v0, m0)); -} - -// ------------------------------ LoadExpand -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { - return Expand(LoadU(d, unaligned), mask); -} - -// ------------------------------ CompressNot -template -HWY_API V CompressNot(V v, const M mask) { - return Compress(v, Not(mask)); -} - -// ------------------------------ CompressBlocksNot -template -HWY_API V CompressBlocksNot(V v, const M mask) { - return CompressNot(v, mask); -} - -// ------------------------------ CompressStore -template -HWY_API size_t CompressStore(const V v, const M mask, const D d, - TFromD* HWY_RESTRICT unaligned) { - StoreU(Compress(v, mask), d, unaligned); - return CountTrue(d, mask); -} - -// ------------------------------ CompressBlendedStore -template -HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d, - TFromD* HWY_RESTRICT unaligned) { - const size_t count = CountTrue(d, mask); - detail::StoreN(count, Compress(v, mask), d, unaligned); - return count; -} - -// ================================================== COMPARE (2) - -// ------------------------------ FindLastTrue - -template -HWY_API intptr_t FindLastTrue(D d, MFromD m) { - const RebindToSigned di; - const intptr_t fft_rev_idx = - FindFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m)))); - return (fft_rev_idx >= 0) - ? (static_cast(Lanes(d) - 1) - fft_rev_idx) - : intptr_t{-1}; -} - -template -HWY_API size_t FindKnownLastTrue(D d, MFromD m) { - const RebindToSigned di; - const size_t fft_rev_idx = - FindKnownFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m)))); - return Lanes(d) - 1 - fft_rev_idx; -} - -// ------------------------------ ConcatOdd (Compress) - -namespace detail { - -#define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEWD, LMULD) v) { \ - return __riscv_v##OP##_wx_##CHAR##SEW##LMUL(v, kShift, \ - HWY_RVV_AVL(SEWD, SHIFT + 1)); \ - } - -HWY_RVV_FOREACH_U08(HWY_RVV_NARROW, Narrow, nsrl, _EXT) -HWY_RVV_FOREACH_U16(HWY_RVV_NARROW, Narrow, nsrl, _EXT) -HWY_RVV_FOREACH_U32(HWY_RVV_NARROW, Narrow, nsrl, _EXT) -#undef HWY_RVV_NARROW - -} // namespace detail - -// Casting to wider and narrowing is the fastest for < 64-bit lanes. -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - constexpr size_t kBits = sizeof(TFromD) * 8; - const Twice dt; - const RepartitionToWide> dtuw; - const VFromD hl = BitCast(dtuw, Combine(dt, hi, lo)); - return BitCast(d, detail::Narrow(hl)); -} - -// 64-bit: Combine+Compress. -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const Twice dt; - const VFromD hl = Combine(dt, hi, lo); - return LowerHalf(d, Compress(hl, detail::IsOdd(dt))); -} - -// Any type, max LMUL: Compress both, then Combine. 
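// Semantics reminder (illustrative): with lo = l0 l1 l2 l3 and
// hi = h0 h1 h2 h3, ConcatOdd(d, hi, lo) = l1 l3 h1 h3 — the odd lanes of lo
// form the lower half of the result and the odd lanes of hi the upper half,
// which is what Compress with IsOdd followed by Combine produces below.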
-template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const Half dh; - const MFromD is_odd = detail::IsOdd(d); - const VFromD hi_odd = Compress(hi, is_odd); - const VFromD lo_odd = Compress(lo, is_odd); - return Combine(d, LowerHalf(dh, hi_odd), LowerHalf(dh, lo_odd)); -} - -// ------------------------------ ConcatEven (Compress) - -// Casting to wider and narrowing is the fastest for < 64-bit lanes. -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const Twice dt; - const RepartitionToWide> dtuw; - const VFromD hl = BitCast(dtuw, Combine(dt, hi, lo)); - return BitCast(d, detail::Narrow<0>(hl)); -} - -// 64-bit: Combine+Compress. -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const Twice dt; - const VFromD hl = Combine(dt, hi, lo); - return LowerHalf(d, Compress(hl, detail::IsEven(dt))); -} - -// Any type, max LMUL: Compress both, then Combine. -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const Half dh; - const MFromD is_even = detail::IsEven(d); - const VFromD hi_even = Compress(hi, is_even); - const VFromD lo_even = Compress(lo, is_even); - return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even)); -} - -// ================================================== BLOCKWISE - -// ------------------------------ CombineShiftRightBytes -template > -HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) { - const Repartition d8; - const auto hi8 = BitCast(d8, hi); - const auto lo8 = BitCast(d8, lo); - const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes); - const auto lo_down = detail::SlideDown(lo8, kBytes); - const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8); - return BitCast(d, IfThenElse(is_lo, lo_down, hi_up)); -} - -// ------------------------------ CombineShiftRightLanes -template > -HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) { - constexpr size_t kLanesUp = 16 / sizeof(TFromV) - kLanes; - const auto hi_up = detail::SlideUp(hi, hi, kLanesUp); - const auto lo_down = detail::SlideDown(lo, kLanes); - const auto is_lo = detail::FirstNPerBlock(d); - return IfThenElse(is_lo, lo_down, hi_up); -} - -// ------------------------------ Shuffle2301 (ShiftLeft) -template -HWY_API V Shuffle2301(const V v) { - const DFromV d; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - const Repartition du64; - const auto v64 = BitCast(du64, v); - return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64))); -} - -// ------------------------------ Shuffle2103 -template -HWY_API V Shuffle2103(const V v) { - const DFromV d; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - return CombineShiftRightLanes<3>(d, v, v); -} - -// ------------------------------ Shuffle0321 -template -HWY_API V Shuffle0321(const V v) { - const DFromV d; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - return CombineShiftRightLanes<1>(d, v, v); -} - -// ------------------------------ Shuffle1032 -template -HWY_API V Shuffle1032(const V v) { - const DFromV d; - static_assert(sizeof(TFromD) == 4, "Defined for 32-bit types"); - return CombineShiftRightLanes<2>(d, v, v); -} - -// ------------------------------ Shuffle01 -template -HWY_API V Shuffle01(const V v) { - const DFromV d; - static_assert(sizeof(TFromD) == 8, "Defined for 64-bit types"); - return CombineShiftRightLanes<1>(d, v, v); -} - -// ------------------------------ Shuffle0123 -template -HWY_API V Shuffle0123(const V v) { - return Shuffle2301(Shuffle1032(v)); -} - -// ------------------------------ 
TableLookupBytes - -// Extends or truncates a vector to match the given d. -namespace detail { - -template -HWY_INLINE VFromD ChangeLMUL(D /* d */, VFromD v) { - return v; -} - -// LMUL of VFromD < LMUL of V: need to truncate v -template , TFromV>()>* = nullptr, - HWY_IF_POW2_LE_D(DFromV>, DFromV().Pow2() - 1)> -HWY_INLINE VFromD ChangeLMUL(D d, V v) { - const DFromV d_from; - const Half dh_from; - static_assert( - DFromV>().Pow2() < DFromV().Pow2(), - "The LMUL of VFromD must be less than the LMUL of V"); - static_assert( - DFromV>().Pow2() <= DFromV>().Pow2(), - "The LMUL of VFromD must be less than or equal to the LMUL of " - "VFromD"); - return ChangeLMUL(d, Trunc(v)); -} - -// LMUL of VFromD > LMUL of V: need to extend v -template , TFromV>()>* = nullptr, - HWY_IF_POW2_GT_D(DFromV>, DFromV().Pow2())> -HWY_INLINE VFromD ChangeLMUL(D d, V v) { - const DFromV d_from; - const Twice dt_from; - static_assert(DFromV>().Pow2() > DFromV().Pow2(), - "The LMUL of VFromD must be greater than " - "the LMUL of V"); - static_assert( - DFromV>().Pow2() >= DFromV>().Pow2(), - "The LMUL of VFromD must be greater than or equal to the LMUL of " - "VFromD"); - return ChangeLMUL(d, Ext(dt_from, v)); -} - -} // namespace detail - -template -HWY_API VI TableLookupBytes(const VT vt, const VI vi) { - const DFromV dt; // T=table, I=index. - const DFromV di; - const Repartition dt8; - const Repartition di8; - // Required for producing half-vectors with table lookups from a full vector. - // If we instead run at the LMUL of the index vector, lookups into the table - // would be truncated. Thus we run at the larger of the two LMULs and truncate - // the result vector to the original index LMUL. - constexpr int kPow2T = dt8.Pow2(); - constexpr int kPow2I = di8.Pow2(); - const Simd dm8; // m=max - const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt)); - const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi)); - auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8)); - // If the table is shorter, wrap around offsets so they do not reference - // undefined lanes in the newly extended vmt. 
- if (kPow2T < kPow2I) { - offsets = detail::AndS(offsets, static_cast(Lanes(dt8) - 1)); - } - const auto out = TableLookupLanes(vmt, Add(vmi, offsets)); - return BitCast(di, detail::ChangeLMUL(di8, out)); -} - -template -HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) { - const DFromV di; - const Repartition di8; - const auto idx8 = BitCast(di8, idx); - const auto lookup = TableLookupBytes(vt, idx8); - return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup)); -} - -// ------------------------------ TwoTablesLookupLanes - -// TODO(janwas): special-case 8-bit lanes to safely handle VL >= 256 -template -HWY_API VFromD TwoTablesLookupLanes(D d, VFromD a, VFromD b, - VFromD> idx) { - const Twice dt; - const RebindToUnsigned dt_u; - const auto combined_tbl = Combine(dt, b, a); - const auto combined_idx = Combine(dt_u, idx, idx); - return LowerHalf(d, TableLookupLanes(combined_tbl, combined_idx)); -} - -template -HWY_API VFromD TwoTablesLookupLanes(D d, VFromD a, VFromD b, - VFromD> idx) { - const RebindToUnsigned du; - using TU = TFromD; - - const size_t num_of_lanes = Lanes(d); - const auto idx_mod = detail::AndS(idx, static_cast(num_of_lanes - 1)); - const auto sel_a_mask = Ne(idx, idx_mod); // FALSE if a - - const auto a_lookup_result = TableLookupLanes(a, idx_mod); - return detail::MaskedTableLookupLanes(sel_a_mask, a_lookup_result, b, - idx_mod); -} - -template -HWY_API V TwoTablesLookupLanes(V a, V b, - VFromD>> idx) { - const DFromV d; - return TwoTablesLookupLanes(d, a, b, idx); -} - -// ------------------------------ Broadcast -template -HWY_API V Broadcast(const V v) { - const DFromV d; - const RebindToUnsigned du; - HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d)); - auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du)); - if (kLane != 0) { - idx = detail::AddS(idx, kLane); - } - return TableLookupLanes(v, idx); -} - -// ------------------------------ BroadcastLane -#ifdef HWY_NATIVE_BROADCASTLANE -#undef HWY_NATIVE_BROADCASTLANE -#else -#define HWY_NATIVE_BROADCASTLANE -#endif - -namespace detail { - -#define HWY_RVV_BROADCAST_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \ - LMULH, SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t idx) { \ - return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, idx, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH(HWY_RVV_BROADCAST_LANE, BroadcastLane, rgather, _ALL) -#undef HWY_RVV_BROADCAST_LANE - -} // namespace detail - -template -HWY_API V BroadcastLane(V v) { - static_assert(0 <= kLane && kLane < HWY_MAX_LANES_V(V), "Invalid lane"); - return detail::BroadcastLane(v, static_cast(kLane)); -} - -// ------------------------------ InsertBlock -#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT -#undef HWY_NATIVE_BLK_INSERT_EXTRACT -#else -#define HWY_NATIVE_BLK_INSERT_EXTRACT -#endif - -template -HWY_API V InsertBlock(V v, VFromD>> blk_to_insert) { - const DFromV d; - using TU = If<(sizeof(TFromV) == 1 && DFromV().Pow2() >= -2), uint16_t, - MakeUnsigned>>; - using TIdx = If; - - const Repartition du; - const Rebind d_idx; - static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), - "Invalid block index"); - constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU); - - constexpr size_t kBlkByteOffset = - static_cast(kBlockIdx) * kMaxLanesPerBlock; - const auto vu = BitCast(du, v); - const auto vblk = ResizeBitCast(du, blk_to_insert); - const auto vblk_shifted = detail::SlideUp(vblk, vblk, kBlkByteOffset); - const auto insert_mask = RebindMask( - du, 
detail::LtS(detail::SubS(detail::Iota0(d_idx), - static_cast(kBlkByteOffset)), - static_cast(kMaxLanesPerBlock))); - - return BitCast(d, IfThenElse(insert_mask, vblk_shifted, vu)); -} - -// ------------------------------ BroadcastBlock -template , -3)> -HWY_API V BroadcastBlock(V v) { - const DFromV d; - const Repartition du8; - const Rebind du16; - - static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), - "Invalid block index"); - - const auto idx = detail::AddS(detail::AndS(detail::Iota0(du16), uint16_t{15}), - static_cast(kBlockIdx * 16)); - return BitCast(d, detail::TableLookupLanes16(BitCast(du8, v), idx)); -} - -template , -3)> -HWY_API V BroadcastBlock(V v) { - const DFromV d; - using TU = If) == 1, uint16_t, MakeUnsigned>>; - const Repartition du; - - static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), - "Invalid block index"); - constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU); - - const auto idx = detail::AddS( - detail::AndS(detail::Iota0(du), static_cast(kMaxLanesPerBlock - 1)), - static_cast(static_cast(kBlockIdx) * kMaxLanesPerBlock)); - return BitCast(d, TableLookupLanes(BitCast(du, v), idx)); -} - -// ------------------------------ ExtractBlock -template -HWY_API VFromD>> ExtractBlock(V v) { - const DFromV d; - const BlockDFromD d_block; - - static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(), - "Invalid block index"); - constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TFromD); - constexpr size_t kBlkByteOffset = - static_cast(kBlockIdx) * kMaxLanesPerBlock; - - return ResizeBitCast(d_block, detail::SlideDown(v, kBlkByteOffset)); -} - -// ------------------------------ ShiftLeftLanes - -template > -HWY_API V ShiftLeftLanes(const D d, const V v) { - const RebindToSigned di; - const RebindToUnsigned du; - using TI = TFromD; - const auto shifted = detail::SlideUp(v, v, kLanes); - // Match x86 semantics by zeroing lower lanes in 128-bit blocks - const auto idx_mod = - detail::AndS(BitCast(di, detail::Iota0(du)), - static_cast(detail::LanesPerBlock(di) - 1)); - const auto clear = detail::LtS(idx_mod, static_cast(kLanes)); - return IfThenZeroElse(clear, shifted); -} - -template -HWY_API V ShiftLeftLanes(const V v) { - return ShiftLeftLanes(DFromV(), v); -} - -// ------------------------------ ShiftLeftBytes - -template -HWY_API VFromD ShiftLeftBytes(D d, const VFromD v) { - const Repartition d8; - return BitCast(d, ShiftLeftLanes(BitCast(d8, v))); -} - -template -HWY_API V ShiftLeftBytes(const V v) { - return ShiftLeftBytes(DFromV(), v); -} - -// ------------------------------ ShiftRightLanes -template >> -HWY_API V ShiftRightLanes(const Simd d, V v) { - const RebindToSigned di; - const RebindToUnsigned du; - using TI = TFromD; - // For partial vectors, clear upper lanes so we shift in zeros. 
- if (N <= 16 / sizeof(T)) { - v = IfThenElseZero(FirstN(d, N), v); - } - - const auto shifted = detail::SlideDown(v, kLanes); - // Match x86 semantics by zeroing upper lanes in 128-bit blocks - const size_t lpb = detail::LanesPerBlock(di); - const auto idx_mod = - detail::AndS(BitCast(di, detail::Iota0(du)), static_cast(lpb - 1)); - const auto keep = detail::LtS(idx_mod, static_cast(lpb - kLanes)); - return IfThenElseZero(keep, shifted); -} - -// ------------------------------ ShiftRightBytes -template > -HWY_API V ShiftRightBytes(const D d, const V v) { - const Repartition d8; - return BitCast(d, ShiftRightLanes(d8, BitCast(d8, v))); -} - -// ------------------------------ InterleaveLower - -template -HWY_API V InterleaveLower(D d, const V a, const V b) { - static_assert(IsSame, TFromV>(), "D/V mismatch"); - const RebindToUnsigned du; - using TU = TFromD; - const auto i = detail::Iota0(du); - const auto idx_mod = ShiftRight<1>( - detail::AndS(i, static_cast(detail::LanesPerBlock(du) - 1))); - const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i)); - const auto is_even = detail::EqS(detail::AndS(i, 1), 0u); - return IfThenElse(is_even, TableLookupLanes(a, idx), - TableLookupLanes(b, idx)); -} - -template -HWY_API V InterleaveLower(const V a, const V b) { - return InterleaveLower(DFromV(), a, b); -} - -// ------------------------------ InterleaveUpper - -template -HWY_API V InterleaveUpper(const D d, const V a, const V b) { - static_assert(IsSame, TFromV>(), "D/V mismatch"); - const RebindToUnsigned du; - using TU = TFromD; - const size_t lpb = detail::LanesPerBlock(du); - const auto i = detail::Iota0(du); - const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast(lpb - 1))); - const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i)); - const auto idx = detail::AddS(idx_lower, static_cast(lpb / 2)); - const auto is_even = detail::EqS(detail::AndS(i, 1), 0u); - return IfThenElse(is_even, TableLookupLanes(a, idx), - TableLookupLanes(b, idx)); -} - -// ------------------------------ ZipLower - -template >> -HWY_API VFromD ZipLower(DW dw, V a, V b) { - const RepartitionToNarrow dn; - static_assert(IsSame, TFromV>(), "D/V mismatch"); - return BitCast(dw, InterleaveLower(dn, a, b)); -} - -template >> -HWY_API VFromD ZipLower(V a, V b) { - return BitCast(DW(), InterleaveLower(a, b)); -} - -// ------------------------------ ZipUpper -template -HWY_API VFromD ZipUpper(DW dw, V a, V b) { - const RepartitionToNarrow dn; - static_assert(IsSame, TFromV>(), "D/V mismatch"); - return BitCast(dw, InterleaveUpper(dn, a, b)); -} - -// ================================================== REDUCE - -// vector = f(vector, zero_m1) -#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) { \ - return Set(d, \ - GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \ - v, v0, Lanes(d)))); \ - } - -// ------------------------------ SumOfLanes - -namespace detail { -HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL) -} // namespace detail - -template -HWY_API VFromD SumOfLanes(D d, const VFromD v) { - const auto v0 = Zero(ScalableTag>()); // always m1 - return detail::RedSum(d, v, v0); -} - -template -HWY_API TFromD ReduceSum(D d, const VFromD v) { - return GetLane(SumOfLanes(d, v)); -} - -// ------------------------------ MinOfLanes -namespace 
detail { -HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL) -} // namespace detail - -template -HWY_API VFromD MinOfLanes(D d, const VFromD v) { - using T = TFromD; - const ScalableTag d1; // always m1 - const auto neutral = Set(d1, HighestValue()); - return detail::RedMin(d, v, neutral); -} - -// ------------------------------ MaxOfLanes -namespace detail { -HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL) -HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL) -HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL) -} // namespace detail - -template -HWY_API VFromD MaxOfLanes(D d, const VFromD v) { - using T = TFromD; - const ScalableTag d1; // always m1 - const auto neutral = Set(d1, LowestValue()); - return detail::RedMax(d, v, neutral); -} - -#undef HWY_RVV_REDUCE - -// ================================================== Ops with dependencies - -// ------------------------------ LoadInterleaved2 - -// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. -#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#else -#define HWY_NATIVE_LOAD_STORE_INTERLEAVED -#endif - -// Requires Clang 16+, GCC 14+; otherwise emulated in generic_ops-inl.h. -#if HWY_HAVE_TUPLE - -#define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME##2(HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x2_##CHAR##SEW##LMUL(tup, \ - kIndex); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME##3(HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x3_##CHAR##SEW##LMUL(tup, \ - kIndex); \ - } \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME##4(HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x4_##CHAR##SEW##LMUL(tup, \ - kIndex); \ - } - -HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _LE2) -#undef HWY_RVV_GET - -#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) NAME##2( \ - HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x2( \ - tup, kIndex, v); \ - } \ - template \ - HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \ - HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x3( \ - tup, kIndex, v); \ - } \ - template \ - HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) NAME##4( \ - HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x4( \ - tup, kIndex, v); \ - } - -HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _LE2) -#undef HWY_RVV_SET - -// RVV does not provide vcreate, so implement using Set. 
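// Usage sketch (illustrative; Create2/Get2 are generated just below and used
// by the interleaved loads/stores in this section):
//
//   const ScalableTag<uint8_t> d;
//   auto tup = Create2(d, Zero(d), Set(d, 1));
//   auto v0 = Get2<0>(tup);  // all-zero vector
//   auto v1 = Get2<1>(tup);  // all-one vector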
-#define HWY_RVV_CREATE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) \ - NAME##2(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, \ - HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1) { \ - HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup{}; \ - tup = Set2<0>(tup, v0); \ - tup = Set2<1>(tup, v1); \ - return tup; \ - } \ - template \ - HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, HWY_RVV_V(BASE, SEW, LMUL) v0, \ - HWY_RVV_V(BASE, SEW, LMUL) v1, HWY_RVV_V(BASE, SEW, LMUL) v2) { \ - HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup{}; \ - tup = Set3<0>(tup, v0); \ - tup = Set3<1>(tup, v1); \ - tup = Set3<2>(tup, v2); \ - return tup; \ - } \ - template \ - HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) \ - NAME##4(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, \ - HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ - HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3) { \ - HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup{}; \ - tup = Set4<0>(tup, v0); \ - tup = Set4<1>(tup, v1); \ - tup = Set4<2>(tup, v2); \ - tup = Set4<3>(tup, v3); \ - return tup; \ - } - -HWY_RVV_FOREACH(HWY_RVV_CREATE, Create, xx, _LE2_VIRT) -#undef HWY_RVV_CREATE - -template -using Vec2 = decltype(Create2(D(), Zero(D()), Zero(D()))); -template -using Vec3 = decltype(Create3(D(), Zero(D()), Zero(D()), Zero(D()))); -template -using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D()))); - -#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ - HWY_RVV_V(BASE, SEW, LMUL) & v0, \ - HWY_RVV_V(BASE, SEW, LMUL) & v1) { \ - const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = \ - __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, Lanes(d)); \ - v0 = Get2<0>(tup); \ - v1 = Get2<1>(tup); \ - } -// Segments are limited to 8 registers, so we can only go up to LMUL=2. -HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT) -#undef HWY_RVV_LOAD2 - -// ------------------------------ LoadInterleaved3 - -#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ - HWY_RVV_V(BASE, SEW, LMUL) & v0, \ - HWY_RVV_V(BASE, SEW, LMUL) & v1, \ - HWY_RVV_V(BASE, SEW, LMUL) & v2) { \ - const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = \ - __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, Lanes(d)); \ - v0 = Get3<0>(tup); \ - v1 = Get3<1>(tup); \ - v2 = Get3<2>(tup); \ - } -// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
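// Usage sketch (illustrative; `rgb` is a hypothetical pointer to packed
// R0 G0 B0 R1 G1 B1 ... bytes):
//
//   const ScalableTag<uint8_t> d;
//   Vec<decltype(d)> r, g, b;
//   LoadInterleaved3(d, rgb, r, g, b);  // r = R0 R1 ..., g = G0 G1 ..., etc.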
-HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT) -#undef HWY_RVV_LOAD3 - -// ------------------------------ LoadInterleaved4 - -#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME( \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \ - HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \ - HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \ - const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = \ - __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, Lanes(d)); \ - v0 = Get4<0>(tup); \ - v1 = Get4<1>(tup); \ - v2 = Get4<2>(tup); \ - v3 = Get4<3>(tup); \ - } -// Segments are limited to 8 registers, so we can only go up to LMUL=2. -HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT) -#undef HWY_RVV_LOAD4 - -// ------------------------------ StoreInterleaved2 - -#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \ - HWY_RVV_V(BASE, SEW, LMUL) v1, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ - const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = Create2(d, v0, v1); \ - __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, tup, Lanes(d)); \ - } -// Segments are limited to 8 registers, so we can only go up to LMUL=2. -HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT) -#undef HWY_RVV_STORE2 - -// ------------------------------ StoreInterleaved3 - -#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME( \ - HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ - HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ - const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = Create3(d, v0, v1, v2); \ - __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, tup, Lanes(d)); \ - } -// Segments are limited to 8 registers, so we can only go up to LMUL=2. -HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT) -#undef HWY_RVV_STORE3 - -// ------------------------------ StoreInterleaved4 - -#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \ - MLEN, NAME, OP) \ - template \ - HWY_API void NAME( \ - HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \ - HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \ - const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = Create4(d, v0, v1, v2, v3); \ - __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, tup, Lanes(d)); \ - } -// Segments are limited to 8 registers, so we can only go up to LMUL=2. 
-HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT) -#undef HWY_RVV_STORE4 - -#else // !HWY_HAVE_TUPLE - -template > -HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1) { - const VFromD A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] - const VFromD B = LoadU(d, unaligned + Lanes(d)); - v0 = ConcatEven(d, B, A); - v1 = ConcatOdd(d, B, A); -} - -namespace detail { -#define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) \ - NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \ - p, static_cast(stride), Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_LOAD_STRIDED, LoadStrided, lse, _ALL_VIRT) -#undef HWY_RVV_LOAD_STRIDED -} // namespace detail - -template > -HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2) { - // Offsets are bytes, and this is not documented. - v0 = detail::LoadStrided(d, unaligned + 0, 3 * sizeof(T)); - v1 = detail::LoadStrided(d, unaligned + 1, 3 * sizeof(T)); - v2 = detail::LoadStrided(d, unaligned + 2, 3 * sizeof(T)); -} - -template > -HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& v0, VFromD& v1, VFromD& v2, - VFromD& v3) { - // Offsets are bytes, and this is not documented. - v0 = detail::LoadStrided(d, unaligned + 0, 4 * sizeof(T)); - v1 = detail::LoadStrided(d, unaligned + 1, 4 * sizeof(T)); - v2 = detail::LoadStrided(d, unaligned + 2, 4 * sizeof(T)); - v3 = detail::LoadStrided(d, unaligned + 3, 4 * sizeof(T)); -} - -// Not 64-bit / max LMUL: interleave via promote, slide, OddEven. -template , HWY_IF_NOT_T_SIZE_D(D, 8), - HWY_IF_POW2_LE_D(D, 2)> -HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, - T* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - const Twice> duw; - const Twice dt; - // Interleave with zero by promoting to wider (unsigned) type. - const VFromD w0 = BitCast(dt, PromoteTo(duw, BitCast(du, v0))); - const VFromD w1 = BitCast(dt, PromoteTo(duw, BitCast(du, v1))); - // OR second vector into the zero-valued lanes (faster than OddEven). - StoreU(Or(w0, detail::Slide1Up(w1)), dt, unaligned); -} - -// Can promote, max LMUL: two half-length -template , HWY_IF_NOT_T_SIZE_D(D, 8), - HWY_IF_POW2_GT_D(D, 2)> -HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, - T* HWY_RESTRICT unaligned) { - const Half dh; - StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), d, unaligned); - StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), d, - unaligned + Lanes(d)); -} - -namespace detail { -#define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \ - HWY_RVV_D(BASE, SEW, N, SHIFT) d, \ - HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \ - return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \ - p, static_cast(stride), v, Lanes(d)); \ - } -HWY_RVV_FOREACH(HWY_RVV_STORE_STRIDED, StoreStrided, sse, _ALL_VIRT) -#undef HWY_RVV_STORE_STRIDED -} // namespace detail - -// 64-bit: strided -template , HWY_IF_T_SIZE_D(D, 8)> -HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, - T* HWY_RESTRICT unaligned) { - // Offsets are bytes, and this is not documented. 
- detail::StoreStrided(v0, d, unaligned + 0, 2 * sizeof(T)); - detail::StoreStrided(v1, d, unaligned + 1, 2 * sizeof(T)); -} - -template > -HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, - T* HWY_RESTRICT unaligned) { - // Offsets are bytes, and this is not documented. - detail::StoreStrided(v0, d, unaligned + 0, 3 * sizeof(T)); - detail::StoreStrided(v1, d, unaligned + 1, 3 * sizeof(T)); - detail::StoreStrided(v2, d, unaligned + 2, 3 * sizeof(T)); -} - -template > -HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, - VFromD v3, D d, T* HWY_RESTRICT unaligned) { - // Offsets are bytes, and this is not documented. - detail::StoreStrided(v0, d, unaligned + 0, 4 * sizeof(T)); - detail::StoreStrided(v1, d, unaligned + 1, 4 * sizeof(T)); - detail::StoreStrided(v2, d, unaligned + 2, 4 * sizeof(T)); - detail::StoreStrided(v3, d, unaligned + 3, 4 * sizeof(T)); -} - -#endif // HWY_HAVE_TUPLE - -// ------------------------------ ResizeBitCast - -template -HWY_API VFromD ResizeBitCast(D /*d*/, FromV v) { - const DFromV d_from; - const Repartition du8_from; - const DFromV> d_to; - const Repartition du8_to; - return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v))); -} - -// ------------------------------ PopulationCount (ShiftRight) - -// Handles LMUL < 2 or capped vectors, which generic_ops-inl cannot. -template , HWY_IF_U8_D(D), - hwy::EnableIf* = nullptr> -HWY_API V PopulationCount(V v) { - // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 - v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55)); - v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33)); - return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F); -} - -// ------------------------------ LoadDup128 - -template -HWY_API VFromD LoadDup128(D d, const TFromD* const HWY_RESTRICT p) { - const RebindToUnsigned du; - - // Make sure that no more than 16 bytes are loaded from p - constexpr int kLoadPow2 = d.Pow2(); - constexpr size_t kMaxLanesToLoad = - HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD)); - constexpr size_t kLoadN = D::template NewN(); - const Simd, kLoadN, kLoadPow2> d_load; - static_assert(d_load.MaxBytes() <= 16, - "d_load.MaxBytes() <= 16 must be true"); - static_assert((d.MaxBytes() < 16) || (d_load.MaxBytes() == 16), - "d_load.MaxBytes() == 16 must be true if d.MaxBytes() >= 16 is " - "true"); - static_assert((d.MaxBytes() >= 16) || (d_load.MaxBytes() == d.MaxBytes()), - "d_load.MaxBytes() == d.MaxBytes() must be true if " - "d.MaxBytes() < 16 is true"); - - const VFromD loaded = Load(d_load, p); - if (d.MaxBytes() <= 16) return loaded; - - // idx must be unsigned for TableLookupLanes. - using TU = TFromD; - const TU mask = static_cast(detail::LanesPerBlock(d) - 1); - // Broadcast the first block. - const VFromD> idx = detail::AndS(detail::Iota0(du), mask); - // Safe even for 8-bit lanes because indices never exceed 15. - return TableLookupLanes(loaded, idx); -} - -// ------------------------------ LoadMaskBits - -// Support all combinations of T and SHIFT(LMUL) without explicit overloads for -// each. First overload for MLEN=1..64. -namespace detail { - -// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN -// increases with lane size and decreases for increasing LMUL. Cap at 64, the -// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL -// e.g. vuint16mf8_t: (8*2 << 3) == 128. 
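// A standalone scalar sketch of the SWAR bit-count used by PopulationCount
// above (cf. the cited paper); the function name is illustrative only.
#include <cstdint>

static inline uint8_t PopCount8_Sketch(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));           // 2-bit sums
  v = static_cast<uint8_t>((v & 0x33) + ((v >> 2) & 0x33));  // 4-bit sums
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);        // byte total
}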
-template -using MaskTag = hwy::SizeTag), -D().Pow2()))>; - -#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ - HWY_INLINE HWY_RVV_M(MLEN) \ - NAME(hwy::SizeTag /* tag */, const uint8_t* bits, size_t N) { \ - return __riscv_v##OP##_v_b##MLEN(bits, N); \ - } -HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, lm) -#undef HWY_RVV_LOAD_MASK_BITS -} // namespace detail - -template > -HWY_API auto LoadMaskBits(D d, const uint8_t* bits) - -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) { - return detail::LoadMaskBits(MT(), bits, Lanes(d)); -} - -// ------------------------------ StoreMaskBits -#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \ - const size_t N = Lanes(d); \ - __riscv_v##OP##_v_b##MLEN(bits, m, N); \ - /* Non-full byte, need to clear the undefined upper bits. */ \ - /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \ - constexpr bool kLessThan8 = \ - detail::ScaleByPower(16 / sizeof(TFromD), d.Pow2()) < 8; \ - if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \ - const int mask = (1 << N) - 1; \ - bits[0] = static_cast(bits[0] & mask); \ - } \ - return (N + 7) / 8; \ - } -HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, sm) -#undef HWY_RVV_STORE_MASK_BITS - -// ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits) - -template -HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { - return Compress(v, LoadMaskBits(DFromV(), bits)); -} - -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); -} - -// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp) - -// Disallow for 8-bit because Iota is likely to overflow. -template -HWY_API MFromD FirstN(const D d, const size_t n) { - const RebindToUnsigned du; - using TU = TFromD; - return RebindMask(d, detail::LtS(detail::Iota0(du), static_cast(n))); -} - -template -HWY_API MFromD FirstN(const D d, const size_t n) { - const auto zero = Zero(d); - const auto one = Set(d, 1); - return Eq(detail::SlideUp(one, zero, n), one); -} - -// ------------------------------ Neg (Sub) - -template -HWY_API V Neg(const V v) { - return detail::ReverseSubS(v, 0); -} - -// vector = f(vector), but argument is repeated -#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \ - return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \ - HWY_RVV_AVL(SEW, SHIFT)); \ - } - -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL) - -#if !HWY_HAVE_FLOAT16 - -template )> // float16_t -HWY_API V Neg(V v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask()))); -} - -#endif // !HWY_HAVE_FLOAT16 - -// ------------------------------ Abs (Max, Neg) - -template -HWY_API V Abs(const V v) { - return Max(v, Neg(v)); -} - -HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL) - -#undef HWY_RVV_RETV_ARGV2 - -// ------------------------------ AbsDiff (Abs, Sub) -template -HWY_API V AbsDiff(const V a, const V b) { - return Abs(Sub(a, b)); -} - -// ------------------------------ Round (NearestInt, ConvertTo, CopySign) - -// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have -// a dedicated instruction for that. 
Rounding to integer and converting back to -// float is correct except when the input magnitude is large, in which case the -// input was already an integer (because mantissa >> exponent is zero). - -namespace detail { -enum RoundingModes { kNear, kTrunc, kDown, kUp }; - -template -HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) { - return detail::LtS(Abs(v), MantissaEnd>()); -} - -} // namespace detail - -template -HWY_API V Round(const V v) { - const DFromV df; - - const auto integer = NearestInt(v); // round using current mode - const auto int_f = ConvertTo(df, integer); - - return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); -} - -// ------------------------------ Trunc (ConvertTo) -template -HWY_API V Trunc(const V v) { - const DFromV df; - const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - - return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); -} - -// ------------------------------ Ceil -template -HWY_API V Ceil(const V v) { - asm volatile("fsrm %0" ::"r"(detail::kUp)); - const auto ret = Round(v); - asm volatile("fsrm %0" ::"r"(detail::kNear)); - return ret; -} - -// ------------------------------ Floor -template -HWY_API V Floor(const V v) { - asm volatile("fsrm %0" ::"r"(detail::kDown)); - const auto ret = Round(v); - asm volatile("fsrm %0" ::"r"(detail::kNear)); - return ret; -} - -// ------------------------------ Floating-point classification (Ne) - -// vfclass does not help because it would require 3 instructions (to AND and -// then compare the bits), whereas these are just 1-3 integer instructions. - -template -HWY_API MFromD> IsNaN(const V v) { - return Ne(v, v); -} - -template > -HWY_API MFromD IsInf(const V v) { - const D d; - const RebindToSigned di; - using T = TFromD; - const VFromD vi = BitCast(di, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2())); -} - -// Returns whether normal/subnormal/zero. -template > -HWY_API MFromD IsFinite(const V v) { - const D d; - const RebindToUnsigned du; - const RebindToSigned di; // cheaper than unsigned comparison - using T = TFromD; - const VFromD vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, then right so we can compare with the - // max exponent (cannot compare with MaxExponentTimes2 directly because it is - // negative and non-negative floats would be greater). 
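// A scalar sketch of the Round/Trunc strategy above: convert to integer and
// back, but only while |v| < 2^23 (MantissaEnd for float); larger magnitudes
// are already integral. Illustrative only, not part of this header.
#include <cmath>
#include <cstdint>

static inline float Round_Sketch(float v) {
  const float kMantissaEnd = 8388608.0f;         // 2^23
  if (!(std::fabs(v) < kMantissaEnd)) return v;  // huge or NaN: unchanged
  const int32_t i = static_cast<int32_t>(std::nearbyint(v));  // current mode
  const float int_f = static_cast<float>(i);
  return std::copysign(int_f, v);  // keep -0 when negative inputs round to 0
}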
- const VFromD exp = - BitCast(di, ShiftRight() + 1>(Add(vu, vu))); - return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField())); -} - -// ------------------------------ Iota (ConvertTo) - -template -HWY_API VFromD Iota(const D d, TFromD first) { - return detail::AddS(detail::Iota0(d), first); -} - -template -HWY_API VFromD Iota(const D d, TFromD first) { - const RebindToUnsigned du; - return detail::AddS(BitCast(d, detail::Iota0(du)), first); -} - -template -HWY_API VFromD Iota(const D d, TFromD first) { - const RebindToUnsigned du; - const RebindToSigned di; - return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first); -} - -// ------------------------------ MulEven/Odd (Mul, OddEven) - -template , class DW = RepartitionToWide> -HWY_API VFromD MulEven(const V a, const V b) { - const auto lo = Mul(a, b); - const auto hi = detail::MulHigh(a, b); - return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo)); -} - -template , class DW = RepartitionToWide> -HWY_API VFromD MulOdd(const V a, const V b) { - const auto lo = Mul(a, b); - const auto hi = detail::MulHigh(a, b); - return BitCast(DW(), OddEven(hi, detail::Slide1Down(lo))); -} - -// There is no 64x64 vwmul. -template -HWY_INLINE V MulEven(const V a, const V b) { - const auto lo = Mul(a, b); - const auto hi = detail::MulHigh(a, b); - return OddEven(detail::Slide1Up(hi), lo); -} - -template -HWY_INLINE V MulOdd(const V a, const V b) { - const auto lo = Mul(a, b); - const auto hi = detail::MulHigh(a, b); - return OddEven(hi, detail::Slide1Down(lo)); -} - -// ------------------------------ ReorderDemote2To (OddEven, Combine) - -template -HWY_API VFromD> ReorderDemote2To( - Simd dbf16, - VFromD> a, - VFromD> b) { - const RebindToUnsigned du16; - const RebindToUnsigned> du32; - const VFromD b_in_even = ShiftRight<16>(BitCast(du32, b)); - return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); -} - -// If LMUL is not the max, Combine first to avoid another DemoteTo. -template ), - HWY_IF_POW2_LE_D(DN, 2), class V, HWY_IF_SIGNED_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - class V2 = VFromD, DN>>, - hwy::EnableIf().Pow2() == DFromV().Pow2()>* = nullptr> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const Rebind, DN> dt; - const VFromD ab = Combine(dt, b, a); - return DemoteTo(dn, ab); -} - -template ) * 2), - class V2 = VFromD, DN>>, - hwy::EnableIf().Pow2() == DFromV().Pow2()>* = nullptr> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const Rebind, DN> dt; - const VFromD ab = Combine(dt, b, a); - return DemoteTo(dn, ab); -} - -// Max LMUL: must DemoteTo first, then Combine. -template ), - HWY_IF_POW2_GT_D(DN, 2), class V, HWY_IF_SIGNED_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - class V2 = VFromD, DN>>, - hwy::EnableIf().Pow2() == DFromV().Pow2()>* = nullptr> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const Half dnh; - const VFromD demoted_a = DemoteTo(dnh, a); - const VFromD demoted_b = DemoteTo(dnh, b); - return Combine(dn, demoted_b, demoted_a); -} - -template ) * 2), - class V2 = VFromD, DN>>, - hwy::EnableIf().Pow2() == DFromV().Pow2()>* = nullptr> -HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { - const Half dnh; - const VFromD demoted_a = DemoteTo(dnh, a); - const VFromD demoted_b = DemoteTo(dnh, b); - return Combine(dn, demoted_b, demoted_a); -} - -// If LMUL is not the max, Combine first to avoid another DemoteTo. 
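// A scalar sketch of how MulEven/MulOdd above assemble a double-wide product
// from the separately computed low (Mul) and high (MulHigh) halves;
// illustrative name only.
#include <cstdint>

static inline uint64_t WideProduct_Sketch(uint32_t a, uint32_t b) {
  const uint64_t full = static_cast<uint64_t>(a) * b;
  const uint32_t lo = static_cast<uint32_t>(full);        // Mul(a, b)
  const uint32_t hi = static_cast<uint32_t>(full >> 32);  // MulHigh(a, b)
  // OddEven(Slide1Up(hi), lo) places lo in the lower and hi in the upper half
  // of the widened lane:
  return (static_cast<uint64_t>(hi) << 32) | lo;
}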
-template ), - class V2 = VFromD, DN>>, - hwy::EnableIf().Pow2() == DFromV().Pow2()>* = nullptr> -HWY_API VFromD OrderedDemote2To(DN dn, V a, V b) { - const Rebind, DN> dt; - const VFromD ab = Combine(dt, b, a); - return DemoteTo(dn, ab); -} - -// Max LMUL: must DemoteTo first, then Combine. -template ), - class V2 = VFromD, DN>>, - hwy::EnableIf().Pow2() == DFromV().Pow2()>* = nullptr> -HWY_API VFromD OrderedDemote2To(DN dn, V a, V b) { - const Half dnh; - const RebindToUnsigned dn_u; - const RebindToUnsigned dnh_u; - const auto demoted_a = BitCast(dnh_u, DemoteTo(dnh, a)); - const auto demoted_b = BitCast(dnh_u, DemoteTo(dnh, b)); - return BitCast(dn, Combine(dn_u, demoted_b, demoted_a)); -} - -template ), class V, - HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - class V2 = VFromD, DN>>, - hwy::EnableIf().Pow2() == DFromV().Pow2()>* = nullptr> -HWY_API VFromD OrderedDemote2To(DN dn, V a, V b) { - return ReorderDemote2To(dn, a, b); -} - -// ------------------------------ WidenMulPairwiseAdd - -template >> -HWY_API VFromD WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { - const RebindToUnsigned du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 - // Using shift/and instead of Zip leads to the odd/even order that - // RearrangeToOddPlusEven prefers. - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), - Mul(BitCast(df32, ao), BitCast(df32, bo))); -} - -template -HWY_API VFromD WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) { - using VI32 = VFromD; - // Manual sign extension requires two shifts for even lanes. - const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a))); - const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b))); - const VI32 ao = ShiftRight<16>(BitCast(d32, a)); - const VI32 bo = ShiftRight<16>(BitCast(d32, b)); - return Add(Mul(ae, be), Mul(ao, bo)); -} - -template -HWY_API VFromD WidenMulPairwiseAdd(D du32, VI16 a, VI16 b) { - using VU32 = VFromD; - // Manual sign extension requires two shifts for even lanes. - const VU32 ae = detail::AndS(BitCast(du32, a), uint32_t{0x0000FFFFu}); - const VU32 be = detail::AndS(BitCast(du32, b), uint32_t{0x0000FFFFu}); - const VU32 ao = ShiftRight<16>(BitCast(du32, a)); - const VU32 bo = ShiftRight<16>(BitCast(du32, b)); - return Add(Mul(ae, be), Mul(ao, bo)); -} - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) - -namespace detail { - -// Non-overloaded wrapper function so we can define DF32 in template args. -template , - class VF32 = VFromD, - class DBF16 = Repartition>> -HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd df32, - VFromD a, VFromD b, - const VF32 sum0, VF32& sum1) { - const RebindToUnsigned du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 - // Using shift/and instead of Zip leads to the odd/even order that - // RearrangeToOddPlusEven prefers. 
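// A scalar sketch of the "bfloat16 is the upper half of f32" layout relied on
// here: a bf16 value promotes to f32 by shifting its 16 bits into the upper
// half, which is why masking with 0xFFFF0000 / shifting by 16 extracts the
// odd/even operands. Illustrative name only.
#include <cstdint>
#include <cstring>

static inline float F32FromBF16_Sketch(uint16_t bf16_bits) {
  const uint32_t u32 = static_cast<uint32_t>(bf16_bits) << 16;
  float f;
  std::memcpy(&f, &u32, sizeof(f));  // reinterpret the bit pattern
  return f;
}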
- const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); -} - -#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \ - SHIFT, MLEN, NAME, OP) \ - template \ - HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \ - HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \ - HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \ - return __riscv_v##OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \ - } - -HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmacc_vv_, _EXT_VIRT) -HWY_RVV_FOREACH_U16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmaccu_vv_, _EXT_VIRT) -#undef HWY_RVV_WIDEN_MACC - -// If LMUL is not the max, we can WidenMul first (3 instructions). -template , - class D16 = RepartitionToNarrow> -HWY_API VFromD ReorderWidenMulAccumulateI16(D32 d32, VFromD a, - VFromD b, const V32 sum0, - V32& sum1) { - const Twice d32t; - using V32T = VFromD; - V32T sum = Combine(d32t, sum1, sum0); - sum = detail::WidenMulAcc(d32t, sum, a, b); - sum1 = UpperHalf(d32, sum); - return LowerHalf(d32, sum); -} - -// Max LMUL: must LowerHalf first (4 instructions). -template , - class D16 = RepartitionToNarrow> -HWY_API VFromD ReorderWidenMulAccumulateI16(D32 d32, VFromD a, - VFromD b, const V32 sum0, - V32& sum1) { - const Half d16h; - using V16H = VFromD; - const V16H a0 = LowerHalf(d16h, a); - const V16H a1 = UpperHalf(d16h, a); - const V16H b0 = LowerHalf(d16h, b); - const V16H b1 = UpperHalf(d16h, b); - sum1 = detail::WidenMulAcc(d32, sum1, a1, b1); - return detail::WidenMulAcc(d32, sum0, a0, b0); -} - -// If LMUL is not the max, we can WidenMul first (3 instructions). -template , - class D16 = RepartitionToNarrow> -HWY_API VFromD ReorderWidenMulAccumulateU16(D32 d32, VFromD a, - VFromD b, const V32 sum0, - V32& sum1) { - const Twice d32t; - using V32T = VFromD; - V32T sum = Combine(d32t, sum1, sum0); - sum = detail::WidenMulAcc(d32t, sum, a, b); - sum1 = UpperHalf(d32, sum); - return LowerHalf(d32, sum); -} - -// Max LMUL: must LowerHalf first (4 instructions). -template , - class D16 = RepartitionToNarrow> -HWY_API VFromD ReorderWidenMulAccumulateU16(D32 d32, VFromD a, - VFromD b, const V32 sum0, - V32& sum1) { - const Half d16h; - using V16H = VFromD; - const V16H a0 = LowerHalf(d16h, a); - const V16H a1 = UpperHalf(d16h, a); - const V16H b0 = LowerHalf(d16h, b); - const V16H b1 = UpperHalf(d16h, b); - sum1 = detail::WidenMulAcc(d32, sum1, a1, b1); - return detail::WidenMulAcc(d32, sum0, a0, b0); -} - -} // namespace detail - -template -HWY_API VW ReorderWidenMulAccumulate(Simd d32, VN a, VN b, - const VW sum0, VW& sum1) { - return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1); -} - -template -HWY_API VW ReorderWidenMulAccumulate(Simd d32, VN a, VN b, - const VW sum0, VW& sum1) { - return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1); -} - -template -HWY_API VW ReorderWidenMulAccumulate(Simd d32, VN a, VN b, - const VW sum0, VW& sum1) { - return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1); -} - -// ------------------------------ RearrangeToOddPlusEven - -template // vint32_t* -HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { - // vwmacc doubles LMUL, so we require a pairwise sum here. 
This op is - // expected to be less frequent than ReorderWidenMulAccumulate, hence it's - // preferable to do the extra work here rather than do manual odd/even - // extraction there. - const DFromV di32; - const RebindToUnsigned du32; - const Twice di32x2; - const RepartitionToWide di64x2; - const RebindToUnsigned du64x2; - const auto combined = BitCast(di64x2, Combine(di32x2, sum1, sum0)); - // Isolate odd/even int32 in int64 lanes. - const auto even = ShiftRight<32>(ShiftLeft<32>(combined)); // sign extend - const auto odd = ShiftRight<32>(combined); - return BitCast(di32, TruncateTo(du32, BitCast(du64x2, Add(even, odd)))); -} - -// For max LMUL, we cannot Combine again and instead manually unroll. -HWY_API vint32m8_t RearrangeToOddPlusEven(vint32m8_t sum0, vint32m8_t sum1) { - const DFromV d; - const Half dh; - const vint32m4_t lo = - RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0)); - const vint32m4_t hi = - RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1)); - return Combine(d, hi, lo); -} - -template // vuint32_t* -HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { - // vwmacc doubles LMUL, so we require a pairwise sum here. This op is - // expected to be less frequent than ReorderWidenMulAccumulate, hence it's - // preferable to do the extra work here rather than do manual odd/even - // extraction there. - const DFromV du32; - const Twice du32x2; - const RepartitionToWide du64x2; - const auto combined = BitCast(du64x2, Combine(du32x2, sum1, sum0)); - // Isolate odd/even int32 in int64 lanes. - const auto even = detail::AndS(combined, uint64_t{0xFFFFFFFFu}); - const auto odd = ShiftRight<32>(combined); - return TruncateTo(du32, Add(even, odd)); -} - -// For max LMUL, we cannot Combine again and instead manually unroll. -HWY_API vuint32m8_t RearrangeToOddPlusEven(vuint32m8_t sum0, vuint32m8_t sum1) { - const DFromV d; - const Half dh; - const vuint32m4_t lo = - RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0)); - const vuint32m4_t hi = - RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1)); - return Combine(d, hi, lo); -} - -template // vfloat* -HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { - return Add(sum0, sum1); // invariant already holds -} - -// ------------------------------ Lt128 -template -HWY_INLINE MFromD Lt128(D d, const VFromD a, const VFromD b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - // Truth table of Eq and Compare for Hi and Lo u64. - // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) - // =H =L cH cL | out = cH | (=H & cL) - // 0 0 0 0 | 0 - // 0 0 0 1 | 0 - // 0 0 1 0 | 1 - // 0 0 1 1 | 1 - // 0 1 0 0 | 0 - // 0 1 0 1 | 0 - // 0 1 1 0 | 1 - // 1 0 0 0 | 0 - // 1 0 0 1 | 1 - // 1 1 0 0 | 0 - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - // Shift leftward so L can influence H. - const VFromD ltLx = detail::Slide1Up(ltHL); - const VFromD vecHx = OrAnd(ltHL, eqHL, ltLx); - // Replicate H to its neighbor. - return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx))); -} - -// ------------------------------ Lt128Upper -template -HWY_INLINE MFromD Lt128Upper(D d, const VFromD a, const VFromD b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - const VFromD down = detail::Slide1Down(ltHL); - // b(267743505): Clang compiler bug, workaround is DoNotOptimize - asm volatile("" : : "r,m"(GetLane(down)) : "memory"); - // Replicate H to its neighbor. 
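// A scalar sketch of the Lt128 logic above for a 128-bit value held as
// (hi, lo) u64 halves: out = cH | (=H & cL), exactly the truth table shown.
#include <cstdint>

static inline bool Lt128_Sketch(uint64_t a_hi, uint64_t a_lo,
                                uint64_t b_hi, uint64_t b_lo) {
  const bool ltH = a_hi < b_hi;   // cH
  const bool eqH = a_hi == b_hi;  // =H
  const bool ltL = a_lo < b_lo;   // cL
  return ltH || (eqH && ltL);
}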
- return MaskFromVec(OddEven(ltHL, down)); -} - -// ------------------------------ Eq128 -template -HWY_INLINE MFromD Eq128(D d, const VFromD a, const VFromD b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - const VFromD eqLH = Reverse2(d, eqHL); - const VFromD eq = And(eqHL, eqLH); - // b(267743505): Clang compiler bug, workaround is DoNotOptimize - asm volatile("" : : "r,m"(GetLane(eq)) : "memory"); - return MaskFromVec(eq); -} - -// ------------------------------ Eq128Upper -template -HWY_INLINE MFromD Eq128Upper(D d, const VFromD a, const VFromD b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - // Replicate H to its neighbor. - return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL))); -} - -// ------------------------------ Ne128 -template -HWY_INLINE MFromD Ne128(D d, const VFromD a, const VFromD b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const VFromD neHL = VecFromMask(d, Ne(a, b)); - const VFromD neLH = Reverse2(d, neHL); - // b(267743505): Clang compiler bug, workaround is DoNotOptimize - asm volatile("" : : "r,m"(GetLane(neLH)) : "memory"); - return MaskFromVec(Or(neHL, neLH)); -} - -// ------------------------------ Ne128Upper -template -HWY_INLINE MFromD Ne128Upper(D d, const VFromD a, const VFromD b) { - static_assert(IsSame, uint64_t>(), "D must be u64"); - const VFromD neHL = VecFromMask(d, Ne(a, b)); - const VFromD down = detail::Slide1Down(neHL); - // b(267743505): Clang compiler bug, workaround is DoNotOptimize - asm volatile("" : : "r,m"(GetLane(down)) : "memory"); - // Replicate H to its neighbor. - return MaskFromVec(OddEven(neHL, down)); -} - -// ------------------------------ Min128, Max128 (Lt128) - -template -HWY_INLINE VFromD Min128(D /* tag */, const VFromD a, const VFromD b) { - const VFromD aXH = detail::Slide1Down(a); - const VFromD bXH = detail::Slide1Down(b); - const VFromD minHL = Min(a, b); - const MFromD ltXH = Lt(aXH, bXH); - const MFromD eqXH = Eq(aXH, bXH); - // If the upper lane is the decider, take lo from the same reg. - const VFromD lo = IfThenElse(ltXH, a, b); - // The upper lane is just minHL; if they are equal, we also need to use the - // actual min of the lower lanes. - return OddEven(minHL, IfThenElse(eqXH, minHL, lo)); -} - -template -HWY_INLINE VFromD Max128(D /* tag */, const VFromD a, const VFromD b) { - const VFromD aXH = detail::Slide1Down(a); - const VFromD bXH = detail::Slide1Down(b); - const VFromD maxHL = Max(a, b); - const MFromD ltXH = Lt(aXH, bXH); - const MFromD eqXH = Eq(aXH, bXH); - // If the upper lane is the decider, take lo from the same reg. - const VFromD lo = IfThenElse(ltXH, b, a); - // The upper lane is just maxHL; if they are equal, we also need to use the - // actual min of the lower lanes. 
- return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo)); -} - -template -HWY_INLINE VFromD Min128Upper(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128Upper(d, a, b), a, b); -} - -template -HWY_INLINE VFromD Max128Upper(D d, VFromD a, VFromD b) { - return IfThenElse(Lt128Upper(d, b, a), a, b); -} - -// ================================================== END MACROS -namespace detail { // for code folding -#undef HWY_RVV_AVL -#undef HWY_RVV_D -#undef HWY_RVV_FOREACH -#undef HWY_RVV_FOREACH_08_ALL -#undef HWY_RVV_FOREACH_08_ALL_VIRT -#undef HWY_RVV_FOREACH_08_DEMOTE -#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT -#undef HWY_RVV_FOREACH_08_EXT -#undef HWY_RVV_FOREACH_08_EXT_VIRT -#undef HWY_RVV_FOREACH_08_TRUNC -#undef HWY_RVV_FOREACH_08_VIRT -#undef HWY_RVV_FOREACH_16_ALL -#undef HWY_RVV_FOREACH_16_ALL_VIRT -#undef HWY_RVV_FOREACH_16_DEMOTE -#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT -#undef HWY_RVV_FOREACH_16_EXT -#undef HWY_RVV_FOREACH_16_EXT_VIRT -#undef HWY_RVV_FOREACH_16_TRUNC -#undef HWY_RVV_FOREACH_16_VIRT -#undef HWY_RVV_FOREACH_32_ALL -#undef HWY_RVV_FOREACH_32_ALL_VIRT -#undef HWY_RVV_FOREACH_32_DEMOTE -#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT -#undef HWY_RVV_FOREACH_32_EXT -#undef HWY_RVV_FOREACH_32_EXT_VIRT -#undef HWY_RVV_FOREACH_32_TRUNC -#undef HWY_RVV_FOREACH_32_VIRT -#undef HWY_RVV_FOREACH_64_ALL -#undef HWY_RVV_FOREACH_64_ALL_VIRT -#undef HWY_RVV_FOREACH_64_DEMOTE -#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT -#undef HWY_RVV_FOREACH_64_EXT -#undef HWY_RVV_FOREACH_64_EXT_VIRT -#undef HWY_RVV_FOREACH_64_TRUNC -#undef HWY_RVV_FOREACH_64_VIRT -#undef HWY_RVV_FOREACH_B -#undef HWY_RVV_FOREACH_F -#undef HWY_RVV_FOREACH_F16 -#undef HWY_RVV_FOREACH_F32 -#undef HWY_RVV_FOREACH_F3264 -#undef HWY_RVV_FOREACH_F64 -#undef HWY_RVV_FOREACH_I -#undef HWY_RVV_FOREACH_I08 -#undef HWY_RVV_FOREACH_I16 -#undef HWY_RVV_FOREACH_I163264 -#undef HWY_RVV_FOREACH_I32 -#undef HWY_RVV_FOREACH_I64 -#undef HWY_RVV_FOREACH_U -#undef HWY_RVV_FOREACH_U08 -#undef HWY_RVV_FOREACH_U16 -#undef HWY_RVV_FOREACH_U163264 -#undef HWY_RVV_FOREACH_U32 -#undef HWY_RVV_FOREACH_U64 -#undef HWY_RVV_FOREACH_UI -#undef HWY_RVV_FOREACH_UI08 -#undef HWY_RVV_FOREACH_UI16 -#undef HWY_RVV_FOREACH_UI163264 -#undef HWY_RVV_FOREACH_UI32 -#undef HWY_RVV_FOREACH_UI3264 -#undef HWY_RVV_FOREACH_UI64 -#undef HWY_RVV_INSERT_VXRM -#undef HWY_RVV_M -#undef HWY_RVV_RETM_ARGM -#undef HWY_RVV_RETV_ARGV -#undef HWY_RVV_RETV_ARGVS -#undef HWY_RVV_RETV_ARGVV -#undef HWY_RVV_T -#undef HWY_RVV_V -} // namespace detail -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/scalar-inl.h b/deps/highway/include/hwy/ops/scalar-inl.h deleted file mode 100644 index cfd98f7f..00000000 --- a/deps/highway/include/hwy/ops/scalar-inl.h +++ /dev/null @@ -1,1921 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Single-element vectors and operations. 
-// External include guard in highway.h - see comment there. - -#ifndef HWY_NO_LIBCXX -#include // sqrtf -#endif - -#include "hwy/ops/shared-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -// Single instruction, single data. -template -using Sisd = Simd; - -// (Wrapper class required for overloading comparison operators.) -template -struct Vec1 { - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = 1; // only for DFromV - - HWY_INLINE Vec1() = default; - Vec1(const Vec1&) = default; - Vec1& operator=(const Vec1&) = default; - HWY_INLINE explicit Vec1(const T t) : raw(t) {} - - HWY_INLINE Vec1& operator*=(const Vec1 other) { - return *this = (*this * other); - } - HWY_INLINE Vec1& operator/=(const Vec1 other) { - return *this = (*this / other); - } - HWY_INLINE Vec1& operator+=(const Vec1 other) { - return *this = (*this + other); - } - HWY_INLINE Vec1& operator-=(const Vec1 other) { - return *this = (*this - other); - } - HWY_INLINE Vec1& operator&=(const Vec1 other) { - return *this = (*this & other); - } - HWY_INLINE Vec1& operator|=(const Vec1 other) { - return *this = (*this | other); - } - HWY_INLINE Vec1& operator^=(const Vec1 other) { - return *this = (*this ^ other); - } - - T raw; -}; - -// 0 or FF..FF, same size as Vec1. -template -class Mask1 { - using Raw = hwy::MakeUnsigned; - - public: - static HWY_INLINE Mask1 FromBool(bool b) { - Mask1 mask; - mask.bits = b ? static_cast(~Raw{0}) : 0; - return mask; - } - - Raw bits; -}; - -template -using DFromV = Simd; - -template -using TFromV = typename V::PrivateT; - -// ------------------------------ BitCast - -template , typename TFrom> -HWY_API Vec1 BitCast(DTo /* tag */, Vec1 v) { - static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined"); - TTo to; - CopyBytes(&v.raw, &to); // not same size - ok to shrink - return Vec1(to); -} - -// ------------------------------ Zero - -template > -HWY_API Vec1 Zero(D /* tag */) { - Vec1 v; - ZeroBytes(&v.raw); - return v; -} - -template -using VFromD = decltype(Zero(D())); - -// ------------------------------ Tuple (VFromD) -#include "hwy/ops/tuple-inl.h" - -// ------------------------------ Set -template , typename T2> -HWY_API Vec1 Set(D /* tag */, const T2 t) { - return Vec1(static_cast(t)); -} - -// ------------------------------ Undefined -template > -HWY_API Vec1 Undefined(D d) { - return Zero(d); -} - -// ------------------------------ Iota -template , typename T2> -HWY_API Vec1 Iota(const D /* tag */, const T2 first) { - return Vec1(static_cast(first)); -} - -// ------------------------------ ResizeBitCast - -template -HWY_API VFromD ResizeBitCast(D /* tag */, FromV v) { - using TFrom = TFromV; - using TTo = TFromD; - constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo)); - TTo to = TTo{0}; - CopyBytes(&v.raw, &to); - return VFromD(to); -} - -namespace detail { - -// ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if -// sizeof(TFromD) is greater than sizeof(TFromV) -template -HWY_INLINE VFromD ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */, - ToSizeTag /* to_size_tag */, - DTo d_to, DFrom /*d_from*/, - VFromD v) { - return ResizeBitCast(d_to, v); -} - -} // namespace detail - -// ================================================== LOGICAL - -// ------------------------------ Not - -template -HWY_API Vec1 Not(const Vec1 v) { - using TU = MakeUnsigned; - const Sisd du; - return BitCast(Sisd(), Vec1(static_cast(~BitCast(du, v).raw))); -} - -// ------------------------------ And - 
-template -HWY_API Vec1 And(const Vec1 a, const Vec1 b) { - using TU = MakeUnsigned; - const Sisd du; - return BitCast(Sisd(), Vec1(BitCast(du, a).raw & BitCast(du, b).raw)); -} -template -HWY_API Vec1 operator&(const Vec1 a, const Vec1 b) { - return And(a, b); -} - -// ------------------------------ AndNot - -template -HWY_API Vec1 AndNot(const Vec1 a, const Vec1 b) { - using TU = MakeUnsigned; - const Sisd du; - return BitCast(Sisd(), Vec1(static_cast(~BitCast(du, a).raw & - BitCast(du, b).raw))); -} - -// ------------------------------ Or - -template -HWY_API Vec1 Or(const Vec1 a, const Vec1 b) { - using TU = MakeUnsigned; - const Sisd du; - return BitCast(Sisd(), Vec1(BitCast(du, a).raw | BitCast(du, b).raw)); -} -template -HWY_API Vec1 operator|(const Vec1 a, const Vec1 b) { - return Or(a, b); -} - -// ------------------------------ Xor - -template -HWY_API Vec1 Xor(const Vec1 a, const Vec1 b) { - using TU = MakeUnsigned; - const Sisd du; - return BitCast(Sisd(), Vec1(BitCast(du, a).raw ^ BitCast(du, b).raw)); -} -template -HWY_API Vec1 operator^(const Vec1 a, const Vec1 b) { - return Xor(a, b); -} - -// ------------------------------ Xor3 - -template -HWY_API Vec1 Xor3(Vec1 x1, Vec1 x2, Vec1 x3) { - return Xor(x1, Xor(x2, x3)); -} - -// ------------------------------ Or3 - -template -HWY_API Vec1 Or3(Vec1 o1, Vec1 o2, Vec1 o3) { - return Or(o1, Or(o2, o3)); -} - -// ------------------------------ OrAnd - -template -HWY_API Vec1 OrAnd(const Vec1 o, const Vec1 a1, const Vec1 a2) { - return Or(o, And(a1, a2)); -} - -// ------------------------------ Mask - -template , typename TFrom> -HWY_API Mask1 RebindMask(DTo /*tag*/, Mask1 m) { - static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); - return Mask1{m.bits}; -} - -// v must be 0 or FF..FF. -template -HWY_API Mask1 MaskFromVec(const Vec1 v) { - Mask1 mask; - CopySameSize(&v, &mask); - return mask; -} - -template -using MFromD = decltype(MaskFromVec(VFromD())); - -template -Vec1 VecFromMask(const Mask1 mask) { - Vec1 v; - CopySameSize(&mask, &v); - return v; -} - -template > -Vec1 VecFromMask(D /* tag */, const Mask1 mask) { - Vec1 v; - CopySameSize(&mask, &v); - return v; -} - -template > -HWY_API Mask1 FirstN(D /*tag*/, size_t n) { - return Mask1::FromBool(n != 0); -} - -// ------------------------------ IfVecThenElse -template -HWY_API Vec1 IfVecThenElse(Vec1 mask, Vec1 yes, Vec1 no) { - return IfThenElse(MaskFromVec(mask), yes, no); -} - -// ------------------------------ CopySign -template -HWY_API Vec1 CopySign(const Vec1 magn, const Vec1 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const DFromV d; - return BitwiseIfThenElse(SignBit(d), sign, magn); -} - -// ------------------------------ CopySignToAbs -template -HWY_API Vec1 CopySignToAbs(const Vec1 abs, const Vec1 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const Sisd d; - return OrAnd(abs, SignBit(d), sign); -} - -// ------------------------------ BroadcastSignBit -template -HWY_API Vec1 BroadcastSignBit(const Vec1 v) { - // This is used inside ShiftRight, so we cannot implement in terms of it. - return v.raw < 0 ? Vec1(T(-1)) : Vec1(0); -} - -// ------------------------------ PopulationCount - -#ifdef HWY_NATIVE_POPCNT -#undef HWY_NATIVE_POPCNT -#else -#define HWY_NATIVE_POPCNT -#endif - -template -HWY_API Vec1 PopulationCount(Vec1 v) { - return Vec1(static_cast(PopCount(v.raw))); -} - -// ------------------------------ IfThenElse - -// Returns mask ? yes : no. 
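// A scalar sketch of the sign-bit blend behind CopySign above: take only the
// sign bit from `sign` and everything else from `magn`; illustrative name,
// not Highway API.
#include <cstdint>
#include <cstring>

static inline float CopySign_Sketch(float magn, float sign) {
  uint32_t m, s;
  std::memcpy(&m, &magn, sizeof(m));
  std::memcpy(&s, &sign, sizeof(s));
  const uint32_t kSignMask = 0x80000000u;
  const uint32_t blended = (s & kSignMask) | (m & ~kSignMask);
  float out;
  std::memcpy(&out, &blended, sizeof(out));
  return out;
}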
-template -HWY_API Vec1 IfThenElse(const Mask1 mask, const Vec1 yes, - const Vec1 no) { - return mask.bits ? yes : no; -} - -template -HWY_API Vec1 IfThenElseZero(const Mask1 mask, const Vec1 yes) { - return mask.bits ? yes : Vec1(0); -} - -template -HWY_API Vec1 IfThenZeroElse(const Mask1 mask, const Vec1 no) { - return mask.bits ? Vec1(0) : no; -} - -template -HWY_API Vec1 IfNegativeThenElse(Vec1 v, Vec1 yes, Vec1 no) { - const DFromV d; - const RebindToSigned di; - const auto vi = BitCast(di, v); - - return vi.raw < 0 ? yes : no; -} - -template -HWY_API Vec1 ZeroIfNegative(const Vec1 v) { - return v.raw < 0 ? Vec1(0) : v; -} - -// ------------------------------ Mask logical - -template -HWY_API Mask1 Not(const Mask1 m) { - return MaskFromVec(Not(VecFromMask(Sisd(), m))); -} - -template -HWY_API Mask1 And(const Mask1 a, Mask1 b) { - const Sisd d; - return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask1 AndNot(const Mask1 a, Mask1 b) { - const Sisd d; - return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask1 Or(const Mask1 a, Mask1 b) { - const Sisd d; - return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask1 Xor(const Mask1 a, Mask1 b) { - const Sisd d; - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask1 ExclusiveNeither(const Mask1 a, Mask1 b) { - const Sisd d; - return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); -} - -template -HWY_API Mask1 SetAtOrAfterFirst(Mask1 mask) { - return mask; -} - -template -HWY_API Mask1 SetBeforeFirst(Mask1 mask) { - return Not(mask); -} - -template -HWY_API Mask1 SetOnlyFirst(Mask1 mask) { - return mask; -} - -template -HWY_API Mask1 SetAtOrBeforeFirst(Mask1 /*mask*/) { - return Mask1::FromBool(true); -} - -// ================================================== SHIFTS - -// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) - -template -HWY_API Vec1 ShiftLeft(const Vec1 v) { - static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); - return Vec1( - static_cast(static_cast>(v.raw) << kBits)); -} - -template -HWY_API Vec1 ShiftRight(const Vec1 v) { - static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); -#if __cplusplus >= 202002L - // Signed right shift is now guaranteed to be arithmetic (rounding toward - // negative infinity, i.e. shifting in the sign bit). - return Vec1(static_cast(v.raw >> kBits)); -#else - if (IsSigned()) { - // Emulate arithmetic shift using only logical (unsigned) shifts, because - // signed shifts are still implementation-defined. 
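// A standalone sketch of that emulation for int32_t: do a logical shift of the
// unsigned bits, then OR in copies of the sign bit at the top. Illustrative
// only; the header's own version follows below.
#include <cstdint>

template <int kBits>
static inline int32_t ShiftRightArithmetic_Sketch(int32_t v) {
  static_assert(0 <= kBits && kBits < 32, "Invalid shift");
  const uint32_t u = static_cast<uint32_t>(v);
  const uint32_t shifted = u >> kBits;                       // logical shift
  const uint32_t sign = v < 0 ? ~uint32_t{0} : uint32_t{0};  // broadcast sign
  const uint32_t upper = sign << (31 - kBits);               // fill top bits
  return static_cast<int32_t>(shifted | upper);
}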
- using TU = hwy::MakeUnsigned; - const Sisd du; - const TU shifted = static_cast(BitCast(du, v).raw >> kBits); - const TU sign = BitCast(du, BroadcastSignBit(v)).raw; - const size_t sign_shift = - static_cast(static_cast(sizeof(TU)) * 8 - 1 - kBits); - const TU upper = static_cast(sign << sign_shift); - return BitCast(Sisd(), Vec1(shifted | upper)); - } else { // T is unsigned - return Vec1(static_cast(v.raw >> kBits)); - } -#endif -} - -// ------------------------------ RotateRight (ShiftRight) -template -HWY_API Vec1 RotateRight(const Vec1 v) { - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift"); - if (kBits == 0) return v; - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -// ------------------------------ ShiftLeftSame (BroadcastSignBit) - -template -HWY_API Vec1 ShiftLeftSame(const Vec1 v, int bits) { - return Vec1( - static_cast(static_cast>(v.raw) << bits)); -} - -template -HWY_API Vec1 ShiftRightSame(const Vec1 v, int bits) { -#if __cplusplus >= 202002L - // Signed right shift is now guaranteed to be arithmetic (rounding toward - // negative infinity, i.e. shifting in the sign bit). - return Vec1(static_cast(v.raw >> bits)); -#else - if (IsSigned()) { - // Emulate arithmetic shift using only logical (unsigned) shifts, because - // signed shifts are still implementation-defined. - using TU = hwy::MakeUnsigned; - const Sisd du; - const TU shifted = static_cast(BitCast(du, v).raw >> bits); - const TU sign = BitCast(du, BroadcastSignBit(v)).raw; - const size_t sign_shift = - static_cast(static_cast(sizeof(TU)) * 8 - 1 - bits); - const TU upper = static_cast(sign << sign_shift); - return BitCast(Sisd(), Vec1(shifted | upper)); - } else { // T is unsigned - return Vec1(static_cast(v.raw >> bits)); - } -#endif -} - -// ------------------------------ Shl - -// Single-lane => same as ShiftLeftSame except for the argument type. -template -HWY_API Vec1 operator<<(const Vec1 v, const Vec1 bits) { - return ShiftLeftSame(v, static_cast(bits.raw)); -} - -template -HWY_API Vec1 operator>>(const Vec1 v, const Vec1 bits) { - return ShiftRightSame(v, static_cast(bits.raw)); -} - -// ================================================== ARITHMETIC - -template -HWY_API Vec1 operator+(Vec1 a, Vec1 b) { - const uint64_t a64 = static_cast(a.raw); - const uint64_t b64 = static_cast(b.raw); - return Vec1(static_cast((a64 + b64) & static_cast(~T(0)))); -} -HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { - return Vec1(a.raw + b.raw); -} -HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { - return Vec1(a.raw + b.raw); -} - -template -HWY_API Vec1 operator-(Vec1 a, Vec1 b) { - const uint64_t a64 = static_cast(a.raw); - const uint64_t b64 = static_cast(b.raw); - return Vec1(static_cast((a64 - b64) & static_cast(~T(0)))); -} -HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { - return Vec1(a.raw - b.raw); -} -HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { - return Vec1(a.raw - b.raw); -} - -// ------------------------------ SumsOf8 - -HWY_API Vec1 SumsOf8(const Vec1 v) { - return Vec1(v.raw); -} - -// ------------------------------ SaturatedAdd - -// Returns a + b clamped to the destination range. 
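// A scalar sketch of that clamping for int8_t: compute the sum in a wider
// type, then clamp it to the destination range; illustrative name only.
#include <cstdint>

static inline int8_t SaturatedAddI8_Sketch(int8_t a, int8_t b) {
  const int32_t sum = static_cast<int32_t>(a) + b;
  const int32_t clamped = sum < -128 ? -128 : (sum > 127 ? 127 : sum);
  return static_cast<int8_t>(clamped);
}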
- -// Unsigned -HWY_API Vec1 SaturatedAdd(const Vec1 a, - const Vec1 b) { - return Vec1( - static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); -} -HWY_API Vec1 SaturatedAdd(const Vec1 a, - const Vec1 b) { - return Vec1(static_cast( - HWY_MIN(HWY_MAX(0, static_cast(a.raw) + b.raw), 65535))); -} - -// Signed -HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { - return Vec1( - static_cast(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); -} -HWY_API Vec1 SaturatedAdd(const Vec1 a, - const Vec1 b) { - return Vec1(static_cast( - HWY_MIN(HWY_MAX(-32768, static_cast(a.raw) + b.raw), 32767))); -} - -// ------------------------------ Saturating subtraction - -// Returns a - b clamped to the destination range. - -// Unsigned -HWY_API Vec1 SaturatedSub(const Vec1 a, - const Vec1 b) { - return Vec1( - static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); -} -HWY_API Vec1 SaturatedSub(const Vec1 a, - const Vec1 b) { - return Vec1(static_cast( - HWY_MIN(HWY_MAX(0, static_cast(a.raw) - b.raw), 65535))); -} - -// Signed -HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { - return Vec1( - static_cast(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); -} -HWY_API Vec1 SaturatedSub(const Vec1 a, - const Vec1 b) { - return Vec1(static_cast( - HWY_MIN(HWY_MAX(-32768, static_cast(a.raw) - b.raw), 32767))); -} - -// ------------------------------ Average - -// Returns (a + b + 1) / 2 - -HWY_API Vec1 AverageRound(const Vec1 a, - const Vec1 b) { - return Vec1(static_cast((a.raw + b.raw + 1) / 2)); -} -HWY_API Vec1 AverageRound(const Vec1 a, - const Vec1 b) { - return Vec1(static_cast((a.raw + b.raw + 1) / 2)); -} - -// ------------------------------ Absolute value - -template -HWY_API Vec1 Abs(const Vec1 a) { - const T i = a.raw; - if (i >= 0 || i == hwy::LimitsMin()) return a; - return Vec1(static_cast(-i & T{-1})); -} -HWY_API Vec1 Abs(Vec1 a) { - int32_t i; - CopyBytes(&a.raw, &i); - i &= 0x7FFFFFFF; - CopyBytes(&i, &a.raw); - return a; -} -HWY_API Vec1 Abs(Vec1 a) { - int64_t i; - CopyBytes(&a.raw, &i); - i &= 0x7FFFFFFFFFFFFFFFL; - CopyBytes(&i, &a.raw); - return a; -} - -// ------------------------------ Min/Max - -// may be unavailable, so implement our own. 
-namespace detail { - -static inline float Abs(float f) { - uint32_t i; - CopyBytes<4>(&f, &i); - i &= 0x7FFFFFFFu; - CopyBytes<4>(&i, &f); - return f; -} -static inline double Abs(double f) { - uint64_t i; - CopyBytes<8>(&f, &i); - i &= 0x7FFFFFFFFFFFFFFFull; - CopyBytes<8>(&i, &f); - return f; -} - -static inline bool SignBit(float f) { - uint32_t i; - CopyBytes<4>(&f, &i); - return (i >> 31) != 0; -} -static inline bool SignBit(double f) { - uint64_t i; - CopyBytes<8>(&f, &i); - return (i >> 63) != 0; -} - -} // namespace detail - -template -HWY_API Vec1 Min(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MIN(a.raw, b.raw)); -} - -template -HWY_API Vec1 Min(const Vec1 a, const Vec1 b) { - if (isnan(a.raw)) return b; - if (isnan(b.raw)) return a; - return Vec1(HWY_MIN(a.raw, b.raw)); -} - -template -HWY_API Vec1 Max(const Vec1 a, const Vec1 b) { - return Vec1(HWY_MAX(a.raw, b.raw)); -} - -template -HWY_API Vec1 Max(const Vec1 a, const Vec1 b) { - if (isnan(a.raw)) return b; - if (isnan(b.raw)) return a; - return Vec1(HWY_MAX(a.raw, b.raw)); -} - -// ------------------------------ Floating-point negate - -template -HWY_API Vec1 Neg(const Vec1 v) { - return Xor(v, SignBit(Sisd())); -} - -template -HWY_API Vec1 Neg(const Vec1 v) { - return Zero(Sisd()) - v; -} - -// ------------------------------ mul/div - -// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. -#ifdef HWY_NATIVE_MUL_8 -#undef HWY_NATIVE_MUL_8 -#else -#define HWY_NATIVE_MUL_8 -#endif -#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - -template -HWY_API Vec1 operator*(const Vec1 a, const Vec1 b) { - return Vec1(static_cast(double{a.raw} * b.raw)); -} - -template -HWY_API Vec1 operator*(const Vec1 a, const Vec1 b) { - return Vec1(static_cast(static_cast(a.raw) * - static_cast(b.raw))); -} - -template -HWY_API Vec1 operator/(const Vec1 a, const Vec1 b) { - return Vec1(a.raw / b.raw); -} - -// Returns the upper 16 bits of a * b in each lane. -HWY_API Vec1 MulHigh(const Vec1 a, const Vec1 b) { - return Vec1(static_cast((a.raw * b.raw) >> 16)); -} -HWY_API Vec1 MulHigh(const Vec1 a, const Vec1 b) { - // Cast to uint32_t first to prevent overflow. Otherwise the result of - // uint16_t * uint16_t is in "int" which may overflow. In practice the result - // is the same but this way it is also defined. - return Vec1(static_cast( - (static_cast(a.raw) * static_cast(b.raw)) >> 16)); -} - -HWY_API Vec1 MulFixedPoint15(Vec1 a, Vec1 b) { - return Vec1(static_cast((a.raw * b.raw + 16384) >> 15)); -} - -// Multiplies even lanes (0, 2 ..) and returns the double-wide result. -template -HWY_API Vec1> MulEven(const Vec1 a, const Vec1 b) { - using TW = MakeWide; - const TW a_wide = a.raw; - return Vec1(static_cast(a_wide * b.raw)); -} - -// Approximate reciprocal -HWY_API Vec1 ApproximateReciprocal(const Vec1 v) { - // Zero inputs are allowed, but callers are responsible for replacing the - // return value with something else (typically using IfThenElse). This check - // avoids a ubsan error. The return value is arbitrary. - if (v.raw == 0.0f) return Vec1(0.0f); - return Vec1(1.0f / v.raw); -} - -// generic_ops takes care of integer T. 
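// A scalar sketch of the Q15 multiply above (MulFixedPoint15): widen, add half
// of the final LSB (1 << 14), then shift right by 15, matching
// (a * b + 16384) >> 15. The right shift of a negative value is assumed to be
// arithmetic, as in the header. Illustrative name only.
#include <cstdint>

static inline int16_t MulFixedPoint15_Sketch(int16_t a, int16_t b) {
  const int32_t prod = static_cast<int32_t>(a) * b;   // fits in 32 bits
  return static_cast<int16_t>((prod + 16384) >> 15);  // rounded Q15 result
}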
-template -HWY_API Vec1 AbsDiff(const Vec1 a, const Vec1 b) { - return Abs(a - b); -} - -// ------------------------------ Floating-point multiply-add variants - -template -HWY_API Vec1 MulAdd(const Vec1 mul, const Vec1 x, const Vec1 add) { - return mul * x + add; -} - -template -HWY_API Vec1 NegMulAdd(const Vec1 mul, const Vec1 x, - const Vec1 add) { - return add - mul * x; -} - -template -HWY_API Vec1 MulSub(const Vec1 mul, const Vec1 x, const Vec1 sub) { - return mul * x - sub; -} - -template -HWY_API Vec1 NegMulSub(const Vec1 mul, const Vec1 x, - const Vec1 sub) { - return Neg(mul) * x - sub; -} - -// ------------------------------ Floating-point square root - -// Approximate reciprocal square root -HWY_API Vec1 ApproximateReciprocalSqrt(const Vec1 v) { - float f = v.raw; - const float half = f * 0.5f; - uint32_t bits; - CopySameSize(&f, &bits); - // Initial guess based on log2(f) - bits = 0x5F3759DF - (bits >> 1); - CopySameSize(&bits, &f); - // One Newton-Raphson iteration - return Vec1(f * (1.5f - (half * f * f))); -} - -// Square root -HWY_API Vec1 Sqrt(Vec1 v) { -#if defined(HWY_NO_LIBCXX) -#if HWY_COMPILER_GCC_ACTUAL - return Vec1(__builtin_sqrt(v.raw)); -#else - uint32_t bits; - CopyBytes(&v, &bits); - // Coarse approximation, letting the exponent LSB leak into the mantissa - bits = (1 << 29) + (bits >> 1) - (1 << 22); - CopyBytes(&bits, &v); - return v; -#endif // !HWY_COMPILER_GCC_ACTUAL -#else - return Vec1(sqrtf(v.raw)); -#endif // !HWY_NO_LIBCXX -} -HWY_API Vec1 Sqrt(Vec1 v) { -#if defined(HWY_NO_LIBCXX) -#if HWY_COMPILER_GCC_ACTUAL - return Vec1(__builtin_sqrt(v.raw)); -#else - uint64_t bits; - CopyBytes(&v, &bits); - // Coarse approximation, letting the exponent LSB leak into the mantissa - bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51); - CopyBytes(&bits, &v); - return v; -#endif // !HWY_COMPILER_GCC_ACTUAL -#else - return Vec1(sqrt(v.raw)); -#endif // HWY_NO_LIBCXX -} - -// ------------------------------ Floating-point rounding - -template -HWY_API Vec1 Round(const Vec1 v) { - using TI = MakeSigned; - if (!(Abs(v).raw < MantissaEnd())) { // Huge or NaN - return v; - } - const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); - const TI rounded = static_cast(v.raw + bias); - if (rounded == 0) return CopySignToAbs(Vec1(0), v); - // Round to even - if ((rounded & 1) && detail::Abs(static_cast(rounded) - v.raw) == T(0.5)) { - return Vec1(static_cast(rounded - (v.raw < T(0) ? -1 : 1))); - } - return Vec1(static_cast(rounded)); -} - -// Round-to-nearest even. -HWY_API Vec1 NearestInt(const Vec1 v) { - using T = float; - using TI = int32_t; - - const T abs = Abs(v).raw; - const bool is_sign = detail::SignBit(v.raw); - - if (!(abs < MantissaEnd())) { // Huge or NaN - // Check if too large to cast or NaN - if (!(abs <= static_cast(LimitsMax()))) { - return Vec1(is_sign ? LimitsMin() : LimitsMax()); - } - return Vec1(static_cast(v.raw)); - } - const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5); - const TI rounded = static_cast(v.raw + bias); - if (rounded == 0) return Vec1(0); - // Round to even - if ((rounded & 1) && detail::Abs(static_cast(rounded) - v.raw) == T(0.5)) { - return Vec1(rounded - (is_sign ? 
-1 : 1)); - } - return Vec1(rounded); -} - -template -HWY_API Vec1 Trunc(const Vec1 v) { - using TI = MakeSigned; - if (!(Abs(v).raw <= MantissaEnd())) { // Huge or NaN - return v; - } - const TI truncated = static_cast(v.raw); - if (truncated == 0) return CopySignToAbs(Vec1(0), v); - return Vec1(static_cast(truncated)); -} - -template -V Ceiling(const V v) { - const Bits kExponentMask = (1ull << kExponentBits) - 1; - const Bits kMantissaMask = (1ull << kMantissaBits) - 1; - const Bits kBias = kExponentMask / 2; - - Float f = v.raw; - const bool positive = f > Float(0.0); - - Bits bits; - CopySameSize(&v, &bits); - - const int exponent = - static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); - // Already an integer. - if (exponent >= kMantissaBits) return v; - // |v| <= 1 => 0 or 1. - if (exponent < 0) return positive ? V(1) : V(-0.0); - - const Bits mantissa_mask = kMantissaMask >> exponent; - // Already an integer - if ((bits & mantissa_mask) == 0) return v; - - // Clear fractional bits and round up - if (positive) bits += (kMantissaMask + 1) >> exponent; - bits &= ~mantissa_mask; - - CopySameSize(&bits, &f); - return V(f); -} - -template -V Floor(const V v) { - const Bits kExponentMask = (1ull << kExponentBits) - 1; - const Bits kMantissaMask = (1ull << kMantissaBits) - 1; - const Bits kBias = kExponentMask / 2; - - Float f = v.raw; - const bool negative = f < Float(0.0); - - Bits bits; - CopySameSize(&v, &bits); - - const int exponent = - static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); - // Already an integer. - if (exponent >= kMantissaBits) return v; - // |v| <= 1 => -1 or 0. - if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0)); - - const Bits mantissa_mask = kMantissaMask >> exponent; - // Already an integer - if ((bits & mantissa_mask) == 0) return v; - - // Clear fractional bits and round down - if (negative) bits += (kMantissaMask + 1) >> exponent; - bits &= ~mantissa_mask; - - CopySameSize(&bits, &f); - return V(f); -} - -// Toward +infinity, aka ceiling -HWY_API Vec1 Ceil(const Vec1 v) { - return Ceiling(v); -} -HWY_API Vec1 Ceil(const Vec1 v) { - return Ceiling(v); -} - -// Toward -infinity, aka floor -HWY_API Vec1 Floor(const Vec1 v) { - return Floor(v); -} -HWY_API Vec1 Floor(const Vec1 v) { - return Floor(v); -} - -// ================================================== COMPARE - -template -HWY_API Mask1 operator==(const Vec1 a, const Vec1 b) { - return Mask1::FromBool(a.raw == b.raw); -} - -template -HWY_API Mask1 operator!=(const Vec1 a, const Vec1 b) { - return Mask1::FromBool(a.raw != b.raw); -} - -template -HWY_API Mask1 TestBit(const Vec1 v, const Vec1 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return (v & bit) == bit; -} - -template -HWY_API Mask1 operator<(const Vec1 a, const Vec1 b) { - return Mask1::FromBool(a.raw < b.raw); -} -template -HWY_API Mask1 operator>(const Vec1 a, const Vec1 b) { - return Mask1::FromBool(a.raw > b.raw); -} - -template -HWY_API Mask1 operator<=(const Vec1 a, const Vec1 b) { - return Mask1::FromBool(a.raw <= b.raw); -} -template -HWY_API Mask1 operator>=(const Vec1 a, const Vec1 b) { - return Mask1::FromBool(a.raw >= b.raw); -} - -// ------------------------------ Floating-point classification (==) - -template -HWY_API Mask1 IsNaN(const Vec1 v) { - // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. 
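// A standalone sketch of that DIY test for float: drop the sign bit, then the
// value is NaN iff the remaining bits exceed the all-ones exponent field
// (0x7F800000). Illustrative name only.
#include <cstdint>
#include <cstring>

static inline bool IsNaN_Sketch(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  bits &= 0x7FFFFFFFu;        // clear the sign bit
  return bits > 0x7F800000u;  // exponent all ones and mantissa != 0
}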
- MakeUnsigned bits; - CopySameSize(&v, &bits); - bits += bits; - bits >>= 1; // clear sign bit - // NaN if all exponent bits are set and the mantissa is not zero. - return Mask1::FromBool(bits > ExponentMask()); -} - -HWY_API Mask1 IsInf(const Vec1 v) { - const Sisd d; - const RebindToUnsigned du; - const Vec1 vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u)); -} -HWY_API Mask1 IsInf(const Vec1 v) { - const Sisd d; - const RebindToUnsigned du; - const Vec1 vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull)); -} - -HWY_API Mask1 IsFinite(const Vec1 v) { - const Vec1 vu = BitCast(Sisd(), v); - // Shift left to clear the sign bit, check whether exponent != max value. - return Mask1::FromBool((vu.raw << 1) < 0xFF000000u); -} -HWY_API Mask1 IsFinite(const Vec1 v) { - const Vec1 vu = BitCast(Sisd(), v); - // Shift left to clear the sign bit, check whether exponent != max value. - return Mask1::FromBool((vu.raw << 1) < 0xFFE0000000000000ull); -} - -// ================================================== MEMORY - -// ------------------------------ Load - -template > -HWY_API Vec1 Load(D /* tag */, const T* HWY_RESTRICT aligned) { - T t; - CopySameSize(aligned, &t); - return Vec1(t); -} - -template > -HWY_API Vec1 MaskedLoad(Mask1 m, D d, const T* HWY_RESTRICT aligned) { - return IfThenElseZero(m, Load(d, aligned)); -} - -template > -HWY_API Vec1 MaskedLoadOr(Vec1 v, Mask1 m, D d, - const T* HWY_RESTRICT aligned) { - return IfThenElse(m, Load(d, aligned), v); -} - -template > -HWY_API Vec1 LoadU(D d, const T* HWY_RESTRICT p) { - return Load(d, p); -} - -// In some use cases, "load single lane" is sufficient; otherwise avoid this. -template > -HWY_API Vec1 LoadDup128(D d, const T* HWY_RESTRICT aligned) { - return Load(d, aligned); -} - -#ifdef HWY_NATIVE_LOAD_N -#undef HWY_NATIVE_LOAD_N -#else -#define HWY_NATIVE_LOAD_N -#endif - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d); -} - -// ------------------------------ Store - -template > -HWY_API void Store(const Vec1 v, D /* tag */, T* HWY_RESTRICT aligned) { - CopySameSize(&v.raw, aligned); -} - -template > -HWY_API void StoreU(const Vec1 v, D d, T* HWY_RESTRICT p) { - return Store(v, d, p); -} - -template > -HWY_API void BlendedStore(const Vec1 v, Mask1 m, D d, T* HWY_RESTRICT p) { - if (!m.bits) return; - StoreU(v, d, p); -} - -#ifdef HWY_NATIVE_STORE_N -#undef HWY_NATIVE_STORE_N -#else -#define HWY_NATIVE_STORE_N -#endif - -template > -HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, - size_t max_lanes_to_store) { - if (max_lanes_to_store > 0) { - Store(v, d, p); - } -} - -// ------------------------------ LoadInterleaved2/3/4 - -// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2. 
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED -#else -#define HWY_NATIVE_LOAD_STORE_INTERLEAVED -#endif - -template > -HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1& v0, - Vec1& v1) { - v0 = LoadU(d, unaligned + 0); - v1 = LoadU(d, unaligned + 1); -} - -template > -HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1& v0, - Vec1& v1, Vec1& v2) { - v0 = LoadU(d, unaligned + 0); - v1 = LoadU(d, unaligned + 1); - v2 = LoadU(d, unaligned + 2); -} - -template > -HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1& v0, - Vec1& v1, Vec1& v2, Vec1& v3) { - v0 = LoadU(d, unaligned + 0); - v1 = LoadU(d, unaligned + 1); - v2 = LoadU(d, unaligned + 2); - v3 = LoadU(d, unaligned + 3); -} - -// ------------------------------ StoreInterleaved2/3/4 - -template > -HWY_API void StoreInterleaved2(const Vec1 v0, const Vec1 v1, D d, - T* HWY_RESTRICT unaligned) { - StoreU(v0, d, unaligned + 0); - StoreU(v1, d, unaligned + 1); -} - -template > -HWY_API void StoreInterleaved3(const Vec1 v0, const Vec1 v1, - const Vec1 v2, D d, - T* HWY_RESTRICT unaligned) { - StoreU(v0, d, unaligned + 0); - StoreU(v1, d, unaligned + 1); - StoreU(v2, d, unaligned + 2); -} - -template > -HWY_API void StoreInterleaved4(const Vec1 v0, const Vec1 v1, - const Vec1 v2, const Vec1 v3, D d, - T* HWY_RESTRICT unaligned) { - StoreU(v0, d, unaligned + 0); - StoreU(v1, d, unaligned + 1); - StoreU(v2, d, unaligned + 2); - StoreU(v3, d, unaligned + 3); -} - -// ------------------------------ Stream - -template > -HWY_API void Stream(const Vec1 v, D d, T* HWY_RESTRICT aligned) { - return Store(v, d, aligned); -} - -// ------------------------------ Scatter - -#ifdef HWY_NATIVE_SCATTER -#undef HWY_NATIVE_SCATTER -#else -#define HWY_NATIVE_SCATTER -#endif - -template , typename TI> -HWY_API void ScatterOffset(Vec1 v, D d, T* base, Vec1 offset) { - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - uint8_t* const base8 = reinterpret_cast(base) + offset.raw; - Store(v, d, reinterpret_cast(base8)); -} - -template , typename TI> -HWY_API void ScatterIndex(Vec1 v, D d, T* HWY_RESTRICT base, - Vec1 index) { - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - Store(v, d, base + index.raw); -} - -template , typename TI> -HWY_API void MaskedScatterIndex(Vec1 v, Mask1 m, D d, - T* HWY_RESTRICT base, Vec1 index) { - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - if (m.bits) Store(v, d, base + index.raw); -} - -// ------------------------------ Gather - -#ifdef HWY_NATIVE_GATHER -#undef HWY_NATIVE_GATHER -#else -#define HWY_NATIVE_GATHER -#endif - -template , typename TI> -HWY_API Vec1 GatherOffset(D d, const T* base, Vec1 offset) { - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - const intptr_t addr = - reinterpret_cast(base) + static_cast(offset.raw); - return Load(d, reinterpret_cast(addr)); -} - -template , typename TI> -HWY_API Vec1 GatherIndex(D d, const T* HWY_RESTRICT base, Vec1 index) { - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - return Load(d, base + index.raw); -} - -template , typename TI> -HWY_API Vec1 MaskedGatherIndex(Mask1 m, D d, const T* HWY_RESTRICT base, - Vec1 index) { - static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); - return MaskedLoad(m, d, base + index.raw); -} - -// ================================================== CONVERT - -// ConvertTo and DemoteTo with floating-point input and 
integer output truncate -// (rounding toward zero). - -template , typename TFrom> -HWY_API Vec1 PromoteTo(DTo /* tag */, Vec1 from) { - static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting"); - // For bits Y > X, floatX->floatY and intX->intY are always representable. - return Vec1(static_cast(from.raw)); -} - -// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here, -// so we overload for TFrom=double and TTo={float,int32_t}. -template -HWY_API Vec1 DemoteTo(D /* tag */, Vec1 from) { - // Prevent ubsan errors when converting float to narrower integer/float - if (IsInf(from).bits || - Abs(from).raw > static_cast(HighestValue())) { - return Vec1(detail::SignBit(from.raw) ? LowestValue() - : HighestValue()); - } - return Vec1(static_cast(from.raw)); -} -template -HWY_API Vec1 DemoteTo(D /* tag */, Vec1 from) { - // Prevent ubsan errors when converting int32_t to narrower integer/int32_t - if (IsInf(from).bits || - Abs(from).raw > static_cast(HighestValue())) { - return Vec1(detail::SignBit(from.raw) ? LowestValue() - : HighestValue()); - } - return Vec1(static_cast(from.raw)); -} - -template , typename TFrom, - HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD)> -HWY_API Vec1 DemoteTo(DTo /* tag */, Vec1 from) { - static_assert(!IsFloat(), "TFrom=double are handled above"); - static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); - - // Int to int: choose closest value in TTo to `from` (avoids UB) - from.raw = HWY_MIN(HWY_MAX(LimitsMin(), from.raw), LimitsMax()); - return Vec1(static_cast(from.raw)); -} - -template , typename TFrom, - HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED_D(DTo)> -HWY_API Vec1 DemoteTo(DTo /* tag */, Vec1 from) { - static_assert(!IsFloat(), "TFrom=double are handled above"); - static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); - - // Int to int: choose closest value in TTo to `from` (avoids UB) - from.raw = HWY_MIN(from.raw, LimitsMax()); - return Vec1(static_cast(from.raw)); -} - -// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions; -// use this scalar version to verify the vector implementation. -#ifdef HWY_NATIVE_F16C -#undef HWY_NATIVE_F16C -#else -#define HWY_NATIVE_F16C -#endif - -template -HWY_API Vec1 PromoteTo(D /* tag */, const Vec1 v) { - return Vec1(F32FromF16(v.raw)); -} - -template -HWY_API Vec1 PromoteTo(D d, const Vec1 v) { - return Set(d, F32FromBF16(v.raw)); -} - -template -HWY_API Vec1 DemoteTo(D /* tag */, const Vec1 v) { - return Vec1(F16FromF32(v.raw)); -} - -template -HWY_API Vec1 DemoteTo(D d, const Vec1 v) { - return Set(d, BF16FromF32(v.raw)); -} - -template , typename TFrom, - HWY_IF_FLOAT(TFrom)> -HWY_API Vec1 ConvertTo(DTo /* tag */, Vec1 from) { - static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); - // float## -> int##: return closest representable value. We cannot exactly - // represent LimitsMax in TFrom, so use double. - const double f = static_cast(from.raw); - if (IsInf(from).bits || - Abs(Vec1(f)).raw > static_cast(LimitsMax())) { - return Vec1(detail::SignBit(from.raw) ? 
LimitsMin() - : LimitsMax()); - } - return Vec1(static_cast(from.raw)); -} - -template , typename TFrom, - HWY_IF_NOT_FLOAT(TFrom)> -HWY_API Vec1 ConvertTo(DTo /* tag */, Vec1 from) { - static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); - // int## -> float##: no check needed - return Vec1(static_cast(from.raw)); -} - -HWY_API Vec1 U8FromU32(const Vec1 v) { - return DemoteTo(Sisd(), v); -} - -// ------------------------------ TruncateTo - -template -HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { - return Vec1{static_cast(v.raw & 0xFF)}; -} - -template -HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { - return Vec1{static_cast(v.raw & 0xFFFF)}; -} - -template -HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { - return Vec1{static_cast(v.raw & 0xFFFFFFFFu)}; -} - -template -HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { - return Vec1{static_cast(v.raw & 0xFF)}; -} - -template -HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { - return Vec1{static_cast(v.raw & 0xFFFF)}; -} - -template -HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { - return Vec1{static_cast(v.raw & 0xFF)}; -} - -// ================================================== COMBINE -// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported. - -template -HWY_API Vec1 LowerHalf(Vec1 v) { - return v; -} - -template > -HWY_API Vec1 LowerHalf(D /* tag */, Vec1 v) { - return v; -} - -// ================================================== SWIZZLE - -template -HWY_API T GetLane(const Vec1 v) { - return v.raw; -} - -template -HWY_API T ExtractLane(const Vec1 v, size_t i) { - HWY_DASSERT(i == 0); - (void)i; - return v.raw; -} - -template -HWY_API Vec1 InsertLane(Vec1 v, size_t i, T t) { - HWY_DASSERT(i == 0); - (void)i; - v.raw = t; - return v; -} - -template -HWY_API Vec1 DupEven(Vec1 v) { - return v; -} -// DupOdd is unsupported. - -template -HWY_API Vec1 OddEven(Vec1 /* odd */, Vec1 even) { - return even; -} - -template -HWY_API Vec1 OddEvenBlocks(Vec1 /* odd */, Vec1 even) { - return even; -} - -// ------------------------------ SwapAdjacentBlocks - -template -HWY_API Vec1 SwapAdjacentBlocks(Vec1 v) { - return v; -} - -// ------------------------------ TableLookupLanes - -// Returned by SetTableIndices for use by TableLookupLanes. -template -struct Indices1 { - MakeSigned raw; -}; - -template , typename TI> -HWY_API Indices1 IndicesFromVec(D, Vec1 vec) { - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size"); - HWY_DASSERT(vec.raw <= 1); - return Indices1{static_cast>(vec.raw)}; -} - -template , typename TI> -HWY_API Indices1 SetTableIndices(D d, const TI* idx) { - return IndicesFromVec(d, LoadU(Sisd(), idx)); -} - -template -HWY_API Vec1 TableLookupLanes(const Vec1 v, const Indices1 /* idx */) { - return v; -} - -template -HWY_API Vec1 TwoTablesLookupLanes(const Vec1 a, const Vec1 b, - const Indices1 idx) { - return (idx.raw == 0) ? a : b; -} - -// ------------------------------ ReverseBlocks - -// Single block: no change -template > -HWY_API Vec1 ReverseBlocks(D /* tag */, const Vec1 v) { - return v; -} - -// ------------------------------ Reverse - -template > -HWY_API Vec1 Reverse(D /* tag */, const Vec1 v) { - return v; -} - -// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
-#ifdef HWY_NATIVE_REVERSE2_8 -#undef HWY_NATIVE_REVERSE2_8 -#else -#define HWY_NATIVE_REVERSE2_8 -#endif - -// Must not be called: -template > -HWY_API Vec1 Reverse2(D /* tag */, const Vec1 v) { - return v; -} - -template > -HWY_API Vec1 Reverse4(D /* tag */, const Vec1 v) { - return v; -} - -template > -HWY_API Vec1 Reverse8(D /* tag */, const Vec1 v) { - return v; -} - -// ------------------------------ ReverseLaneBytes - -#ifdef HWY_NATIVE_REVERSE_LANE_BYTES -#undef HWY_NATIVE_REVERSE_LANE_BYTES -#else -#define HWY_NATIVE_REVERSE_LANE_BYTES -#endif - -HWY_API Vec1 ReverseLaneBytes(Vec1 v) { - const uint32_t val{v.raw}; - return Vec1( - static_cast(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu))); -} - -HWY_API Vec1 ReverseLaneBytes(Vec1 v) { - const uint32_t val = v.raw; - return Vec1(static_cast( - ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) | - ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu))); -} - -HWY_API Vec1 ReverseLaneBytes(Vec1 v) { - const uint64_t val = v.raw; - return Vec1(static_cast( - ((val << 56) & 0xFF00000000000000u) | - ((val << 40) & 0x00FF000000000000u) | - ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) | - ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) | - ((val >> 40) & 0x000000000000FF00u) | - ((val >> 56) & 0x00000000000000FFu))); -} - -template -HWY_API V ReverseLaneBytes(V v) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, ReverseLaneBytes(BitCast(du, v))); -} - -// ------------------------------ ReverseBits -#ifdef HWY_NATIVE_REVERSE_BITS_UI8 -#undef HWY_NATIVE_REVERSE_BITS_UI8 -#else -#define HWY_NATIVE_REVERSE_BITS_UI8 -#endif - -#ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#else -#define HWY_NATIVE_REVERSE_BITS_UI16_32_64 -#endif - -namespace detail { - -template -HWY_INLINE T ReverseBitsOfEachByte(T val) { - using TU = MakeUnsigned; - constexpr TU kMaxUnsignedVal{LimitsMax()}; - constexpr TU kShrMask1 = - static_cast(0x5555555555555555u & kMaxUnsignedVal); - constexpr TU kShrMask2 = - static_cast(0x3333333333333333u & kMaxUnsignedVal); - constexpr TU kShrMask3 = - static_cast(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal); - - constexpr TU kShlMask1 = static_cast(~kShrMask1); - constexpr TU kShlMask2 = static_cast(~kShrMask2); - constexpr TU kShlMask3 = static_cast(~kShrMask3); - - TU result = static_cast(val); - result = static_cast(((result << 1) & kShlMask1) | - ((result >> 1) & kShrMask1)); - result = static_cast(((result << 2) & kShlMask2) | - ((result >> 2) & kShrMask2)); - result = static_cast(((result << 4) & kShlMask3) | - ((result >> 4) & kShrMask3)); - return static_cast(result); -} - -} // namespace detail - -template -HWY_API V ReverseBits(V v) { - return V(detail::ReverseBitsOfEachByte(v.raw)); -} - -template -HWY_API V ReverseBits(V v) { - return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw))); -} - -template -HWY_API V ReverseBits(V v) { - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, ReverseBits(BitCast(du, v))); -} - -// ------------------------------ SlideUpLanes - -template -HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -// ------------------------------ SlideDownLanes - -template -HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -// ================================================== BLOCKWISE -// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported. 
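The scalar ReverseBits above is built on detail::ReverseBitsOfEachByte, a three-pass mask-and-shift ladder that swaps adjacent bits, then bit pairs, then nibbles. A minimal standalone sketch of the same technique, specialized to uint32_t for readability (the function name is ours, not part of Highway):

#include <cstdint>

// Reverses the bit order within each byte of a 32-bit word using the same
// mask-and-shift ladder as ReverseBitsOfEachByte: adjacent bits, then bit
// pairs, then nibbles.
static inline uint32_t ReverseBitsOfEachByteSketch(uint32_t v) {
  v = ((v << 1) & 0xAAAAAAAAu) | ((v >> 1) & 0x55555555u);  // swap bits
  v = ((v << 2) & 0xCCCCCCCCu) | ((v >> 2) & 0x33333333u);  // swap bit pairs
  v = ((v << 4) & 0xF0F0F0F0u) | ((v >> 4) & 0x0F0F0F0Fu);  // swap nibbles
  return v;
}
// Example: ReverseBitsOfEachByteSketch(0x00000001u) == 0x00000080u.

For 16/32/64-bit lanes, the ReverseBits overloads above additionally apply ReverseLaneBytes to the result, so the byte order within the lane is reversed as well.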
- -// ------------------------------ Broadcast/splat any lane - -template -HWY_API Vec1 Broadcast(const Vec1 v) { - static_assert(kLane == 0, "Scalar only has one lane"); - return v; -} - -// ------------------------------ TableLookupBytes, TableLookupBytesOr0 - -template -HWY_API Vec1 TableLookupBytes(const Vec1 in, const Vec1 indices) { - uint8_t in_bytes[sizeof(T)]; - uint8_t idx_bytes[sizeof(T)]; - uint8_t out_bytes[sizeof(T)]; - CopyBytes(&in, &in_bytes); // copy to bytes - CopyBytes(&indices, &idx_bytes); - for (size_t i = 0; i < sizeof(T); ++i) { - out_bytes[i] = in_bytes[idx_bytes[i]]; - } - TI out; - CopyBytes(&out_bytes, &out); - return Vec1{out}; -} - -template -HWY_API Vec1 TableLookupBytesOr0(const Vec1 in, const Vec1 indices) { - uint8_t in_bytes[sizeof(T)]; - uint8_t idx_bytes[sizeof(T)]; - uint8_t out_bytes[sizeof(T)]; - CopyBytes(&in, &in_bytes); // copy to bytes - CopyBytes(&indices, &idx_bytes); - for (size_t i = 0; i < sizeof(T); ++i) { - out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]]; - } - TI out; - CopyBytes(&out_bytes, &out); - return Vec1{out}; -} - -// ------------------------------ ZipLower - -HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { - return Vec1(static_cast((uint32_t{b.raw} << 8) + a.raw)); -} -HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { - return Vec1((uint32_t{b.raw} << 16) + a.raw); -} -HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { - return Vec1((uint64_t{b.raw} << 32) + a.raw); -} -HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { - return Vec1(static_cast((int32_t{b.raw} << 8) + a.raw)); -} -HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { - return Vec1((int32_t{b.raw} << 16) + a.raw); -} -HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { - return Vec1((int64_t{b.raw} << 32) + a.raw); -} - -template , typename TN = MakeNarrow> -HWY_API Vec1 ZipLower(DW /* tag */, Vec1 a, Vec1 b) { - return Vec1(static_cast((TW{b.raw} << (sizeof(TN) * 8)) + a.raw)); -} - -// ================================================== MASK - -template > -HWY_API bool AllFalse(D /* tag */, const Mask1 mask) { - return mask.bits == 0; -} - -template > -HWY_API bool AllTrue(D /* tag */, const Mask1 mask) { - return mask.bits != 0; -} - -// `p` points to at least 8 readable bytes, not all of which need be valid. -template > -HWY_API Mask1 LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) { - return Mask1::FromBool((bits[0] & 1) != 0); -} - -// `p` points to at least 8 writable bytes. -template > -HWY_API size_t StoreMaskBits(D d, const Mask1 mask, uint8_t* bits) { - *bits = AllTrue(d, mask); - return 1; -} - -template > -HWY_API size_t CountTrue(D /* tag */, const Mask1 mask) { - return mask.bits == 0 ? 0 : 1; -} - -template > -HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1 mask) { - return mask.bits == 0 ? -1 : 0; -} - -template > -HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1 /* m */) { - return 0; // There is only one lane and we know it is true. -} - -template > -HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1 mask) { - return mask.bits == 0 ? -1 : 0; -} - -template > -HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1 /* m */) { - return 0; // There is only one lane and we know it is true. -} - -// ------------------------------ Compress, CompressBits - -template -struct CompressIsPartition { - enum { value = 1 }; -}; - -template -HWY_API Vec1 Compress(Vec1 v, const Mask1 /* mask */) { - // A single lane is already partitioned by definition. 
- return v; -} - -template -HWY_API Vec1 CompressNot(Vec1 v, const Mask1 /* mask */) { - // A single lane is already partitioned by definition. - return v; -} - -// ------------------------------ CompressStore -template > -HWY_API size_t CompressStore(Vec1 v, const Mask1 mask, D d, - T* HWY_RESTRICT unaligned) { - StoreU(Compress(v, mask), d, unaligned); - return CountTrue(d, mask); -} - -// ------------------------------ CompressBlendedStore -template > -HWY_API size_t CompressBlendedStore(Vec1 v, const Mask1 mask, D d, - T* HWY_RESTRICT unaligned) { - if (!mask.bits) return 0; - StoreU(v, d, unaligned); - return 1; -} - -// ------------------------------ CompressBits -template -HWY_API Vec1 CompressBits(Vec1 v, const uint8_t* HWY_RESTRICT /*bits*/) { - return v; -} - -// ------------------------------ CompressBitsStore -template > -HWY_API size_t CompressBitsStore(Vec1 v, const uint8_t* HWY_RESTRICT bits, - D d, T* HWY_RESTRICT unaligned) { - const Mask1 mask = LoadMaskBits(d, bits); - StoreU(Compress(v, mask), d, unaligned); - return CountTrue(d, mask); -} - -// ------------------------------ Expand - -// generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here. -#ifdef HWY_NATIVE_EXPAND -#undef HWY_NATIVE_EXPAND -#else -#define HWY_NATIVE_EXPAND -#endif - -template -HWY_API Vec1 Expand(Vec1 v, const Mask1 mask) { - return IfThenElseZero(mask, v); -} - -// ------------------------------ LoadExpand -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { - return MaskedLoad(mask, d, unaligned); -} - -// ------------------------------ WidenMulPairwiseAdd - -template -HWY_API Vec1 WidenMulPairwiseAdd(D32 /* tag */, Vec1 a, - Vec1 b) { - return Vec1(F32FromBF16(a.raw)) * Vec1(F32FromBF16(b.raw)); -} - -template -HWY_API Vec1 WidenMulPairwiseAdd(D32 /* tag */, Vec1 a, - Vec1 b) { - return Vec1(a.raw * b.raw); -} - -// ------------------------------ SatWidenMulPairwiseAdd - -#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#else -#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#endif - -template -HWY_API Vec1 SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1 a, - Vec1 b) { - // Saturation of a.raw * b.raw is not needed on the HWY_SCALAR target as the - // input vectors only have 1 lane on the HWY_SCALAR target and as - // a.raw * b.raw is between -32640 and 32385, which is already within the - // range of an int16_t. - - // On other targets, a saturated addition of a[0]*b[0] + a[1]*b[1] is needed - // as it is possible for the addition of a[0]*b[0] + a[1]*b[1] to overflow if - // a[0], a[1], b[0], and b[1] are all non-zero and b[0] and b[1] both have the - // same sign. 
- - return Vec1(static_cast(a.raw) * - static_cast(b.raw)); -} - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) - -template -HWY_API Vec1 ReorderWidenMulAccumulate(D32 /* tag */, Vec1 a, - Vec1 b, - const Vec1 sum0, - Vec1& /* sum1 */) { - return MulAdd(Vec1(F32FromBF16(a.raw)), - Vec1(F32FromBF16(b.raw)), sum0); -} - -template -HWY_API Vec1 ReorderWidenMulAccumulate(D32 /* tag */, Vec1 a, - Vec1 b, - const Vec1 sum0, - Vec1& /* sum1 */) { - return Vec1(a.raw * b.raw + sum0.raw); -} - -template -HWY_API Vec1 ReorderWidenMulAccumulate(DU32 /* tag */, - Vec1 a, - Vec1 b, - const Vec1 sum0, - Vec1& /* sum1 */) { - return Vec1(static_cast(a.raw) * b.raw + sum0.raw); -} - -// ------------------------------ RearrangeToOddPlusEven -template -HWY_API Vec1 RearrangeToOddPlusEven(Vec1 sum0, Vec1 /* sum1 */) { - return sum0; // invariant already holds -} - -// ================================================== REDUCTIONS - -// Sum of all lanes, i.e. the only one. -template > -HWY_API Vec1 SumOfLanes(D /* tag */, const Vec1 v) { - return v; -} -template > -HWY_API T ReduceSum(D /* tag */, const Vec1 v) { - return GetLane(v); -} -template > -HWY_API Vec1 MinOfLanes(D /* tag */, const Vec1 v) { - return v; -} -template > -HWY_API Vec1 MaxOfLanes(D /* tag */, const Vec1 v) { - return v; -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/set_macros-inl.h b/deps/highway/include/hwy/ops/set_macros-inl.h deleted file mode 100644 index d8bed3e2..00000000 --- a/deps/highway/include/hwy/ops/set_macros-inl.h +++ /dev/null @@ -1,578 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Sets macros based on HWY_TARGET. - -// This include guard is toggled by foreach_target, so avoid the usual _H_ -// suffix to prevent copybara from renaming it. 
-#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE) -#ifdef HWY_SET_MACROS_PER_TARGET -#undef HWY_SET_MACROS_PER_TARGET -#else -#define HWY_SET_MACROS_PER_TARGET -#endif - -#endif // HWY_SET_MACROS_PER_TARGET - -#include "hwy/detect_compiler_arch.h" // IWYU: export -#include "hwy/detect_targets.h" // IWYU: export - -#undef HWY_NAMESPACE -#undef HWY_ALIGN -#undef HWY_MAX_BYTES -#undef HWY_LANES - -#undef HWY_HAVE_SCALABLE -#undef HWY_HAVE_TUPLE -#undef HWY_HAVE_INTEGER64 -#undef HWY_HAVE_FLOAT16 -#undef HWY_HAVE_FLOAT64 -#undef HWY_MEM_OPS_MIGHT_FAULT -#undef HWY_NATIVE_FMA -#undef HWY_CAP_GE256 -#undef HWY_CAP_GE512 - -// Supported on all targets except RVV (requires GCC 14 or upcoming Clang) -#if HWY_TARGET == HWY_RVV && \ - ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \ - (HWY_COMPILER_CLANG)) -#define HWY_HAVE_TUPLE 0 -#else -#define HWY_HAVE_TUPLE 1 -#endif - -// For internal use (clamping/validating N for Simd<>) -#undef HWY_MAX_N -#if HWY_TARGET == HWY_SCALAR -#define HWY_MAX_N 1 -#else -#define HWY_MAX_N 65536 -#endif - -// For internal use (clamping kPow2 for Simd<>) -#undef HWY_MAX_POW2 -// For HWY_TARGET == HWY_RVV, LMUL <= 8. Even on other targets, we want to -// support say Rebind> d; whose kPow2 is also 3. -// However, those other targets do not actually support multiple vectors, and -// thus Lanes(d) must not exceed Lanes(ScalableTag()). -#define HWY_MAX_POW2 3 - -// User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >> -// (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions. -#undef HWY_MIN_POW2 -#if HWY_TARGET == HWY_RVV -#define HWY_MIN_POW2 -16 -#else -// Tighter bound for other targets, whose vectors are smaller, to potentially -// save compile time. -#define HWY_MIN_POW2 -8 -#endif // HWY_TARGET == HWY_RVV - -#undef HWY_TARGET_STR - -#if defined(HWY_DISABLE_PCLMUL_AES) -#define HWY_TARGET_STR_PCLMUL_AES "" -#else -#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes" -#endif - -#if defined(HWY_DISABLE_BMI2_FMA) -#define HWY_TARGET_STR_BMI2_FMA "" -#else -#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma" -#endif - -#if defined(HWY_DISABLE_F16C) -#define HWY_TARGET_STR_F16C "" -#else -#define HWY_TARGET_STR_F16C ",f16c" -#endif - -#define HWY_TARGET_STR_SSE2 "sse2" - -#define HWY_TARGET_STR_SSSE3 "sse2,ssse3" - -#define HWY_TARGET_STR_SSE4 \ - HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES -// Include previous targets, which are the half-vectors of the next target. -#define HWY_TARGET_STR_AVX2 \ - HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C -#define HWY_TARGET_STR_AVX3 \ - HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" -#define HWY_TARGET_STR_AVX3_DL \ - HWY_TARGET_STR_AVX3 \ - ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \ - "avx512vpopcntdq,gfni" - -#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_DL ",avx512fp16" - -#if defined(HWY_DISABLE_PPC8_CRYPTO) -#define HWY_TARGET_STR_PPC8_CRYPTO "" -#else -#define HWY_TARGET_STR_PPC8_CRYPTO ",crypto" -#endif - -#define HWY_TARGET_STR_PPC8 \ - "altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO -#define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector" - -#if HWY_COMPILER_CLANG -#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector" -#else -#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10" -#endif - -// Before include guard so we redefine HWY_TARGET_STR on each include, -// governed by the current HWY_TARGET. 
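The HWY_TARGET_STR_* feature lists above are composed via adjacent string-literal concatenation, so each wider x86 target's attribute string inherits the narrower target's features. A compile-time sketch of the resulting SSE4 string, assuming none of the HWY_DISABLE_* opt-outs are defined:

// Adjacent literals merge into a single comma-separated feature list, which
// is later passed to __attribute__((target(...))) via HWY_ATTR.
static_assert(sizeof("sse2,ssse3" ",sse4.1,sse4.2" ",pclmul,aes") ==
                  sizeof("sse2,ssse3,sse4.1,sse4.2,pclmul,aes"),
              "SSE4 feature string is a single comma-separated list");

AVX2 then appends ",avx,avx2" plus the BMI2/FMA and F16C fragments to this string, and AVX3 extends the AVX2 string in turn.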
- -//----------------------------------------------------------------------------- -// SSE2 -#if HWY_TARGET == HWY_SSE2 - -#define HWY_NAMESPACE N_SSE2 -#define HWY_ALIGN alignas(16) -#define HWY_MAX_BYTES 16 -#define HWY_LANES(T) (16 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#define HWY_NATIVE_FMA 0 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#define HWY_TARGET_STR HWY_TARGET_STR_SSE2 -//----------------------------------------------------------------------------- -// SSSE3 -#elif HWY_TARGET == HWY_SSSE3 - -#define HWY_NAMESPACE N_SSSE3 -#define HWY_ALIGN alignas(16) -#define HWY_MAX_BYTES 16 -#define HWY_LANES(T) (16 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#define HWY_NATIVE_FMA 0 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3 - -//----------------------------------------------------------------------------- -// SSE4 -#elif HWY_TARGET == HWY_SSE4 - -#define HWY_NAMESPACE N_SSE4 -#define HWY_ALIGN alignas(16) -#define HWY_MAX_BYTES 16 -#define HWY_LANES(T) (16 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#define HWY_NATIVE_FMA 0 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#define HWY_TARGET_STR HWY_TARGET_STR_SSE4 - -//----------------------------------------------------------------------------- -// AVX2 -#elif HWY_TARGET == HWY_AVX2 - -#define HWY_NAMESPACE N_AVX2 -#define HWY_ALIGN alignas(32) -#define HWY_MAX_BYTES 32 -#define HWY_LANES(T) (32 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 1 - -#ifdef HWY_DISABLE_BMI2_FMA -#define HWY_NATIVE_FMA 0 -#else -#define HWY_NATIVE_FMA 1 -#endif - -#define HWY_CAP_GE256 1 -#define HWY_CAP_GE512 0 - -#define HWY_TARGET_STR HWY_TARGET_STR_AVX2 - -//----------------------------------------------------------------------------- -// AVX3[_DL] -#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ - HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR - -#define HWY_ALIGN alignas(64) -#define HWY_MAX_BYTES 64 -#define HWY_LANES(T) (64 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#if (HWY_TARGET == HWY_AVX3_SPR) && 0 // TODO(janwas): enable after testing -#define HWY_HAVE_FLOAT16 1 -#else -#define HWY_HAVE_FLOAT16 0 -#endif -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 0 -#define HWY_NATIVE_FMA 1 -#define HWY_CAP_GE256 1 -#define HWY_CAP_GE512 1 - -#if HWY_TARGET == HWY_AVX3 - -#define HWY_NAMESPACE N_AVX3 -#define HWY_TARGET_STR HWY_TARGET_STR_AVX3 - -#elif HWY_TARGET == HWY_AVX3_DL - -#define HWY_NAMESPACE N_AVX3_DL -#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL - -#elif HWY_TARGET == HWY_AVX3_ZEN4 - -#define HWY_NAMESPACE N_AVX3_ZEN4 -// Currently the same as HWY_AVX3_DL: both support Icelake. 
-#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL - -#elif HWY_TARGET == HWY_AVX3_SPR - -#define HWY_NAMESPACE N_AVX3_SPR -#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR - -#else -#error "Logic error" -#endif // HWY_TARGET - -//----------------------------------------------------------------------------- -// PPC8, PPC9, PPC10 -#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \ - HWY_TARGET == HWY_PPC10 - -#define HWY_ALIGN alignas(16) -#define HWY_MAX_BYTES 16 -#define HWY_LANES(T) (16 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#define HWY_NATIVE_FMA 1 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#if HWY_TARGET == HWY_PPC8 - -#define HWY_NAMESPACE N_PPC8 -#define HWY_TARGET_STR HWY_TARGET_STR_PPC8 - -#elif HWY_TARGET == HWY_PPC9 - -#define HWY_NAMESPACE N_PPC9 -#define HWY_TARGET_STR HWY_TARGET_STR_PPC9 - -#elif HWY_TARGET == HWY_PPC10 - -#define HWY_NAMESPACE N_PPC10 -#define HWY_TARGET_STR HWY_TARGET_STR_PPC10 - -#else -#error "Logic error" -#endif // HWY_TARGET == HWY_PPC10 - -//----------------------------------------------------------------------------- -// NEON -#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES - -#define HWY_ALIGN alignas(16) -#define HWY_MAX_BYTES 16 -#define HWY_LANES(T) (16 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -#define HWY_HAVE_FLOAT16 1 -#else -#define HWY_HAVE_FLOAT16 0 -#endif - -#if HWY_ARCH_ARM_A64 -#define HWY_HAVE_FLOAT64 1 -#else -#define HWY_HAVE_FLOAT64 0 -#endif - -#define HWY_MEM_OPS_MIGHT_FAULT 1 - -#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 -#define HWY_NATIVE_FMA 1 -#else -#define HWY_NATIVE_FMA 0 -#endif - -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#if HWY_TARGET == HWY_NEON_WITHOUT_AES -#define HWY_NAMESPACE N_NEON_WITHOUT_AES -#else -#define HWY_NAMESPACE N_NEON -#endif - -// Can use pragmas instead of -march compiler flag -#if HWY_HAVE_RUNTIME_DISPATCH -#if HWY_ARCH_ARM_V7 - -// The __attribute__((target(+neon-vfpv4)) was introduced in gcc >= 8. -#if HWY_COMPILER_GCC_ACTUAL >= 800 -#define HWY_TARGET_STR "+neon-vfpv4" -#else // GCC < 7 -// Do not define HWY_TARGET_STR (no pragma). -#endif // HWY_COMPILER_GCC_ACTUAL - -#else // !HWY_ARCH_ARM_V7 - -#if HWY_TARGET == HWY_NEON_WITHOUT_AES -// Do not define HWY_TARGET_STR (no pragma). -#else -#define HWY_TARGET_STR "+crypto" -#endif // HWY_TARGET == HWY_NEON_WITHOUT_AES - -#endif // HWY_ARCH_ARM_V7 -#else // !HWY_HAVE_RUNTIME_DISPATCH -// HWY_TARGET_STR remains undefined -#endif - -//----------------------------------------------------------------------------- -// SVE[2] -#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \ - HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 - -// SVE only requires lane alignment, not natural alignment of the entire vector. -#define HWY_ALIGN alignas(8) - -// Value ensures MaxLanes() is the tightest possible upper bound to reduce -// overallocation. 
-#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T)) - -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 0 -#define HWY_NATIVE_FMA 1 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#if HWY_TARGET == HWY_SVE2 -#define HWY_NAMESPACE N_SVE2 -#define HWY_MAX_BYTES 256 -#define HWY_HAVE_SCALABLE 1 -#elif HWY_TARGET == HWY_SVE_256 -#define HWY_NAMESPACE N_SVE_256 -#define HWY_MAX_BYTES 32 -#define HWY_HAVE_SCALABLE 0 -#elif HWY_TARGET == HWY_SVE2_128 -#define HWY_NAMESPACE N_SVE2_128 -#define HWY_MAX_BYTES 16 -#define HWY_HAVE_SCALABLE 0 -#else -#define HWY_NAMESPACE N_SVE -#define HWY_MAX_BYTES 256 -#define HWY_HAVE_SCALABLE 1 -#endif - -// Can use pragmas instead of -march compiler flag -#if HWY_HAVE_RUNTIME_DISPATCH -#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128 -#define HWY_TARGET_STR "+sve2-aes" -#else -#define HWY_TARGET_STR "+sve" -#endif -#else -// HWY_TARGET_STR remains undefined -#endif - -//----------------------------------------------------------------------------- -// WASM -#elif HWY_TARGET == HWY_WASM - -#define HWY_ALIGN alignas(16) -#define HWY_MAX_BYTES 16 -#define HWY_LANES(T) (16 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#define HWY_NATIVE_FMA 0 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#define HWY_NAMESPACE N_WASM - -#define HWY_TARGET_STR "simd128" - -//----------------------------------------------------------------------------- -// WASM_EMU256 -#elif HWY_TARGET == HWY_WASM_EMU256 - -#define HWY_ALIGN alignas(32) -#define HWY_MAX_BYTES 32 -#define HWY_LANES(T) (32 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 0 -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#define HWY_NATIVE_FMA 0 -#define HWY_CAP_GE256 1 -#define HWY_CAP_GE512 0 - -#define HWY_NAMESPACE N_WASM_EMU256 - -#define HWY_TARGET_STR "simd128" - -//----------------------------------------------------------------------------- -// RVV -#elif HWY_TARGET == HWY_RVV - -// RVV only requires lane alignment, not natural alignment of the entire vector, -// and the compiler already aligns builtin types, so nothing to do here. -#define HWY_ALIGN - -// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8). -#define HWY_MAX_BYTES 65536 - -// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual -// LMUL. This is the tightest possible upper bound. -#define HWY_LANES(T) (8192 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 1 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 0 -#define HWY_NATIVE_FMA 1 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#if defined(__riscv_zvfh) -#define HWY_HAVE_FLOAT16 1 -#else -#define HWY_HAVE_FLOAT16 0 -#endif - -#define HWY_NAMESPACE N_RVV - -// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. 
-// (rv64gcv is not a valid target) - -//----------------------------------------------------------------------------- -// EMU128 -#elif HWY_TARGET == HWY_EMU128 - -#define HWY_ALIGN alignas(16) -#define HWY_MAX_BYTES 16 -#define HWY_LANES(T) (16 / sizeof(T)) - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#define HWY_NATIVE_FMA 0 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#define HWY_NAMESPACE N_EMU128 - -// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. - -//----------------------------------------------------------------------------- -// SCALAR -#elif HWY_TARGET == HWY_SCALAR - -#define HWY_ALIGN -#define HWY_MAX_BYTES 8 -#define HWY_LANES(T) 1 - -#define HWY_HAVE_SCALABLE 0 -#define HWY_HAVE_INTEGER64 1 -#define HWY_HAVE_FLOAT16 0 -#define HWY_HAVE_FLOAT64 1 -#define HWY_MEM_OPS_MIGHT_FAULT 0 -#define HWY_NATIVE_FMA 0 -#define HWY_CAP_GE256 0 -#define HWY_CAP_GE512 0 - -#define HWY_NAMESPACE N_SCALAR - -// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op. - -#else -#pragma message("HWY_TARGET does not match any known target") -#endif // HWY_TARGET - -// Override this to 1 in asan/msan builds, which will still fault. -#if HWY_IS_ASAN || HWY_IS_MSAN -#undef HWY_MEM_OPS_MIGHT_FAULT -#define HWY_MEM_OPS_MIGHT_FAULT 1 -#endif - -// Clang <9 requires this be invoked at file scope, before any namespace. -#undef HWY_BEFORE_NAMESPACE -#if defined(HWY_TARGET_STR) -#define HWY_BEFORE_NAMESPACE() \ - HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \ - static_assert(true, "For requiring trailing semicolon") -#else -// avoids compiler warning if no HWY_TARGET_STR -#define HWY_BEFORE_NAMESPACE() \ - static_assert(true, "For requiring trailing semicolon") -#endif - -// Clang <9 requires any namespaces be closed before this macro. -#undef HWY_AFTER_NAMESPACE -#if defined(HWY_TARGET_STR) -#define HWY_AFTER_NAMESPACE() \ - HWY_POP_ATTRIBUTES \ - static_assert(true, "For requiring trailing semicolon") -#else -// avoids compiler warning if no HWY_TARGET_STR -#define HWY_AFTER_NAMESPACE() \ - static_assert(true, "For requiring trailing semicolon") -#endif - -#undef HWY_ATTR -#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target) -#define HWY_ATTR __attribute__((target(HWY_TARGET_STR))) -#else -#define HWY_ATTR -#endif diff --git a/deps/highway/include/hwy/ops/shared-inl.h b/deps/highway/include/hwy/ops/shared-inl.h deleted file mode 100644 index 8b4ec41a..00000000 --- a/deps/highway/include/hwy/ops/shared-inl.h +++ /dev/null @@ -1,520 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Per-target definitions shared by ops/*.h and user code. - -// IWYU pragma: begin_exports -// Export does not seem to be recursive, so re-export these (also in base.h) -#include - -#include "hwy/base.h" -// "IWYU pragma: keep" does not work for this include, so hide it from the IDE. 
-#if !HWY_IDE -#include -#endif - -#include "hwy/detect_compiler_arch.h" - -// Separate header because foreach_target.h re-enables its include guard. -#include "hwy/ops/set_macros-inl.h" - -// IWYU pragma: end_exports - -#if HWY_IS_MSAN -#include -#endif - -// We are covered by the highway.h include guard, but generic_ops-inl.h -// includes this again #if HWY_IDE. -#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE) -#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE -#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE -#else -#define HIGHWAY_HWY_OPS_SHARED_TOGGLE -#endif - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -// NOTE: GCC generates incorrect code for vector arguments to non-inlined -// functions in two situations: -// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads: -// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412. -// - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not -// all) tests to fail. -// -// We therefore pass by const& only on GCC and (Windows or aarch64). This alias -// must be used for all vector/mask parameters of functions marked HWY_NOINLINE, -// and possibly also other functions that are not inlined. -#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64) -template -using VecArg = const V&; -#else -template -using VecArg = V; -#endif - -namespace detail { - -// Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the -// desired fraction or multiple of it, see Simd<>. `pow2` is most often in -// [-3, 3] but can also be lower for user-specified fractions. -constexpr size_t ScaleByPower(size_t N, int pow2) { - return pow2 >= 0 ? (N << pow2) : (N >> (-pow2)); -} - -template -HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) { - // Workaround for MSAN not marking compressstore as initialized (b/233326619) -#if HWY_IS_MSAN - __msan_unpoison(unaligned, count * sizeof(T)); -#else - (void)unaligned; - (void)count; -#endif -} - -} // namespace detail - -// Highway operations are implemented as overloaded functions selected using a -// zero-sized tag type D := Simd. T denotes the lane type. -// -// N defines how many lanes are in a 'full' vector, typically equal to -// HWY_LANES(T) (which is the actual count on targets with vectors of known -// size, and an upper bound in case of scalable vectors), otherwise a -// user-specified limit at most that large. -// -// 2^kPow2 is a _subsequently_ applied scaling factor that indicates the -// desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3 -// means two/four/eight full vectors ganged together. The largest supported -// kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping -// user-specified values to that. Note that `Simd` and `Simd` -// have the same `MaxLanes` and `Lanes`. -// -// We can theoretically keep halving Lanes(), but recursive instantiations of -// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count. -// Users must terminate such compile-time recursions at or above HWY_MIN_POW2. -// -// WARNING: do not use N directly because it may be a special representation of -// a fractional MaxLanes. This arises when we Rebind Simd to -// Simd. RVV requires that the last argument (kPow2) be two, -// but we want MaxLanes to be the same in both cases. Hence ?? is a -// fixed-point encoding of 1/4. 
-// -// Instead of referring to Simd<> directly, users create D via aliases: -// - ScalableTag for a full vector; -// - ScalableTag() for a fraction/group, where `kPow2` is -// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`; -// - CappedTag for a vector with up to kLimit lanes; or -// - FixedTag for a vector with exactly kNumLanes lanes. -// -// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and -// D().MaxLanes() for a constexpr upper bound. Both are powers of two. -template -struct Simd { - constexpr Simd() = default; - using T = Lane; - - private: - static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit"); - // 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of - // N when kFrac == 0, otherwise it is one (see FracN). - static constexpr size_t kWhole = N & 0xFFFFF; - // Fractional part is in the bits above kWhole. - static constexpr int kFrac = static_cast(N >> 20); - // Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger - // type to u8 results in fractions). - static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range"); - static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1"); - static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x"); - // Important to check this here because kPow2 <= -64 causes confusing - // compile errors (invalid shift count). - static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?"); - // However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to - // Rebind> in order to discover that its - // kPow2 is out of bounds. - - public: - // Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the - // common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2. - // E.g. Rebind> is Simd. - // The resulting number of lanes is still 1 because this N represents 1/4 - // (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of - // the sizes so that the correct LMUL overloads are chosen, even if N is - // small enough that it would fit in an LMUL=1 vector. - // - // Cannot be an enum because GCC warns when using enums and non-enums in the - // same expression. Cannot be a static constexpr function (MSVC limitation). - // Rounded up to one so this is a valid array length. - // - // Do not use this directly - only 'public' so it is visible from the accessor - // macro required by MSVC. - static constexpr size_t kPrivateLanes = - HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac)); - - constexpr size_t MaxLanes() const { return kPrivateLanes; } - constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); } - constexpr size_t MaxBlocks() const { return (MaxBytes() + 15) / 16; } - // For SFINAE on RVV. - constexpr int Pow2() const { return kPow2; } - - // ------------------------------ Changing lane type or count - // Do not use any of these directly. Anything used from member typedefs cannot - // be made private, but functions only used within other functions can. - - // Returns number of NewT lanes that fit within MaxBytes(). - template - static constexpr size_t RepartitionLanes() { - // Round up to correctly handle larger NewT. - return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT); - } - - // Returns the new kPow2 required for lanes of type NewT. - template - static constexpr int RebindPow2() { - return kPow2 + - ((sizeof(NewT) >= sizeof(T)) - ? 
static_cast(CeilLog2(sizeof(NewT) / sizeof(T))) - : -static_cast(CeilLog2(sizeof(T) / sizeof(NewT)))); - } - - private: - // Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2. - template - static constexpr size_t WholeN() { - return detail::ScaleByPower(kNewMaxLanes, -kNewPow2); - } - - // Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2. - template - static constexpr size_t FracN() { - // Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN - // would not have been zero), but clamp to zero to avoid warnings. kFrac is - // the difference, stored in the upper bits of N, and we also set kWhole = - // 1 so that the new kPrivateLanes = kNewMaxLanes. - static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift"); - return static_cast( - 1 + (HWY_MAX(0, kNewPow2 - static_cast(CeilLog2(kNewMaxLanes))) - << 20)); - } - - public: - // Returns (whole or fractional) NewN, see above. - template - static constexpr size_t NewN() { - // We require a fraction if inverting kNewPow2 results in 0. - return WholeN() == 0 - ? FracN() - : WholeN(); - } - - // PromoteTo/DemoteTo() with another lane type, but same number of lanes. - template - using Rebind = - Simd(), kPrivateLanes>(), RebindPow2()>; - - // Change lane type while keeping the same vector size, e.g. for MulEven. - template - using Repartition = - Simd()>(), kPow2>; - - // Half the lanes while keeping the same lane type, e.g. for LowerHalf. - using Half = Simd; - - // Twice the lanes while keeping the same lane type, e.g. for Combine. - using Twice = Simd; -}; - -namespace detail { - -template -constexpr bool IsFull(Simd /* d */) { - return N == HWY_LANES(T) && kPow2 == 0; -} - -// Struct wrappers enable validation of arguments via static_assert. -template -struct ClampNAndPow2 { - using type = Simd; -}; - -template -struct ScalableTagChecker { - using type = typename ClampNAndPow2::type; -}; - -template -struct CappedTagChecker { - static_assert(kLimit != 0, "Does not make sense to have zero lanes"); - // Safely handle non-power-of-two inputs by rounding down, which is allowed by - // CappedTag. Otherwise, Simd would static_assert. - static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit); - static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T)); - using type = typename ClampNAndPow2::type; -}; - -template -struct FixedTagChecker { - static_assert(kNumLanes != 0, "Does not make sense to have zero lanes"); - static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes"); - using type = Simd; -}; - -} // namespace detail - -// ------------------------------ Aliases for Simd<> - -// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D -// loops where the application does not care about the vector size) or a -// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or -// return values of type promotion and demotion. User-specified kPow2 is -// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`. -template -using ScalableTag = typename detail::ScalableTagChecker::type; - -// Tag describing a vector with *up to* kLimit active lanes, even on targets -// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may -// be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for -// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8 -// DCTs. However, it is better if data structures are designed to be -// vector-length-agnostic (e.g. 
a hybrid SoA where there are chunks of `M >= -// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would -// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2 -// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`. -template -using CappedTag = typename detail::CappedTagChecker::type; - -#if !HWY_HAVE_SCALABLE -// If the vector size is known, and the app knows it does not want more than -// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower -// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2. -template -using CappedTagIfFixed = CappedTag; -#else // HWY_HAVE_SCALABLE -// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit. -template -using CappedTagIfFixed = ScalableTag; -#endif - -// Alias for a tag describing a vector with *exactly* kNumLanes active lanes, -// even on targets with scalable vectors. Requires `kNumLanes` to be a power of -// two not exceeding `HWY_LANES(T)`. -// -// NOTE: if the application does not need to support HWY_SCALAR (+), use this -// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes. -// This is useful for data structures that rely on exactly 128-bit SIMD, but -// these are discouraged because they cannot benefit from wider vectors. -// Instead, applications would ideally define a larger problem size and loop -// over it with the (unknown size) vectors from ScalableTag. -// -// + e.g. if the baseline is known to support SIMD, or the application requires -// ops such as TableLookupBytes not supported by HWY_SCALAR. -template -using FixedTag = typename detail::FixedTagChecker::type; - -// Convenience form for fixed sizes. -template -using Full16 = Simd; - -template -using Full32 = Simd; - -template -using Full64 = Simd; - -template -using Full128 = Simd; - -// ------------------------------ Accessors for Simd<> - -// Lane type. -template -using TFromD = typename D::T; - -// Upper bound on the number of lanes, typically used for SFINAE conditions and -// to allocate storage for targets with known vector sizes. Note: this may be a -// loose bound, instead use Lanes() as the actual size for AllocateAligned. -// MSVC workaround: use static constant directly instead of a function. -#define HWY_MAX_LANES_D(D) D::kPrivateLanes - -// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the -// macro form may be required for MSVC, which has limitations on deducing -// arguments. -template -HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) { - return HWY_MAX_LANES_D(D); -} - -#if !HWY_HAVE_SCALABLE - -// If non-scalable, this is constexpr; otherwise the target's header defines a -// non-constexpr version of this function. This is the actual vector length, -// used when advancing loop counters. -template -HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) { - return HWY_MAX_LANES_D(D); -} - -#endif // !HWY_HAVE_SCALABLE - -// Tag for the same number of lanes as D, but with the LaneType T. -template -using Rebind = typename D::template Rebind; - -template -using RebindToSigned = Rebind>, D>; -template -using RebindToUnsigned = Rebind>, D>; -template -using RebindToFloat = Rebind>, D>; - -// Tag for the same total size as D, but with the LaneType T. -template -using Repartition = typename D::template Repartition; - -template -using RepartitionToWide = Repartition>, D>; -template -using RepartitionToNarrow = Repartition>, D>; - -// Tag for the same lane type as D, but half the lanes. 
-template -using Half = typename D::Half; - -// Tag for the same lane type as D, but twice the lanes. -template -using Twice = typename D::Twice; - -// Tag for a 16-byte block with the same lane type as D -#if HWY_HAVE_SCALABLE -namespace detail { - -template -class BlockDFromD_t {}; - -template -class BlockDFromD_t> { - using D = Simd; - static constexpr int kNewPow2 = HWY_MIN(kPow2, 0); - static constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(D)); - static constexpr size_t kNewN = D::template NewN(); - - public: - using type = Simd; -}; - -} // namespace detail - -template -using BlockDFromD = typename detail::BlockDFromD_t>::type; -#else -template -using BlockDFromD = - Simd, HWY_MIN(16 / sizeof(TFromD), HWY_MAX_LANES_D(D)), 0>; -#endif - -// ------------------------------ Choosing overloads (SFINAE) - -// Same as base.h macros but with a Simd argument instead of T. -#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD) -#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD) -#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD) -#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD) -#define HWY_IF_FLOAT3264_D(D) HWY_IF_FLOAT3264(TFromD) -#define HWY_IF_NOT_FLOAT3264_D(D) HWY_IF_NOT_FLOAT3264(TFromD) -#define HWY_IF_SPECIAL_FLOAT_D(D) HWY_IF_SPECIAL_FLOAT(TFromD) -#define HWY_IF_NOT_SPECIAL_FLOAT_D(D) HWY_IF_NOT_SPECIAL_FLOAT(TFromD) -#define HWY_IF_FLOAT_OR_SPECIAL_D(D) HWY_IF_FLOAT_OR_SPECIAL(TFromD) -#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D) \ - HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD) - -#define HWY_IF_T_SIZE_D(D, bytes) HWY_IF_T_SIZE(TFromD, bytes) -#define HWY_IF_NOT_T_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE(TFromD, bytes) -#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \ - HWY_IF_T_SIZE_ONE_OF(TFromD, bit_array) - -#define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes) -#define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes) -#define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes) -#define HWY_IF_LANES_PER_BLOCK_D(D, lanes) \ - HWY_IF_LANES_PER_BLOCK( \ - TFromD, HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD)), lanes) - -#define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf* = nullptr -#define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr - -#define HWY_IF_U8_D(D) hwy::EnableIf, uint8_t>()>* = nullptr -#define HWY_IF_U16_D(D) hwy::EnableIf, uint16_t>()>* = nullptr -#define HWY_IF_U32_D(D) hwy::EnableIf, uint32_t>()>* = nullptr -#define HWY_IF_U64_D(D) hwy::EnableIf, uint64_t>()>* = nullptr - -#define HWY_IF_I8_D(D) hwy::EnableIf, int8_t>()>* = nullptr -#define HWY_IF_I16_D(D) hwy::EnableIf, int16_t>()>* = nullptr -#define HWY_IF_I32_D(D) hwy::EnableIf, int32_t>()>* = nullptr -#define HWY_IF_I64_D(D) hwy::EnableIf, int64_t>()>* = nullptr - -// Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float16_t/float/double -// overloads. 
-#define HWY_IF_UI16_D(D) HWY_IF_UI16(TFromD) -#define HWY_IF_UI32_D(D) HWY_IF_UI32(TFromD) -#define HWY_IF_UI64_D(D) HWY_IF_UI64(TFromD) - -#define HWY_IF_BF16_D(D) \ - hwy::EnableIf, hwy::bfloat16_t>()>* = nullptr -#define HWY_IF_F16_D(D) \ - hwy::EnableIf, hwy::float16_t>()>* = nullptr -#define HWY_IF_F32_D(D) hwy::EnableIf, float>()>* = nullptr -#define HWY_IF_F64_D(D) hwy::EnableIf, double>()>* = nullptr - -#define HWY_IF_V_SIZE_D(D, bytes) \ - HWY_IF_V_SIZE(TFromD, HWY_MAX_LANES_D(D), bytes) -#define HWY_IF_V_SIZE_LE_D(D, bytes) \ - HWY_IF_V_SIZE_LE(TFromD, HWY_MAX_LANES_D(D), bytes) -#define HWY_IF_V_SIZE_GT_D(D, bytes) \ - HWY_IF_V_SIZE_GT(TFromD, HWY_MAX_LANES_D(D), bytes) - -// Same, but with a vector argument. ops/*-inl.h define their own TFromV. -#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV) -#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV) -#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV) -#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(TFromV) -#define HWY_IF_SPECIAL_FLOAT_V(V) HWY_IF_SPECIAL_FLOAT(TFromV) -#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \ - HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromV) - -#define HWY_IF_T_SIZE_V(V, bytes) HWY_IF_T_SIZE(TFromV, bytes) -#define HWY_IF_NOT_T_SIZE_V(V, bytes) HWY_IF_NOT_T_SIZE(TFromV, bytes) -#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \ - HWY_IF_T_SIZE_ONE_OF(TFromV, bit_array) - -#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV) -#define HWY_IF_V_SIZE_V(V, bytes) \ - HWY_IF_V_SIZE(TFromV, HWY_MAX_LANES_V(V), bytes) -#define HWY_IF_V_SIZE_LE_V(V, bytes) \ - HWY_IF_V_SIZE_LE(TFromV, HWY_MAX_LANES_V(V), bytes) -#define HWY_IF_V_SIZE_GT_V(V, bytes) \ - HWY_IF_V_SIZE_GT(TFromV, HWY_MAX_LANES_V(V), bytes) - -// Old names (deprecated) -#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes) -#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes) - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); - -#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE diff --git a/deps/highway/include/hwy/ops/tuple-inl.h b/deps/highway/include/hwy/ops/tuple-inl.h deleted file mode 100644 index 9def0610..00000000 --- a/deps/highway/include/hwy/ops/tuple-inl.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Tuple support. Included by those ops/* that lack native tuple types, after -// they define VFromD and before they use the tuples e.g. for LoadInterleaved2. -// Assumes we are already in the HWY_NAMESPACE and under an include guard. - -// If viewing this header standalone, define VFromD to avoid IDE warnings. -// This is normally set by set_macros-inl.h before this header is included. -#if !defined(HWY_NAMESPACE) -#include "hwy/base.h" -template -using VFromD = int; -#endif - -// On SVE, Vec2..4 are aliases to built-in types. 
-template -struct Vec2 { - VFromD v0; - VFromD v1; -}; - -template -struct Vec3 { - VFromD v0; - VFromD v1; - VFromD v2; -}; - -template -struct Vec4 { - VFromD v0; - VFromD v1; - VFromD v2; - VFromD v3; -}; - -// D arg is unused but allows deducing D. -template -HWY_API Vec2 Create2(D /* tag */, VFromD v0, VFromD v1) { - return Vec2{v0, v1}; -} - -template -HWY_API Vec3 Create3(D /* tag */, VFromD v0, VFromD v1, VFromD v2) { - return Vec3{v0, v1, v2}; -} - -template -HWY_API Vec4 Create4(D /* tag */, VFromD v0, VFromD v1, VFromD v2, - VFromD v3) { - return Vec4{v0, v1, v2, v3}; -} - -template -HWY_API VFromD Get2(Vec2 tuple) { - static_assert(kIndex < 2, "Tuple index out of bounds"); - return kIndex == 0 ? tuple.v0 : tuple.v1; -} - -template -HWY_API VFromD Get3(Vec3 tuple) { - static_assert(kIndex < 3, "Tuple index out of bounds"); - return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2; -} - -template -HWY_API VFromD Get4(Vec4 tuple) { - static_assert(kIndex < 4, "Tuple index out of bounds"); - return kIndex == 0 ? tuple.v0 - : kIndex == 1 ? tuple.v1 - : kIndex == 2 ? tuple.v2 - : tuple.v3; -} - -template -HWY_API Vec2 Set2(Vec2 tuple, VFromD val) { - static_assert(kIndex < 2, "Tuple index out of bounds"); - if (kIndex == 0) { - tuple.v0 = val; - } else { - tuple.v1 = val; - } - return tuple; -} - -template -HWY_API Vec3 Set3(Vec3 tuple, VFromD val) { - static_assert(kIndex < 3, "Tuple index out of bounds"); - if (kIndex == 0) { - tuple.v0 = val; - } else if (kIndex == 1) { - tuple.v1 = val; - } else { - tuple.v2 = val; - } - return tuple; -} - -template -HWY_API Vec4 Set4(Vec4 tuple, VFromD val) { - static_assert(kIndex < 4, "Tuple index out of bounds"); - if (kIndex == 0) { - tuple.v0 = val; - } else if (kIndex == 1) { - tuple.v1 = val; - } else if (kIndex == 2) { - tuple.v2 = val; - } else { - tuple.v3 = val; - } - return tuple; -} \ No newline at end of file diff --git a/deps/highway/include/hwy/ops/wasm_128-inl.h b/deps/highway/include/hwy/ops/wasm_128-inl.h deleted file mode 100644 index b3f1b66d..00000000 --- a/deps/highway/include/hwy/ops/wasm_128-inl.h +++ /dev/null @@ -1,5718 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// 128-bit WASM vectors and operations. -// External include guard in highway.h - see comment there. 
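Aside: the Vec2/Vec3/Vec4 helpers deleted above back ops such as LoadInterleaved2; Create/Get/Set take the lane-group index as a template parameter so it is known at compile time. A minimal scalar sketch of that access pattern, with plain ints standing in for vectors (the names mirror the deleted code, everything else is illustrative):

#include <cstdio>

struct Tuple2 {  // stand-in for Vec2<D>; the int payload is illustrative
  int v0;
  int v1;
};

static Tuple2 Create2(int v0, int v1) { return Tuple2{v0, v1}; }

template <int kIndex>
int Get2(Tuple2 t) {
  static_assert(0 <= kIndex && kIndex < 2, "Tuple index out of bounds");
  return kIndex == 0 ? t.v0 : t.v1;
}

template <int kIndex>
Tuple2 Set2(Tuple2 t, int val) {
  static_assert(0 <= kIndex && kIndex < 2, "Tuple index out of bounds");
  if (kIndex == 0) {
    t.v0 = val;
  } else {
    t.v1 = val;
  }
  return t;
}

int main() {
  Tuple2 t = Create2(10, 20);
  t = Set2<1>(t, 99);
  std::printf("%d %d\n", Get2<0>(t), Get2<1>(t));  // prints: 10 99
  return 0;
}

The wasm_128-inl.h hunk continues below.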
- -#include - -#include "hwy/base.h" -#include "hwy/ops/shared-inl.h" - -#ifdef HWY_WASM_OLD_NAMES -#define wasm_i8x16_shuffle wasm_v8x16_shuffle -#define wasm_i16x8_shuffle wasm_v16x8_shuffle -#define wasm_i32x4_shuffle wasm_v32x4_shuffle -#define wasm_i64x2_shuffle wasm_v64x2_shuffle -#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16 -#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8 -#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8 -#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16 -#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8 -#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8 -#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4 -#define wasm_i62x2_trunc_sat_f64x2 wasm_i64x2_trunc_saturate_f64x2 -#define wasm_u8x16_add_sat wasm_u8x16_add_saturate -#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate -#define wasm_u16x8_add_sat wasm_u16x8_add_saturate -#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate -#define wasm_i8x16_add_sat wasm_i8x16_add_saturate -#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate -#define wasm_i16x8_add_sat wasm_i16x8_add_saturate -#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate -#endif - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -#if HWY_TARGET == HWY_WASM_EMU256 -template -using Full256 = Simd; -#endif - -namespace detail { - -template -struct Raw128 { - using type = __v128_u; -}; -template <> -struct Raw128 { - using type = __f32x4; -}; -template <> -struct Raw128 { - using type = __f64x2; -}; - -} // namespace detail - -template -class Vec128 { - using Raw = typename detail::Raw128::type; - - public: - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = N; // only for DFromV - - // Compound assignment. Only usable if there is a corresponding non-member - // binary operator overload. For example, only f32 and f64 support division. - HWY_INLINE Vec128& operator*=(const Vec128 other) { - return *this = (*this * other); - } - HWY_INLINE Vec128& operator/=(const Vec128 other) { - return *this = (*this / other); - } - HWY_INLINE Vec128& operator+=(const Vec128 other) { - return *this = (*this + other); - } - HWY_INLINE Vec128& operator-=(const Vec128 other) { - return *this = (*this - other); - } - HWY_INLINE Vec128& operator&=(const Vec128 other) { - return *this = (*this & other); - } - HWY_INLINE Vec128& operator|=(const Vec128 other) { - return *this = (*this | other); - } - HWY_INLINE Vec128& operator^=(const Vec128 other) { - return *this = (*this ^ other); - } - - Raw raw; -}; - -template -using Vec64 = Vec128; - -template -using Vec32 = Vec128; - -template -using Vec16 = Vec128; - -// FF..FF or 0. -template -struct Mask128 { - using PrivateT = T; // only for DFromM - static constexpr size_t kPrivateN = N; // only for DFromM - - typename detail::Raw128::type raw; -}; - -template -using DFromV = Simd; - -template -using DFromM = Simd; - -template -using TFromV = typename V::PrivateT; - -// ------------------------------ Zero - -// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. 
-template -HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { - return Vec128, HWY_MAX_LANES_D(D)>{wasm_i32x4_splat(0)}; -} -template -HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { - return Vec128, HWY_MAX_LANES_D(D)>{wasm_f32x4_splat(0.0f)}; -} -template -HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { - return Vec128, HWY_MAX_LANES_D(D)>{wasm_f64x2_splat(0.0)}; -} - -template -using VFromD = decltype(Zero(D())); - -// ------------------------------ Tuple (VFromD) -#include "hwy/ops/tuple-inl.h" - -// ------------------------------ BitCast - -namespace detail { - -HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; } -HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) { - return static_cast<__v128_u>(v); -} -HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) { - return static_cast<__v128_u>(v); -} - -template -HWY_INLINE Vec128 BitCastToByte(Vec128 v) { - return Vec128{BitCastToInteger(v.raw)}; -} - -// Cannot rely on function overloading because return types differ. -template -struct BitCastFromInteger128 { - HWY_INLINE __v128_u operator()(__v128_u v) { return v; } -}; -template <> -struct BitCastFromInteger128 { - HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); } -}; -template <> -struct BitCastFromInteger128 { - HWY_INLINE __f64x2 operator()(__v128_u v) { return static_cast<__f64x2>(v); } -}; - -template -HWY_INLINE VFromD BitCastFromByte(D d, Vec128 v) { - return VFromD{BitCastFromInteger128>()(v.raw)}; -} - -} // namespace detail - -template -HWY_API VFromD BitCast(D d, - Vec128().MaxLanes()> v) { - return detail::BitCastFromByte(d, detail::BitCastToByte(v)); -} - -// ------------------------------ ResizeBitCast - -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - const Repartition du8_to; - return BitCast(d, VFromD{detail::BitCastToInteger(v.raw)}); -} - -// ------------------------------ Set - -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{wasm_i8x16_splat(static_cast(t))}; -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{wasm_i16x8_splat(static_cast(t))}; -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{wasm_i32x4_splat(static_cast(t))}; -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{wasm_i64x2_splat(static_cast(t))}; -} - -template -HWY_API VFromD Set(D /* tag */, const float t) { - return VFromD{wasm_f32x4_splat(t)}; -} -template -HWY_API VFromD Set(D /* tag */, const double t) { - return VFromD{wasm_f64x2_splat(t)}; -} - -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") - -// For all vector sizes. -template -HWY_API VFromD Undefined(D d) { - return Zero(d); -} - -HWY_DIAGNOSTICS(pop) - -// For all vector sizes. 
-template , typename T2> -HWY_API VFromD Iota(D d, const T2 first) { - HWY_ALIGN T lanes[MaxLanes(d)]; - for (size_t i = 0; i < MaxLanes(d); ++i) { - lanes[i] = - AddWithWraparound(hwy::IsFloatTag(), static_cast(first), i); - } - return Load(d, lanes); -} - -// ================================================== ARITHMETIC - -// ------------------------------ Addition - -// Unsigned -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i8x16_add(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i16x8_add(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_add(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i64x2_add(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i8x16_add(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i16x8_add(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_add(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i64x2_add(a.raw, b.raw)}; -} - -// Float -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_f32x4_add(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_f64x2_add(a.raw, b.raw)}; -} - -// ------------------------------ Subtraction - -// Unsigned -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(Vec128 a, - Vec128 b) { - return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; -} - -// Float -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_f32x4_sub(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_f64x2_sub(a.raw, b.raw)}; -} - -// ------------------------------ SaturatedAdd - -// Returns a + b clamped to the destination range. 
- -// Unsigned -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; -} - -// ------------------------------ SaturatedSub - -// Returns a - b clamped to the destination range. - -// Unsigned -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; -} - -// ------------------------------ Average - -// Returns (a + b + 1) / 2 - -// Unsigned -template -HWY_API Vec128 AverageRound(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_u8x16_avgr(a.raw, b.raw)}; -} -template -HWY_API Vec128 AverageRound(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_u16x8_avgr(a.raw, b.raw)}; -} - -// ------------------------------ Absolute value - -// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. -template -HWY_API Vec128 Abs(const Vec128 v) { - return Vec128{wasm_i8x16_abs(v.raw)}; -} -template -HWY_API Vec128 Abs(const Vec128 v) { - return Vec128{wasm_i16x8_abs(v.raw)}; -} -template -HWY_API Vec128 Abs(const Vec128 v) { - return Vec128{wasm_i32x4_abs(v.raw)}; -} -template -HWY_API Vec128 Abs(const Vec128 v) { - return Vec128{wasm_i64x2_abs(v.raw)}; -} - -template -HWY_API Vec128 Abs(const Vec128 v) { - return Vec128{wasm_f32x4_abs(v.raw)}; -} -template -HWY_API Vec128 Abs(const Vec128 v) { - return Vec128{wasm_f64x2_abs(v.raw)}; -} - -// ------------------------------ Shift lanes by constant #bits - -// Unsigned -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{wasm_i16x8_shl(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{wasm_u16x8_shr(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{wasm_i32x4_shl(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{wasm_i64x2_shl(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{wasm_u32x4_shr(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{wasm_u64x2_shr(v.raw, kBits)}; -} - -// Signed -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{wasm_i16x8_shl(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{wasm_i16x8_shr(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{wasm_i32x4_shl(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{wasm_i64x2_shl(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{wasm_i32x4_shr(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{wasm_i64x2_shr(v.raw, 
kBits)}; -} - -// 8-bit -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. - const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; - return kBits == 1 - ? (v + v) - : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); -} - -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. - const Vec128 shifted{ - ShiftRight(Vec128{v.raw}).raw}; - return shifted & Set(d8, 0xFF >> kBits); -} - -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - const DFromV di; - const RebindToUnsigned du; - const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); - const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// ------------------------------ RotateRight (ShiftRight, Or) -template -HWY_API Vec128 RotateRight(const Vec128 v) { - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -// ------------------------------ Shift lanes by same variable #bits - -// After https://reviews.llvm.org/D108415 shift argument became unsigned. -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") - -// Unsigned -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i16x8_shl(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { - return Vec128{wasm_u16x8_shr(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i32x4_shl(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { - return Vec128{wasm_u32x4_shr(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i64x2_shl(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { - return Vec128{wasm_u64x2_shr(v.raw, bits)}; -} - -// Signed -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i16x8_shl(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i16x8_shr(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i32x4_shl(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i32x4_shr(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i64x2_shl(v.raw, bits)}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { - return Vec128{wasm_i64x2_shr(v.raw, bits)}; -} - -// 8-bit -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. - const Vec128 shifted{ - ShiftLeftSame(Vec128>{v.raw}, bits).raw}; - return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); -} - -template -HWY_API Vec128 ShiftRightSame(Vec128 v, - const int bits) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. 
- const Vec128 shifted{ - ShiftRightSame(Vec128{v.raw}, bits).raw}; - return shifted & Set(d8, 0xFF >> bits); -} - -template -HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { - const DFromV di; - const RebindToUnsigned du; - const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); - const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// ignore Wsign-conversion -HWY_DIAGNOSTICS(pop) - -// ------------------------------ Minimum - -// Unsigned -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{wasm_u8x16_min(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{wasm_u16x8_min(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{wasm_u32x4_min(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. - const uint64_t a0 = static_cast(wasm_i64x2_extract_lane(a.raw, 0)); - const uint64_t b0 = static_cast(wasm_i64x2_extract_lane(b.raw, 0)); - const uint64_t a1 = static_cast(wasm_i64x2_extract_lane(a.raw, 1)); - const uint64_t b1 = static_cast(wasm_i64x2_extract_lane(b.raw, 1)); - alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)}; - return Vec128{wasm_v128_load(min)}; -} - -// Signed -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{wasm_i8x16_min(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{wasm_i16x8_min(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{wasm_i32x4_min(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - alignas(16) int64_t min[4]; - min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), - wasm_i64x2_extract_lane(b.raw, 0)); - min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), - wasm_i64x2_extract_lane(b.raw, 1)); - return Vec128{wasm_v128_load(min)}; -} - -// Float -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - // Equivalent to a < b ? a : b (taking into account our swapped arg order, - // so that Min(NaN, x) is x to match x86). - return Vec128{wasm_f32x4_pmin(b.raw, a.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - // Equivalent to a < b ? a : b (taking into account our swapped arg order, - // so that Min(NaN, x) is x to match x86). - return Vec128{wasm_f64x2_pmin(b.raw, a.raw)}; -} - -// ------------------------------ Maximum - -// Unsigned -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{wasm_u8x16_max(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{wasm_u16x8_max(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{wasm_u32x4_max(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. 
- const uint64_t a0 = static_cast(wasm_i64x2_extract_lane(a.raw, 0)); - const uint64_t b0 = static_cast(wasm_i64x2_extract_lane(b.raw, 0)); - const uint64_t a1 = static_cast(wasm_i64x2_extract_lane(a.raw, 1)); - const uint64_t b1 = static_cast(wasm_i64x2_extract_lane(b.raw, 1)); - alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)}; - return Vec128{wasm_v128_load(max)}; -} - -// Signed -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{wasm_i8x16_max(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{wasm_i16x8_max(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{wasm_i32x4_max(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - alignas(16) int64_t max[2]; - max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), - wasm_i64x2_extract_lane(b.raw, 0)); - max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), - wasm_i64x2_extract_lane(b.raw, 1)); - return Vec128{wasm_v128_load(max)}; -} - -// Float -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - // Equivalent to b < a ? a : b (taking into account our swapped arg order, - // so that Max(NaN, x) is x to match x86). - return Vec128{wasm_f32x4_pmax(b.raw, a.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - // Equivalent to b < a ? a : b (taking into account our swapped arg order, - // so that Max(NaN, x) is x to match x86). - return Vec128{wasm_f64x2_pmax(b.raw, a.raw)}; -} - -// ------------------------------ Integer multiplication - -// Unsigned -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; -} - -// Returns the upper 16 bits of a * b in each lane. -template -HWY_API Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw); - const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw); - // TODO(eustas): shift-right + narrow? - return Vec128{ - wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; -} -template -HWY_API Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw); - const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw); - // TODO(eustas): shift-right + narrow? - return Vec128{ - wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; -} - -template -HWY_API Vec128 MulFixedPoint15(Vec128 a, - Vec128 b) { - return Vec128{wasm_i16x8_q15mulr_sat(a.raw, b.raw)}; -} - -// Multiplies even lanes (0, 2 ..) and returns the double-width result. 
-template -HWY_API Vec128, (N + 1) / 2> MulEven(const Vec128 a, - const Vec128 b) { - const DFromV d; - const RepartitionToWide dw; - constexpr int kSrcBits = sizeof(T) * 8; - - const auto ae = - ShiftRight(ShiftLeft(ResizeBitCast(dw, a))); - const auto be = - ShiftRight(ShiftLeft(ResizeBitCast(dw, b))); - return ae * be; -} -template -HWY_API Vec128, (N + 1) / 2> MulEven(const Vec128 a, - const Vec128 b) { - const DFromV d; - const RepartitionToWide dw; - const auto kEvenMask = Set(dw, LimitsMax()); - - const auto ae = And(ResizeBitCast(dw, a), kEvenMask); - const auto be = And(ResizeBitCast(dw, b), kEvenMask); - return ae * be; -} -template -HWY_API Vec128 MulEven(const Vec128 a, - const Vec128 b) { - const DFromV d; - const RepartitionToWide dw; - const auto ae = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, a))).raw; - const auto be = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, b))).raw; - return Vec128{wasm_i64x2_mul(ae, be)}; -} -template -HWY_API Vec128 MulEven(const Vec128 a, - const Vec128 b) { - const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); - const auto ae = wasm_v128_and(a.raw, kEvenMask); - const auto be = wasm_v128_and(b.raw, kEvenMask); - return Vec128{wasm_i64x2_mul(ae, be)}; -} - -// Multiplies odd lanes (1, 3 ..) and returns the double-width result. -template -HWY_API Vec128, (N + 1) / 2> MulOdd(const Vec128 a, - const Vec128 b) { - const DFromV d; - const RepartitionToWide dw; - constexpr int kSrcBits = sizeof(T) * 8; - - const auto ao = ShiftRight(BitCast(dw, a)); - const auto bo = ShiftRight(BitCast(dw, b)); - return ao * bo; -} -template -HWY_API Vec128, (N + 1) / 2> MulOdd(const Vec128 a, - const Vec128 b) { - const DFromV d; - const RepartitionToWide dw; - - const auto ao = ShiftRight<32>(BitCast(dw, a)); - const auto bo = ShiftRight<32>(BitCast(dw, b)); - return Vec128, (N + 1) / 2>{wasm_i64x2_mul(ao.raw, bo.raw)}; -} - -// ------------------------------ Negate - -template -HWY_API Vec128 Neg(const Vec128 v) { - return Xor(v, SignBit(DFromV())); -} - -template -HWY_API Vec128 Neg(const Vec128 v) { - return Vec128{wasm_i8x16_neg(v.raw)}; -} -template -HWY_API Vec128 Neg(const Vec128 v) { - return Vec128{wasm_i16x8_neg(v.raw)}; -} -template -HWY_API Vec128 Neg(const Vec128 v) { - return Vec128{wasm_i32x4_neg(v.raw)}; -} -template -HWY_API Vec128 Neg(const Vec128 v) { - return Vec128{wasm_i64x2_neg(v.raw)}; -} - -// ------------------------------ Floating-point mul / div - -template -HWY_API Vec128 operator*(Vec128 a, Vec128 b) { - return Vec128{wasm_f32x4_mul(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(Vec128 a, Vec128 b) { - return Vec128{wasm_f64x2_mul(a.raw, b.raw)}; -} - -template -HWY_API Vec128 operator/(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_f32x4_div(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator/(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_f64x2_div(a.raw, b.raw)}; -} - -template -HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { - return Set(DFromV(), T{1.0}) / v; -} - -// Integer overload defined in generic_ops-inl.h. 
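Aside: the MulEven/MulOdd overloads above rely on each even (or odd) narrow lane occupying one half of a wide lane. Isolating that half (via the even-lane mask, or the shift pairs that zero- or sign-extend it) lets an ordinary full-width multiply produce the exact narrow-times-narrow, double-width product. A scalar model of what MulEven returns for u32 inputs (hypothetical names, illustration only):

#include <cstdint>
#include <cstdio>

// out[i] = uint64_t(a[2 * i]) * b[2 * i]; odd input lanes are ignored.
static void MulEvenScalar(const uint32_t a[4], const uint32_t b[4],
                          uint64_t out[2]) {
  for (int i = 0; i < 2; ++i) {
    out[i] = static_cast<uint64_t>(a[2 * i]) * b[2 * i];
  }
}

int main() {
  const uint32_t a[4] = {0xFFFFFFFFu, 1, 3, 2};
  const uint32_t b[4] = {2, 5, 7, 9};
  uint64_t out[2];
  MulEvenScalar(a, b, out);
  // 0xFFFFFFFF * 2 = 0x1FFFFFFFE (needs 33 bits), 3 * 7 = 21
  std::printf("%llx %llu\n", static_cast<unsigned long long>(out[0]),
              static_cast<unsigned long long>(out[1]));
  return 0;
}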
-template -HWY_API Vec128 AbsDiff(const Vec128 a, const Vec128 b) { - return Abs(a - b); -} - -// ------------------------------ Floating-point multiply-add variants - -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return mul * x + add; -} - -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { - return add - mul * x; -} - -template -HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - return mul * x - sub; -} - -template -HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, - Vec128 sub) { - return Neg(mul) * x - sub; -} - -// ------------------------------ Floating-point square root - -// Full precision square root -template -HWY_API Vec128 Sqrt(const Vec128 v) { - return Vec128{wasm_f32x4_sqrt(v.raw)}; -} -template -HWY_API Vec128 Sqrt(const Vec128 v) { - return Vec128{wasm_f64x2_sqrt(v.raw)}; -} - -// Approximate reciprocal square root -template -HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { - // TODO(eustas): find cheaper a way to calculate this. - return Set(DFromV(), T{1.0}) / Sqrt(v); -} - -// ------------------------------ Floating-point rounding - -// Toward nearest integer, ties to even -template -HWY_API Vec128 Round(const Vec128 v) { - return Vec128{wasm_f32x4_nearest(v.raw)}; -} -template -HWY_API Vec128 Round(const Vec128 v) { - return Vec128{wasm_f64x2_nearest(v.raw)}; -} - -// Toward zero, aka truncate -template -HWY_API Vec128 Trunc(const Vec128 v) { - return Vec128{wasm_f32x4_trunc(v.raw)}; -} -template -HWY_API Vec128 Trunc(const Vec128 v) { - return Vec128{wasm_f64x2_trunc(v.raw)}; -} - -// Toward +infinity, aka ceiling -template -HWY_API Vec128 Ceil(const Vec128 v) { - return Vec128{wasm_f32x4_ceil(v.raw)}; -} -template -HWY_API Vec128 Ceil(const Vec128 v) { - return Vec128{wasm_f64x2_ceil(v.raw)}; -} - -// Toward -infinity, aka floor -template -HWY_API Vec128 Floor(const Vec128 v) { - return Vec128{wasm_f32x4_floor(v.raw)}; -} -template -HWY_API Vec128 Floor(const Vec128 v) { - return Vec128{wasm_f64x2_floor(v.raw)}; -} - -// ------------------------------ Floating-point classification -template -HWY_API Mask128 IsNaN(const Vec128 v) { - return v != v; -} - -template -HWY_API Mask128 IsInf(const Vec128 v) { - const DFromV d; - const RebindToSigned di; - const VFromD vi = BitCast(di, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); -} - -// Returns whether normal/subnormal/zero. -template -HWY_API Mask128 IsFinite(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; // cheaper than unsigned comparison - const VFromD vu = BitCast(du, v); - // 'Shift left' to clear the sign bit, then right so we can compare with the - // max exponent (cannot compare with MaxExponentTimes2 directly because it is - // negative and non-negative floats would be greater). - const VFromD exp = - BitCast(di, ShiftRight() + 1>(Add(vu, vu))); - return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); -} - -// ================================================== COMPARE - -// Comparisons fill a lane with 1-bits if the condition is true, else 0. - -// Mask and Vec are the same (true = FF..FF). 
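Aside: the IsInf/IsFinite comments above describe a pure bit trick: adding the integer representation to itself discards the sign bit and leaves the exponent at the top, so infinity becomes "all-ones exponent, zero mantissa" and finiteness is just "exponent field not all-ones". A scalar sketch for binary32, using the usual constants rather than the library's MaxExponentTimes2/MaxExponentField helpers, and unsigned arithmetic so the wraparound is well defined:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

static bool IsInfScalar(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  // bits + bits drops the sign; +inf and -inf both become 0xFF000000.
  return (bits + bits) == 0xFF000000u;
}

static bool IsFiniteScalar(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  // Drop the sign, then keep only the exponent; finite <=> exponent != 255.
  return ((bits + bits) >> 24) != 0xFFu;
}

int main() {
  const float inf = std::numeric_limits<float>::infinity();
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::printf("%d %d %d %d\n", IsInfScalar(inf), IsInfScalar(3.5f),
              IsFiniteScalar(nan), IsFiniteScalar(3.5f));  // 1 0 0 1
  return 0;
}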
-template -HWY_API Mask128 MaskFromVec(const Vec128 v) { - return Mask128{v.raw}; -} - -template -using MFromD = decltype(MaskFromVec(VFromD())); - -template -HWY_API MFromD RebindMask(DTo /* tag */, Mask128 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - return MFromD{m.raw}; -} - -template -HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return (v & bit) == bit; -} - -// ------------------------------ Equality - -// Unsigned -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; -} - -// Float -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f32x4_eq(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f64x2_eq(a.raw, b.raw)}; -} - -// ------------------------------ Inequality - -// Unsigned -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; -} - -// Float -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f32x4_ne(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f64x2_ne(a.raw, b.raw)}; -} - -// ------------------------------ Strict inequality - -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i8x16_gt(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i16x8_gt(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i32x4_gt(a.raw, b.raw)}; -} -template 
-HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i64x2_gt(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_u8x16_gt(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_u16x8_gt(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_u32x4_gt(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - const DFromV d; - const Repartition d32; - const auto a32 = BitCast(d32, a); - const auto b32 = BitCast(d32, b); - // If the upper halves are not equal, this is the answer. - const auto m_gt = a32 > b32; - - // Otherwise, the lower half decides. - const auto m_eq = a32 == b32; - const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2); - const auto lo_gt = And(m_eq, MaskFromVec(VFromD{lo_in_hi})); - - const auto gt = Or(lo_gt, m_gt); - // Copy result in upper 32 bits to lower 32 bits. - return Mask128{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)}; -} - -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f32x4_gt(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f64x2_gt(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { - return operator>(b, a); -} - -// ------------------------------ Weak inequality - -// Float >= -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_f64x2_ge(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i8x16_ge(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i16x8_ge(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i32x4_ge(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_i64x2_ge(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_u8x16_ge(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_u16x8_ge(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Mask128{wasm_u32x4_ge(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(const Vec128 a, - const Vec128 b) { - return Not(b > a); -} - -template -HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { - return operator>=(b, a); -} - -// ------------------------------ FirstN (Iota, Lt) - -template -HWY_API MFromD FirstN(D d, size_t num) { - const RebindToSigned di; // Signed comparisons may be cheaper. - using TI = TFromD; - return RebindMask(d, Iota(di, 0) < Set(di, static_cast(num))); -} - -// ================================================== LOGICAL - -// ------------------------------ Not - -template -HWY_API Vec128 Not(Vec128 v) { - return Vec128{wasm_v128_not(v.raw)}; -} - -// ------------------------------ And - -template -HWY_API Vec128 And(Vec128 a, Vec128 b) { - return Vec128{wasm_v128_and(a.raw, b.raw)}; -} - -// ------------------------------ AndNot - -// Returns ~not_mask & mask. 
-template -HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { - return Vec128{wasm_v128_andnot(mask.raw, not_mask.raw)}; -} - -// ------------------------------ Or - -template -HWY_API Vec128 Or(Vec128 a, Vec128 b) { - return Vec128{wasm_v128_or(a.raw, b.raw)}; -} - -// ------------------------------ Xor - -template -HWY_API Vec128 Xor(Vec128 a, Vec128 b) { - return Vec128{wasm_v128_xor(a.raw, b.raw)}; -} - -// ------------------------------ Xor3 - -template -HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { - return Xor(x1, Xor(x2, x3)); -} - -// ------------------------------ Or3 - -template -HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { - return Or(o1, Or(o2, o3)); -} - -// ------------------------------ OrAnd - -template -HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { - return Or(o, And(a1, a2)); -} - -// ------------------------------ IfVecThenElse - -template -HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, - Vec128 no) { - return IfThenElse(MaskFromVec(mask), yes, no); -} - -// ------------------------------ Operator overloads (internal-only if float) - -template -HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { - return And(a, b); -} - -template -HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { - return Or(a, b); -} - -template -HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { - return Xor(a, b); -} - -// ------------------------------ CopySign -template -HWY_API Vec128 CopySign(const Vec128 magn, - const Vec128 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const DFromV d; - return BitwiseIfThenElse(SignBit(d), sign, magn); -} - -// ------------------------------ CopySignToAbs -template -HWY_API Vec128 CopySignToAbs(const Vec128 abs, - const Vec128 sign) { - static_assert(IsFloat(), "Only makes sense for floating-point"); - const DFromV d; - return OrAnd(abs, SignBit(d), sign); -} - -// ------------------------------ BroadcastSignBit (compare) - -template -HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - return ShiftRight(v); -} -template -HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - const DFromV d; - return VecFromMask(d, v < Zero(d)); -} - -// ------------------------------ Mask - -template -HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { - return VFromD{v.raw}; -} - -// mask ? yes : no -template -HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, - Vec128 no) { - return Vec128{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; -} - -// mask ? yes : 0 -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { - return yes & VecFromMask(DFromV(), mask); -} - -// mask ? 
0 : no -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { - return AndNot(VecFromMask(DFromV(), mask), no); -} - -template -HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, - Vec128 no) { - static_assert(IsSigned(), "Only works for signed/float"); - const DFromV d; - const RebindToSigned di; - - v = BitCast(d, BroadcastSignBit(BitCast(di, v))); - return IfThenElse(MaskFromVec(v), yes, no); -} - -template -HWY_API Vec128 ZeroIfNegative(Vec128 v) { - const DFromV d; - const auto zero = Zero(d); - return IfThenElse(Mask128{(v > zero).raw}, v, zero); -} - -// ------------------------------ Mask logical - -template -HWY_API Mask128 Not(const Mask128 m) { - const DFromM d; - return MaskFromVec(Not(VecFromMask(d, m))); -} - -template -HWY_API Mask128 And(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Or(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { - const DFromM d; - return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); -} - -// ------------------------------ Shl (BroadcastSignBit, IfThenElse) - -// The x86 multiply-by-Pow2() trick will not work because WASM saturates -// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a -// scalar count operand, per-lane shift instructions would require extract_lane -// for each lane, and hoping that shuffle is correctly mapped to a native -// instruction. Using non-vector shifts would incur a store-load forwarding -// stall when loading the result vector. We instead test bits of the shift -// count to "predicate" a shift of the entire vector by a constant. - -template -HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { - const DFromV d; - Mask128 mask; - // Need a signed type for BroadcastSignBit. - auto test = BitCast(RebindToSigned(), bits); - // Move the highest valid bit of the shift count into the sign bit. - test = ShiftLeft<5>(test); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<4>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<2>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - return IfThenElse(mask, ShiftLeft<1>(v), v); -} - -template -HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { - const DFromV d; - Mask128 mask; - // Need a signed type for BroadcastSignBit. - auto test = BitCast(RebindToSigned(), bits); - // Move the highest valid bit of the shift count into the sign bit. 
- test = ShiftLeft<12>(test); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<8>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<4>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<2>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - return IfThenElse(mask, ShiftLeft<1>(v), v); -} - -template -HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { - const DFromV d; - Mask128 mask; - // Need a signed type for BroadcastSignBit. - auto test = BitCast(RebindToSigned(), bits); - // Move the highest valid bit of the shift count into the sign bit. - test = ShiftLeft<27>(test); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<16>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<8>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<4>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftLeft<2>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - return IfThenElse(mask, ShiftLeft<1>(v), v); -} - -template -HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { - const DFromV d; - const RebindToUnsigned du; - using TU = MakeUnsigned; - alignas(16) TU lanes[2] = {}; - alignas(16) TU bits_lanes[2] = {}; - Store(BitCast(du, v), du, lanes); - Store(BitCast(du, bits), du, bits_lanes); - lanes[0] <<= (bits_lanes[0] & 63); - lanes[1] <<= (bits_lanes[1] & 63); - return BitCast(d, Load(du, lanes)); -} - -// ------------------------------ Shr (BroadcastSignBit, IfThenElse) - -template -HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { - const DFromV d; - Mask128 mask; - // Need a signed type for BroadcastSignBit. - auto test = BitCast(RebindToSigned(), bits); - // Move the highest valid bit of the shift count into the sign bit. - test = ShiftLeft<5>(test); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<4>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<2>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - return IfThenElse(mask, ShiftRight<1>(v), v); -} - -template -HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { - const DFromV d; - Mask128 mask; - // Need a signed type for BroadcastSignBit. - auto test = BitCast(RebindToSigned(), bits); - // Move the highest valid bit of the shift count into the sign bit. 
- test = ShiftLeft<12>(test); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<8>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<4>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<2>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - return IfThenElse(mask, ShiftRight<1>(v), v); -} - -template -HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { - const DFromV d; - Mask128 mask; - // Need a signed type for BroadcastSignBit. - auto test = BitCast(RebindToSigned(), bits); - // Move the highest valid bit of the shift count into the sign bit. - test = ShiftLeft<27>(test); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<16>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<8>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<4>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - test = ShiftLeft<1>(test); // next bit (descending order) - v = IfThenElse(mask, ShiftRight<2>(v), v); - - mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); - return IfThenElse(mask, ShiftRight<1>(v), v); -} - -template -HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { - const DFromV d; - alignas(16) T lanes[2] = {}; - alignas(16) T bits_lanes[2] = {}; - Store(v, d, lanes); - Store(bits, d, bits_lanes); - lanes[0] >>= (bits_lanes[0] & 63); - lanes[1] >>= (bits_lanes[1] & 63); - return Load(d, lanes); -} - -// ================================================== MEMORY - -// ------------------------------ Load - -template > -HWY_API Vec128 Load(D /* tag */, const T* HWY_RESTRICT aligned) { - return Vec128{wasm_v128_load(aligned)}; -} - -// Partial -template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { - VFromD v; - CopyBytes(p, &v); - return v; -} - -// LoadU == Load. -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); -} - -// 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
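Aside, looking back at the variable-count Shl/Shr overloads above: since WASM shifts take a single scalar count, the deleted code decomposes each lane's shift amount bit by bit and conditionally applies constant shifts (the narrowest overload applies 4, 2, 1; wider lanes add 8 and 16), selecting with a per-lane mask built from the broadcast sign bit rather than branching. A scalar sketch of the same decomposition for one 8-bit lane (illustration only):

#include <cstdint>
#include <cstdio>

static uint8_t ShlByBits(uint8_t v, uint8_t count) {
  // Test the bits of `count` from high to low; each set bit "predicates" a
  // constant shift. 4 + 2 + 1 covers every count in 0..7.
  if (count & 4) v = static_cast<uint8_t>(v << 4);
  if (count & 2) v = static_cast<uint8_t>(v << 2);
  if (count & 1) v = static_cast<uint8_t>(v << 1);
  return v;
}

int main() {
  // 0x01 << 6 = 0x40, 0x03 << 3 = 0x18
  std::printf("%02x %02x\n", ShlByBits(0x01, 6), ShlByBits(0x03, 3));
  return 0;
}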
-template -HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); -} - -template > -HWY_API VFromD MaskedLoad(MFromD m, D d, const T* HWY_RESTRICT aligned) { - return IfThenElseZero(m, Load(d, aligned)); -} - -template > -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const T* HWY_RESTRICT aligned) { - return IfThenElse(m, Load(d, aligned), v); -} - -// ------------------------------ Store - -namespace detail { - -template -HWY_INLINE T ExtractLane(const Vec128 v) { - return static_cast(wasm_i8x16_extract_lane(v.raw, kLane)); -} -template -HWY_INLINE T ExtractLane(const Vec128 v) { - return static_cast(wasm_i16x8_extract_lane(v.raw, kLane)); -} -template -HWY_INLINE T ExtractLane(const Vec128 v) { - return static_cast(wasm_i32x4_extract_lane(v.raw, kLane)); -} -template -HWY_INLINE T ExtractLane(const Vec128 v) { - return static_cast(wasm_i64x2_extract_lane(v.raw, kLane)); -} - -template -HWY_INLINE float ExtractLane(const Vec128 v) { - return wasm_f32x4_extract_lane(v.raw, kLane); -} -template -HWY_INLINE double ExtractLane(const Vec128 v) { - return wasm_f64x2_extract_lane(v.raw, kLane); -} - -} // namespace detail - -template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { - wasm_v128_store(aligned, v.raw); -} - -// Partial -template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { - CopyBytes(&v, p); -} - -template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { - *p = detail::ExtractLane<0>(v); -} - -// StoreU == Store. -template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { - Store(v, d, p); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); -} - -// ------------------------------ Non-temporal stores - -// Same as aligned stores on non-x86. - -template -HWY_API void Stream(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { - wasm_v128_store(aligned, v.raw); -} - -// ------------------------------ Scatter in generic_ops-inl.h -// ------------------------------ Gather in generic_ops-inl.h - -// ================================================== SWIZZLE - -// ------------------------------ ExtractLane - -// One overload per vector length just in case *_extract_lane raise compile -// errors if their argument is out of bounds (even if that would never be -// reached at runtime). 
-template -HWY_API T ExtractLane(const Vec128 v, size_t i) { - HWY_DASSERT(i == 0); - (void)i; - return detail::ExtractLane<0>(v); -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - } - } -#endif - alignas(16) T lanes[2]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - case 2: - return detail::ExtractLane<2>(v); - case 3: - return detail::ExtractLane<3>(v); - } - } -#endif - alignas(16) T lanes[4]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - case 2: - return detail::ExtractLane<2>(v); - case 3: - return detail::ExtractLane<3>(v); - case 4: - return detail::ExtractLane<4>(v); - case 5: - return detail::ExtractLane<5>(v); - case 6: - return detail::ExtractLane<6>(v); - case 7: - return detail::ExtractLane<7>(v); - } - } -#endif - alignas(16) T lanes[8]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - case 2: - return detail::ExtractLane<2>(v); - case 3: - return detail::ExtractLane<3>(v); - case 4: - return detail::ExtractLane<4>(v); - case 5: - return detail::ExtractLane<5>(v); - case 6: - return detail::ExtractLane<6>(v); - case 7: - return detail::ExtractLane<7>(v); - case 8: - return detail::ExtractLane<8>(v); - case 9: - return detail::ExtractLane<9>(v); - case 10: - return detail::ExtractLane<10>(v); - case 11: - return detail::ExtractLane<11>(v); - case 12: - return detail::ExtractLane<12>(v); - case 13: - return detail::ExtractLane<13>(v); - case 14: - return detail::ExtractLane<14>(v); - case 15: - return detail::ExtractLane<15>(v); - } - } -#endif - alignas(16) T lanes[16]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -// ------------------------------ GetLane -template -HWY_API T GetLane(const Vec128 v) { - return detail::ExtractLane<0>(v); -} - -// ------------------------------ InsertLane - -namespace detail { - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); - return Vec128{ - wasm_i8x16_replace_lane(v.raw, kLane, static_cast(t))}; -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); - return Vec128{ - wasm_i16x8_replace_lane(v.raw, kLane, static_cast(t))}; -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); - return Vec128{ - wasm_i32x4_replace_lane(v.raw, kLane, static_cast(t))}; -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); - return Vec128{ - 
wasm_i64x2_replace_lane(v.raw, kLane, static_cast(t))}; -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { - static_assert(kLane < N, "Lane index out of bounds"); - return Vec128{wasm_f32x4_replace_lane(v.raw, kLane, t)}; -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { - static_assert(kLane < 2, "Lane index out of bounds"); - return Vec128{wasm_f64x2_replace_lane(v.raw, kLane, t)}; -} - -} // namespace detail - -// Requires one overload per vector length because InsertLane<3> may be a -// compile error if it calls wasm_f64x2_replace_lane. - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { - HWY_DASSERT(i == 0); - (void)i; - return Set(DFromV(), t); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[2]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[4]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - case 4: - return detail::InsertLane<4>(v, t); - case 5: - return detail::InsertLane<5>(v, t); - case 6: - return detail::InsertLane<6>(v, t); - case 7: - return detail::InsertLane<7>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[8]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - case 4: - return detail::InsertLane<4>(v, t); - case 5: - return detail::InsertLane<5>(v, t); - case 6: - return detail::InsertLane<6>(v, t); - case 7: - return detail::InsertLane<7>(v, t); - case 8: - return detail::InsertLane<8>(v, t); - case 9: - return detail::InsertLane<9>(v, t); - case 10: - return detail::InsertLane<10>(v, t); - case 11: - return detail::InsertLane<11>(v, t); - case 12: - return detail::InsertLane<12>(v, t); - case 13: - return detail::InsertLane<13>(v, t); - case 14: - return detail::InsertLane<14>(v, t); - case 15: - return detail::InsertLane<15>(v, t); - } - } -#endif - const DFromV d; - alignas(16) T lanes[16]; - Store(v, d, lanes); - lanes[i] = t; - return Load(d, lanes); -} - -// ------------------------------ LowerHalf - -template -HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { - return VFromD{v.raw}; -} 
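Aside on the ExtractLane/InsertLane overloads above (one more LowerHalf overload follows below): the lane-access intrinsics need a compile-time lane index, so the deleted code switches on the index when __builtin_constant_p proves it is a constant, and otherwise spills the vector to an aligned array and indexes that. A scalar sketch of the same dispatch, with a plain 4-lane struct standing in for the vector (names are illustrative):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

struct FakeVec4 {  // stand-in for a 4-lane i32 vector
  int32_t lanes[4];
};

template <size_t kLane>  // stand-in for the constant-index extract wrapper
int32_t GetConst(FakeVec4 v) {
  static_assert(kLane < 4, "Lane index out of bounds");
  return v.lanes[kLane];
}

int32_t ExtractLaneScalar(FakeVec4 v, size_t i) {
#if defined(__GNUC__) || defined(__clang__)
  if (__builtin_constant_p(i)) {  // index known at compile time:
    switch (i) {                  // dispatch to a literal-index accessor
      case 0: return GetConst<0>(v);
      case 1: return GetConst<1>(v);
      case 2: return GetConst<2>(v);
      case 3: return GetConst<3>(v);
    }
  }
#endif
  // General case: spill to aligned memory and index at runtime. InsertLane
  // does the same spill, then writes lanes[i] and reloads.
  alignas(16) int32_t lanes[4];
  std::memcpy(lanes, v.lanes, sizeof(lanes));
  return lanes[i];
}

int main() {
  const FakeVec4 v{{10, 20, 30, 40}};
  std::printf("%d %d\n", ExtractLaneScalar(v, 2), ExtractLaneScalar(v, 0));  // 30 10
  return 0;
}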
-template -HWY_API Vec128 LowerHalf(Vec128 v) { - return Vec128{v.raw}; -} - -// ------------------------------ ShiftLeftBytes - -// 0x01..0F, kBytes = 1 => 0x02..0F00 -template -HWY_API VFromD ShiftLeftBytes(D /* tag */, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - const __i8x16 zero = wasm_i8x16_splat(0); - switch (kBytes) { - case 0: - return v; - - case 1: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12, 13, 14)}; - - case 2: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13)}; - - case 3: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, - 4, 5, 6, 7, 8, 9, 10, 11, 12)}; - - case 4: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, - 3, 4, 5, 6, 7, 8, 9, 10, 11)}; - - case 5: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, - 2, 3, 4, 5, 6, 7, 8, 9, 10)}; - - case 6: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; - - case 7: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; - - case 8: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; - - case 9: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; - - case 10: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; - - case 11: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; - - case 12: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; - - case 13: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; - - case 14: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 0, - 1)}; - - case 15: - return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, 16, - 0)}; - } - return VFromD{zero}; -} - -template -HWY_API Vec128 ShiftLeftBytes(Vec128 v) { - return ShiftLeftBytes(DFromV(), v); -} - -// ------------------------------ ShiftLeftLanes - -template -HWY_API VFromD ShiftLeftLanes(D d, const VFromD v) { - const Repartition d8; - constexpr size_t kBytes = kLanes * sizeof(TFromD); - return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); -} - -template -HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { - return ShiftLeftLanes(DFromV(), v); -} - -// ------------------------------ ShiftRightBytes -namespace detail { - -// Helper function allows zeroing invalid lanes in caller. 
-template -HWY_API __i8x16 ShrBytes(const Vec128 v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - const __i8x16 zero = wasm_i8x16_splat(0); - - switch (kBytes) { - case 0: - return v.raw; - - case 1: - return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16); - - case 2: - return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 16, 16); - - case 3: - return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 16, 16, 16); - - case 4: - return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 16, 16, 16); - - case 5: - return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 16, 16, 16, 16); - - case 6: - return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 16, 16, 16, 16, 16); - - case 7: - return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 16, 16, 16, 16, 16, 16); - - case 8: - return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 16, 16, 16, 16, 16, 16, 16); - - case 9: - return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, - 16, 16, 16, 16, 16, 16, 16); - - case 10: - return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16); - - case 11: - return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16); - - case 12: - return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16); - - case 13: - return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16); - - case 14: - return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16); - - case 15: - return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16); - case 16: - return zero; - } -} - -} // namespace detail - -// 0x01..0F, kBytes = 1 => 0x0001..0E -template -HWY_API VFromD ShiftRightBytes(D d, VFromD v) { - // For partial vectors, clear upper lanes so we shift in zeros. 
- if (d.MaxBytes() != 16) { - const Full128> dfull; - const VFromD vfull{v.raw}; - v = VFromD{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; - } - return VFromD{detail::ShrBytes(v)}; -} - -// ------------------------------ ShiftRightLanes -template -HWY_API VFromD ShiftRightLanes(D d, const VFromD v) { - const Repartition d8; - constexpr size_t kBytes = kLanes * sizeof(TFromD); - return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); -} - -// ------------------------------ UpperHalf (ShiftRightBytes) - -template > -HWY_API Vec64 UpperHalf(D /* tag */, const Vec128 v) { - return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; -} - -// Partial -template -HWY_API VFromD UpperHalf(D d, VFromD> v) { - return LowerHalf(d, ShiftRightBytes(Twice(), v)); -} - -// ------------------------------ CombineShiftRightBytes - -template > -HWY_API Vec128 CombineShiftRightBytes(D /* tag */, Vec128 hi, - Vec128 lo) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - switch (kBytes) { - case 0: - return lo; - - case 1: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, 16)}; - - case 2: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, - 9, 10, 11, 12, 13, 14, 15, 16, 17)}; - - case 3: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18)}; - - case 4: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19)}; - - case 5: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20)}; - - case 6: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, 16, 17, 18, 19, 20, - 21)}; - - case 7: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22)}; - - case 8: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, - 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23)}; - - case 9: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24)}; - - case 10: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25)}; - - case 11: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 26)}; - - case 12: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, - 26, 27)}; - - case 13: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, - 18, 19, 20, 21, 22, 23, 24, 25, 26, - 27, 28)}; - - case 14: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, - 28, 29)}; - - case 15: - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30)}; - } - return hi; -} - -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - constexpr size_t kSize = d.MaxBytes(); - static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); - const Repartition d8; - using V8 = Vec128; - const DFromV dfull8; - const Repartition, decltype(dfull8)> dfull; - const V8 hi8{BitCast(d8, hi).raw}; - // Move into most-significant bytes - const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); - const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); - return VFromD{BitCast(dfull, 
r).raw}; -} - -// ------------------------------ Broadcast/splat any lane - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{wasm_i8x16_shuffle( - v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, - kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; -} - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, - kLane, kLane, kLane, kLane, kLane)}; -} - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{ - wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; -} - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)}; -} - -// ------------------------------ TableLookupBytes - -// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. -// lane indices in [0, 16). -template -HWY_API Vec128 TableLookupBytes(const Vec128 bytes, - const Vec128 from) { - return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; -} - -template -HWY_API Vec128 TableLookupBytesOr0(const Vec128 bytes, - const Vec128 from) { - const DFromV d; - // Mask size must match vector type, so cast everything to this type. - Repartition di8; - Repartition> d_bytes8; - const auto msb = BitCast(di8, from) < Zero(di8); - const auto lookup = - TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); - return BitCast(d, IfThenZeroElse(msb, lookup)); -} - -// ------------------------------ Hard-coded shuffles - -// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). -// Shuffle0321 rotates one lane to the right (the previous least-significant -// lane is now most-significant). These could also be implemented via -// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. - -// Swap 32-bit halves in 64-bit halves. -template -HWY_API Vec128 Shuffle2301(const Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; -} - -// These are used by generic_ops-inl to implement LoadInterleaved3. 
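Editorial aside, a hedged sketch that is not from the original file: the comment above notes that the detail::ShuffleTwo helpers back generic_ops' LoadInterleaved3, whose typical use is de-interleaving packed RGB bytes into three planar vectors. Names and the loop structure are illustrative; the pixel count is assumed to be a multiple of the vector length.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void SplitRGB(const uint8_t* HWY_RESTRICT rgb, size_t num_pixels,
                       uint8_t* HWY_RESTRICT r, uint8_t* HWY_RESTRICT g,
                       uint8_t* HWY_RESTRICT b) {
  const hn::ScalableTag<uint8_t> d;  // 16 bytes per vector on wasm_128
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i < num_pixels; i += N) {
    hn::VFromD<decltype(d)> vr, vg, vb;
    hn::LoadInterleaved3(d, rgb + 3 * i, vr, vg, vb);
    hn::StoreU(vr, d, r + i);
    hn::StoreU(vg, d, g + i);
    hn::StoreU(vb, d, b + i);
  }
}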
-namespace detail { - -template -HWY_API Vec128 ShuffleTwo2301(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16, - 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, - 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; -} -template -HWY_API Vec128 ShuffleTwo2301(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8, - 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; -} -template -HWY_API Vec128 ShuffleTwo2301(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)}; -} - -template -HWY_API Vec128 ShuffleTwo1230(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16, - 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, - 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; -} -template -HWY_API Vec128 ShuffleTwo1230(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8, - 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; -} -template -HWY_API Vec128 ShuffleTwo1230(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)}; -} - -template -HWY_API Vec128 ShuffleTwo3012(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16, - 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, - 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; -} -template -HWY_API Vec128 ShuffleTwo3012(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8, - 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; -} -template -HWY_API Vec128 ShuffleTwo3012(const Vec128 a, - const Vec128 b) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)}; -} - -} // namespace detail - -// Swap 64-bit halves -template -HWY_API Vec128 Shuffle01(const Vec128 v) { - static_assert(sizeof(T) == 8, "Only for 64-bit lanes"); - return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; -} -template -HWY_API Vec128 Shuffle1032(const Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); - return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; -} - -// Rotate right 32 bits -template -HWY_API Vec128 Shuffle0321(const Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); - return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; -} - -// Rotate left 32 bits -template -HWY_API Vec128 Shuffle2103(const Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); - return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; -} - -// Reverse -template -HWY_API Vec128 Shuffle0123(const Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); - return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; -} - -// ------------------------------ TableLookupLanes - -// Returned by SetTableIndices for use by TableLookupLanes. 
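Editorial aside, a hedged usage sketch that is not part of the original file, for the Indices128/SetTableIndices/TableLookupLanes machinery that follows: build an index vector once, then permute lanes with it. The lane-reversal pattern is illustrative only.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void ReverseLanes(const int32_t* HWY_RESTRICT in,
                           int32_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int32_t> d;  // 4 x i32 on wasm_128
  static constexpr int32_t kIdx[4] = {3, 2, 1, 0};
  // Indices are range-checked in debug builds by IndicesFromVec below.
  const auto indices = hn::SetTableIndices(d, kIdx);
  hn::StoreU(hn::TableLookupLanes(hn::LoadU(d, in), indices), d, out);
}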
-template -struct Indices128 { - __v128_u raw; -}; - -namespace detail { - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - return Iota(d8, 0); -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - return Load(d8, kBroadcastLaneBytes); -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - return Load(d8, kBroadcastLaneBytes); -} - -template -HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( - D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; - return Load(d8, kBroadcastLaneBytes); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - return Zero(d8); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - return Load(d8, kByteOffsets); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; - return Load(d8, kByteOffsets); -} - -template -HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { - const Repartition d8; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; - return Load(d8, kByteOffsets); -} - -} // namespace detail - -template -HWY_API Indices128, MaxLanes(D())> IndicesFromVec( - D d, Vec128 vec) { - using T = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const RebindToUnsigned du; - using TU = TFromD; - HWY_DASSERT(AllTrue( - du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); -#endif - - (void)d; - return Indices128, MaxLanes(D())>{vec.raw}; -} - -template -HWY_API Indices128, MaxLanes(D())> IndicesFromVec( - D d, Vec128 vec) { - using T = TFromD; - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const RebindToUnsigned du; - using TU = TFromD; - HWY_DASSERT(AllTrue( - du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); -#endif - - const Repartition d8; - using V8 = VFromD; - - // Broadcast each lane index to all bytes of T and shift to bytes - const V8 lane_indices = TableLookupBytes( - BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); - constexpr int kIndexShiftAmt = static_cast(FloorLog2(sizeof(T))); - const V8 byte_indices = ShiftLeft(lane_indices); - const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); - return Indices128, MaxLanes(D())>{BitCast(d, sum).raw}; -} - -template -HWY_API Indices128, HWY_MAX_LANES_D(D)> SetTableIndices( - D d, const TI* idx) { - const Rebind di; - return IndicesFromVec(d, LoadU(di, idx)); -} - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { - using TI = MakeSigned; - const DFromV d; - const Rebind di; - return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { - const DFromV d; - 
const Twice dt; -// TableLookupLanes currently requires table and index vectors to be the same -// size, though a half-length index vector would be sufficient here. -#if HWY_IS_MSAN - const Vec128 idx_vec{idx.raw}; - const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; -#else - // We only keep LowerHalf of the result, which is valid in idx. - const Indices128 idx2{idx.raw}; -#endif - return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { - const DFromV d; - const Repartition du8; - - const VFromD byte_idx{idx.raw}; - const auto byte_idx_mod = byte_idx & Set(du8, uint8_t{0x0F}); - // If ANDing did not change the index, it is for the lower half. - const auto is_lo = (byte_idx == byte_idx_mod); - - return BitCast(d, IfThenElse(is_lo, TableLookupBytes(a, byte_idx_mod), - TableLookupBytes(b, byte_idx_mod))); -} - -// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) - -// Single lane: no change -template , HWY_IF_LANES_D(D, 1)> -HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { - return v; -} - -// 32-bit x2: shuffle -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec64 Reverse(D /* tag */, const Vec64 v) { - return Vec64{Shuffle2301(Vec128{v.raw}).raw}; -} - -// 64-bit x2: shuffle -template , HWY_IF_T_SIZE(T, 8)> -HWY_API Vec128 Reverse(D /* tag */, const Vec128 v) { - return Shuffle01(v); -} - -// 32-bit x2: shuffle -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec128 Reverse(D /* tag */, const Vec128 v) { - return Shuffle0123(v); -} - -// 16-bit -template -HWY_API VFromD Reverse(D d, const VFromD v) { - const RepartitionToWide> du32; - return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { - static constexpr int kN = 16 + Lanes(d); - return VFromD{wasm_i8x16_shuffle( - v.raw, v.raw, - // kN is adjusted to ensure we have valid indices for all lengths. 
- kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9, - kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16)}; -} - -// ------------------------------ Reverse2 - -template -HWY_API VFromD Reverse2(D d, const VFromD v) { - const RepartitionToWide> dw; - return BitCast(d, RotateRight<16>(BitCast(dw, v))); -} - -template -HWY_API VFromD Reverse2(D /* tag */, const VFromD v) { - return Shuffle2301(v); -} - -template -HWY_API VFromD Reverse2(D /* tag */, const VFromD v) { - return Shuffle01(v); -} - -// ------------------------------ Reverse4 - -template -HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { - return VFromD{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)}; -} - -template -HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { - return Shuffle0123(v); -} - -template -HWY_API VFromD Reverse4(D /* tag */, const VFromD) { - HWY_ASSERT(0); // don't have 8 u64 lanes -} - -// ------------------------------ Reverse8 - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - return Reverse(d, v); -} - -template -HWY_API VFromD Reverse8(D /* tag */, const VFromD) { - HWY_ASSERT(0); // don't have 8 lanes for > 16-bit lanes -} - -// ------------------------------ InterleaveLower - -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i8x16_shuffle( - a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{ - wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; -} - -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i8x16_shuffle( - a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{ - wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; -} - -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; -} - -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; -} - -// Additional overload for the optional tag (all vector lengths). -template -HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { - return InterleaveLower(a, b); -} - -// ------------------------------ InterleaveUpper (UpperHalf) - -// All functions inside detail lack the required D parameter. 
-namespace detail { - -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, - 26, 11, 27, 12, 28, 13, 29, 14, - 30, 15, 31)}; -} -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{ - wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; -} -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; -} -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; -} - -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, - 26, 11, 27, 12, 28, 13, 29, 14, - 30, 15, 31)}; -} -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{ - wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; -} -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; -} -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; -} - -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; -} - -template -HWY_API Vec128 InterleaveUpper(Vec128 a, - Vec128 b) { - return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; -} - -} // namespace detail - -// Full -template > -HWY_API Vec128 InterleaveUpper(D /* tag */, Vec128 a, Vec128 b) { - return detail::InterleaveUpper(a, b); -} - -// Partial -template -HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { - const Half d2; - return InterleaveLower(d, VFromD{UpperHalf(d2, a).raw}, - VFromD{UpperHalf(d2, b).raw}); -} - -// ------------------------------ ZipLower/ZipUpper (InterleaveLower) - -// Same as Interleave*, except that the return lanes are double-width integers; -// this is necessary because the single-lane scalar cannot return two values. 
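Editorial aside, a hedged sketch that is not from the original file, for the ZipLower/ZipUpper defined below: interleaving two u8 vectors and reinterpreting each (low, high) byte pair as one u16 lane. Values and names are illustrative.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void ZipExample(const uint8_t* HWY_RESTRICT lo,
                         const uint8_t* HWY_RESTRICT hi,
                         uint16_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint8_t> d8;                 // 16 x u8
  const hn::RepartitionToWide<decltype(d8)> d16;     // 8 x u16
  const auto a = hn::LoadU(d8, lo);  // a[i] becomes the low byte
  const auto b = hn::LoadU(d8, hi);  // b[i] becomes the high byte
  // out[i] = (b[i] << 8) | a[i] for the lower halves of the inputs;
  // ZipUpper(d16, a, b) would produce the upper halves.
  hn::StoreU(hn::ZipLower(d16, a, b), d16, out);
}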
-template >> -HWY_API VFromD ZipLower(V a, V b) { - return BitCast(DW(), InterleaveLower(a, b)); -} -template , class DW = RepartitionToWide> -HWY_API VFromD ZipLower(DW dw, V a, V b) { - return BitCast(dw, InterleaveLower(D(), a, b)); -} - -template , class DW = RepartitionToWide> -HWY_API VFromD ZipUpper(DW dw, V a, V b) { - return BitCast(dw, InterleaveUpper(D(), a, b)); -} - -// ------------------------------ Per4LaneBlockShuffle -namespace detail { - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<1> /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - constexpr int kIdx3 = static_cast((kIdx3210 >> 6) & 3); - constexpr int kIdx2 = static_cast((kIdx3210 >> 4) & 3); - constexpr int kIdx1 = static_cast((kIdx3210 >> 2) & 3); - constexpr int kIdx0 = static_cast(kIdx3210 & 3); - return V{wasm_i8x16_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3, - kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4, - kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8, - kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)}; -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<2> /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - constexpr int kIdx3 = static_cast((kIdx3210 >> 6) & 3); - constexpr int kIdx2 = static_cast((kIdx3210 >> 4) & 3); - constexpr int kIdx1 = static_cast((kIdx3210 >> 2) & 3); - constexpr int kIdx0 = static_cast(kIdx3210 & 3); - return V{wasm_i16x8_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3, - kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)}; -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<4> /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - constexpr int kIdx3 = static_cast((kIdx3210 >> 6) & 3); - constexpr int kIdx2 = static_cast((kIdx3210 >> 4) & 3); - constexpr int kIdx1 = static_cast((kIdx3210 >> 2) & 3); - constexpr int kIdx0 = static_cast(kIdx3210 & 3); - return V{wasm_i32x4_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3)}; -} - -} // namespace detail - -// ------------------------------ SlideUpLanes - -namespace detail { - -template -HWY_INLINE V SlideUpLanes(V v, size_t amt) { - const DFromV d; - const Full64 du64; - const auto vu64 = ResizeBitCast(du64, v); - return ResizeBitCast( - d, ShiftLeftSame(vu64, static_cast(amt * sizeof(TFromV) * 8))); -} - -template -HWY_INLINE V SlideUpLanes(V v, size_t amt) { - const DFromV d; - const Repartition du8; - const auto idx = - Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromV))); - return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); -} - -} // namespace detail - -template -HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - 
return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - case 4: - return ShiftLeftLanes<4>(d, v); - case 5: - return ShiftLeftLanes<5>(d, v); - case 6: - return ShiftLeftLanes<6>(d, v); - case 7: - return ShiftLeftLanes<7>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - case 4: - return ShiftLeftLanes<4>(d, v); - case 5: - return ShiftLeftLanes<5>(d, v); - case 6: - return ShiftLeftLanes<6>(d, v); - case 7: - return ShiftLeftLanes<7>(d, v); - case 8: - return ShiftLeftLanes<8>(d, v); - case 9: - return ShiftLeftLanes<9>(d, v); - case 10: - return ShiftLeftLanes<10>(d, v); - case 11: - return ShiftLeftLanes<11>(d, v); - case 12: - return ShiftLeftLanes<12>(d, v); - case 13: - return ShiftLeftLanes<13>(d, v); - case 14: - return ShiftLeftLanes<14>(d, v); - case 15: - return ShiftLeftLanes<15>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -// ------------------------------ SlideDownLanes - -namespace detail { - -template -HWY_INLINE V SlideDownLanes(V v, size_t amt) { - const DFromV d; - const Repartition, decltype(d)> dv; - return BitCast(d, - ShiftRightSame(BitCast(dv, v), - static_cast(amt * sizeof(TFromV) * 8))); -} - -template -HWY_INLINE V SlideDownLanes(V v, size_t amt) { - const DFromV d; - const Repartition di8; - auto idx = Iota(di8, static_cast(amt * sizeof(TFromV))); - idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); - return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); -} - -} // namespace detail - -template -HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return ShiftRightLanes<3>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return ShiftRightLanes<3>(d, v); - case 4: - return ShiftRightLanes<4>(d, v); - case 5: - return ShiftRightLanes<5>(d, v); - case 6: - return ShiftRightLanes<6>(d, v); - case 7: - return ShiftRightLanes<7>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return 
ShiftRightLanes<3>(d, v); - case 4: - return ShiftRightLanes<4>(d, v); - case 5: - return ShiftRightLanes<5>(d, v); - case 6: - return ShiftRightLanes<6>(d, v); - case 7: - return ShiftRightLanes<7>(d, v); - case 8: - return ShiftRightLanes<8>(d, v); - case 9: - return ShiftRightLanes<9>(d, v); - case 10: - return ShiftRightLanes<10>(d, v); - case 11: - return ShiftRightLanes<11>(d, v); - case 12: - return ShiftRightLanes<12>(d, v); - case 13: - return ShiftRightLanes<13>(d, v); - case 14: - return ShiftRightLanes<14>(d, v); - case 15: - return ShiftRightLanes<15>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -// ================================================== COMBINE - -// ------------------------------ Combine (InterleaveLower) - -// N = N/2 + N/2 (upper half undefined) -template >> -HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { - const Half dh; - const RebindToUnsigned duh; - // Treat half-width input as one lane, and expand to two lanes. - using VU = Vec128, 2>; - const VU lo{BitCast(duh, lo_half).raw}; - const VU hi{BitCast(duh, hi_half).raw}; - return BitCast(d, InterleaveLower(lo, hi)); -} - -// ------------------------------ ZeroExtendVector (IfThenElseZero) -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { - const Half dh; - return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD{lo.raw}); -} - -// ------------------------------ ConcatLowerLower -template > -HWY_API Vec128 ConcatLowerLower(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; -} - -// ------------------------------ ConcatUpperUpper -template > -HWY_API Vec128 ConcatUpperUpper(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; -} - -// ------------------------------ ConcatLowerUpper -template > -HWY_API Vec128 ConcatLowerUpper(D d, Vec128 hi, Vec128 lo) { - return CombineShiftRightBytes<8>(d, hi, lo); -} - -// ------------------------------ ConcatUpperLower -template > -HWY_API Vec128 ConcatUpperLower(D d, Vec128 hi, Vec128 lo) { - return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); -} - -// ------------------------------ Concat partial (Combine, LowerHalf) - -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatLowerUpper(D d, const VFromD hi, - const VFromD lo) { - const Half d2; - return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); -} - -// ------------------------------ ConcatOdd - -// 8-bit full -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec128 ConcatOdd(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15, - 17, 19, 21, 23, 25, 27, 29, 31)}; -} - -// 8-bit x8 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec64 ConcatOdd(D /* tag */, Vec64 hi, Vec64 lo) { - // Don't care about upper half. - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21, - 23, 1, 3, 5, 7, 17, 19, 21, 23)}; -} - -// 8-bit x4 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec32 ConcatOdd(D /* tag */, Vec32 hi, Vec32 lo) { - // Don't care about upper 3/4. 
- return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17, - 19, 1, 3, 17, 19, 1, 3, 17, 19)}; -} - -// 16-bit full -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec128 ConcatOdd(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{ - wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)}; -} - -// 16-bit x4 -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec64 ConcatOdd(D /* tag */, Vec64 hi, Vec64 lo) { - // Don't care about upper half. - return Vec128{ - wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)}; -} - -// 32-bit full -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec128 ConcatOdd(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; -} - -// Any T x2 -template , HWY_IF_LANES_D(D, 2)> -HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { - return InterleaveUpper(d, lo, hi); -} - -// ------------------------------ ConcatEven (InterleaveLower) - -// 8-bit full -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec128 ConcatEven(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30)}; -} - -// 8-bit x8 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec64 ConcatEven(D /* tag */, Vec64 hi, Vec64 lo) { - // Don't care about upper half. - return Vec64{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22, - 0, 2, 4, 6, 16, 18, 20, 22)}; -} - -// 8-bit x4 -template , HWY_IF_T_SIZE(T, 1)> -HWY_API Vec32 ConcatEven(D /* tag */, Vec32 hi, Vec32 lo) { - // Don't care about upper 3/4. - return Vec32{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18, - 0, 2, 16, 18, 0, 2, 16, 18)}; -} - -// 16-bit full -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec128 ConcatEven(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{ - wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)}; -} - -// 16-bit x4 -template , HWY_IF_T_SIZE(T, 2)> -HWY_API Vec64 ConcatEven(D /* tag */, Vec64 hi, Vec64 lo) { - // Don't care about upper half. 
- return Vec64{wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)}; -} - -// 32-bit full -template , HWY_IF_T_SIZE(T, 4)> -HWY_API Vec128 ConcatEven(D /* tag */, Vec128 hi, Vec128 lo) { - return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; -} - -// Any T x2 -template , HWY_IF_LANES_D(D, 2)> -HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { - return InterleaveLower(d, lo, hi); -} - -// ------------------------------ DupEven (InterleaveLower) - -template -HWY_API Vec128 DupEven(Vec128 v) { - return Vec128{wasm_i8x16_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6, - 8, 8, 10, 10, 12, 12, 14, 14)}; -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6)}; -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; -} - -template -HWY_API Vec128 DupEven(const Vec128 v) { - return InterleaveLower(DFromV(), v, v); -} - -// ------------------------------ DupOdd (InterleaveUpper) - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return Vec128{wasm_i8x16_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7, - 9, 9, 11, 11, 13, 13, 15, 15)}; -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7)}; -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; -} - -template -HWY_API Vec128 DupOdd(const Vec128 v) { - return InterleaveUpper(DFromV(), v, v); -} - -// ------------------------------ OddEven - -namespace detail { - -template -HWY_INLINE Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, - const Vec128 b) { - const DFromV d; - const Repartition d8; - alignas(16) static constexpr uint8_t mask[16] = { - 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; - return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); -} -template -HWY_INLINE Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, - const Vec128 b) { - return Vec128{ - wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; -} -template -HWY_INLINE Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; -} -template -HWY_INLINE Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; -} - -} // namespace detail - -template -HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { - return detail::OddEven(hwy::SizeTag(), a, b); -} -template -HWY_API Vec128 OddEven(const Vec128 a, - const Vec128 b) { - return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; -} - -// ------------------------------ OddEvenBlocks -template -HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { - return even; -} - -// ------------------------------ SwapAdjacentBlocks - -template -HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { - return v; -} - -// ------------------------------ ReverseBlocks - -// Single block: no change -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return v; -} - -// ================================================== CONVERT - -// ------------------------------ Promotions (part w/ narrow lanes -> full) - -// Unsigned: zero-extend. 
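Editorial aside, a hedged usage sketch that is not part of the original file, for the PromoteTo overloads below: zero-extending four u8 values to u32. Rebind yields a tag with the same lane count but the narrower type, so the input is a quarter-width vector.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void WidenU8ToU32(const uint8_t* HWY_RESTRICT in,
                           uint32_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint32_t> d32;           // 4 x u32 on wasm_128
  const hn::Rebind<uint8_t, decltype(d32)> d8;   // 4 x u8
  hn::StoreU(hn::PromoteTo(d32, hn::LoadU(d8, in)), d32, out);
}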
-template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u16x8_extend_low_u8x16(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u32x4_extend_low_u16x8(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u64x2_extend_low_u32x4(v.raw)}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{ - wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u16x8_extend_low_u8x16(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u32x4_extend_low_u16x8(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u64x2_extend_low_u32x4(v.raw)}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{ - wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; -} - -// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to -// TFromD -template -HWY_API VFromD PromoteTo(D d, V v) { - const Rebind du32; - return PromoteTo(d, PromoteTo(du32, v)); -} - -// Signed: replicate sign bit. -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_i16x8_extend_low_i8x16(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_i32x4_extend_low_i16x8(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_i64x2_extend_low_i32x4(v.raw)}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{ - wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; -} - -// I8/I16 to I64: First, promote to I32, and then promote to I64 -template -HWY_API VFromD PromoteTo(D d, V v) { - const Rebind di32; - return PromoteTo(d, PromoteTo(di32, v)); -} - -template -HWY_API VFromD PromoteTo(D df32, VFromD> v) { - const Rebind du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_f64x2_convert_low_i32x4(v.raw)}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_f64x2_promote_low_f32x4(v.raw)}; -} - -// ------------------------------ PromoteUpperTo - -// Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo. -#ifdef HWY_NATIVE_PROMOTE_UPPER_TO -#undef HWY_NATIVE_PROMOTE_UPPER_TO -#else -#define HWY_NATIVE_PROMOTE_UPPER_TO -#endif - -// Unsigned: zero-extend. -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_u16x8_extend_high_u8x16(v.raw)}; -} -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_u32x4_extend_high_u16x8(v.raw)}; -} -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_u64x2_extend_high_u32x4(v.raw)}; -} - -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_u16x8_extend_high_u8x16(v.raw)}; -} -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_u32x4_extend_high_u16x8(v.raw)}; -} -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_u64x2_extend_high_u32x4(v.raw)}; -} - -// Signed: replicate sign bit. 
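Editorial aside, a hedged sketch that is not from the original file: widening one full i16 vector into two i32 vectors, using PromoteTo on the lower half and the sign-extending PromoteUpperTo overloads below on the upper half, without a separate UpperHalf step.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void WidenI16(const int16_t* HWY_RESTRICT in,
                       int32_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int16_t> d16;              // 8 x i16
  const hn::RepartitionToWide<decltype(d16)> d32;  // 4 x i32
  const hn::Half<decltype(d16)> d16h;              // lower 4 x i16
  const auto v = hn::LoadU(d16, in);
  hn::StoreU(hn::PromoteTo(d32, hn::LowerHalf(d16h, v)), d32, out);
  hn::StoreU(hn::PromoteUpperTo(d32, v), d32, out + hn::Lanes(d32));
}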
-template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_i16x8_extend_high_i8x16(v.raw)}; -} -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_i32x4_extend_high_i16x8(v.raw)}; -} -template -HWY_API VFromD PromoteUpperTo(D /* tag */, - VFromD> v) { - return VFromD{wasm_i64x2_extend_high_i32x4(v.raw)}; -} - -template -HWY_API VFromD PromoteUpperTo(D df32, VFromD> v) { - const Rebind dh; - return PromoteTo(df32, UpperHalf(dh, v)); -} - -template -HWY_API VFromD PromoteUpperTo(D df32, VFromD> v) { - const Repartition du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); -} - -template -HWY_API VFromD PromoteUpperTo(D dd, VFromD> v) { - // There is no wasm_f64x2_convert_high_i32x4. - return PromoteTo(dd, UpperHalf(Rebind(), v)); -} - -template -HWY_API VFromD PromoteUpperTo(D dd, VFromD> v) { - // There is no wasm_f64x2_promote_high_f32x4. - return PromoteTo(dd, UpperHalf(Rebind(), v)); -} - -// Generic version for <=64 bit input/output (_high is only for full vectors). -template -HWY_API VFromD PromoteUpperTo(D d, V v) { - const Rebind, decltype(d)> dh; - return PromoteTo(d, UpperHalf(dh, v)); -} - -// ------------------------------ Demotions (full -> part w/ narrow lanes) - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); - return VFromD{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); - return VFromD{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D dn, VFromD> v) { - const DFromV du32; - const RebindToSigned di32; - return DemoteTo(dn, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); -} - -template -HWY_API VFromD DemoteTo(D du8, VFromD> v) { - const DFromV du16; - const RebindToSigned di16; - return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF)))); -} - -template -HWY_API VFromD DemoteTo(D dbf16, VFromD> v) { - const Rebind di32; - const Rebind du32; // for logical shift right - const Rebind du16; - const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); - return BitCast(dbf16, DemoteTo(du16, bits_in_32)); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{wasm_f32x4_demote_f64x2_zero(v.raw)}; -} - -template >> -HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) { - const RebindToUnsigned du16; - const Repartition du32; - const VFromD b_in_even = ShiftRight<16>(BitCast(du32, b)); - return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); -} - -// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes -// above 2*N. 
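Editorial aside, a hedged sketch that is not part of the original file, showing the common full-vector case of the ReorderDemote2To overloads below: packing two i32 vectors into one saturated i16 vector. Per the op contract the output lane order may differ from the inputs; OrderedDemote2To (further below) preserves it.

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void PackI32ToI16(const int32_t* HWY_RESTRICT in,
                           int16_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int32_t> d32;                 // 4 x i32
  const hn::Repartition<int16_t, decltype(d32)> d16;  // 8 x i16
  const auto a = hn::LoadU(d32, in);
  const auto b = hn::LoadU(d32, in + hn::Lanes(d32));
  hn::StoreU(hn::ReorderDemote2To(d16, a, b), d16, out);
}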
-template -HWY_API Vec32 ReorderDemote2To(D dn, Vec32 a, - Vec32 b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, - Vec64 b) { - const Twice dn_full; - const Repartition du32_full; - - const Vec128 v_full{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; - const auto vu32_full = BitCast(du32_full, v_full); - return LowerHalf( - BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); -} -template -HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, - Vec128 b) { - return Vec128{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; -} - -template -HWY_API Vec32 ReorderDemote2To(D dn, Vec32 a, - Vec32 b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, - Vec64 b) { - const Twice dn_full; - const Repartition du32_full; - - const Vec128 v_full{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; - const auto vu32_full = BitCast(du32_full, v_full); - return LowerHalf( - BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); -} -template -HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, - Vec128 b) { - return Vec128{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const DFromV du32; - const RebindToSigned di32; - const auto max_i32 = Set(du32, 0x7FFFFFFFu); - - const auto clamped_a = BitCast(di32, Min(a, max_i32)); - const auto clamped_b = BitCast(di32, Min(b, max_i32)); - return ReorderDemote2To(dn, clamped_a, clamped_b); -} -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -// Specializations for partial vectors because i8x16_narrow_i16x8 sets lanes -// above 2*N. 
-template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, - Vec64 b) { - const Twice dn_full; - const Repartition du32_full; - - const Vec128 v_full{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; - const auto vu32_full = BitCast(du32_full, v_full); - return LowerHalf( - BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); -} -template -HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, - Vec128 b) { - return Vec128{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, - Vec64 b) { - const Twice dn_full; - const Repartition du32_full; - - const Vec128 v_full{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; - const auto vu32_full = BitCast(du32_full, v_full); - return LowerHalf( - BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); -} -template -HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, - Vec128 b) { - return Vec128{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const DFromV du16; - const RebindToSigned di16; - const auto max_i16 = Set(du16, 0x7FFFu); - - const auto clamped_a = BitCast(di16, Min(a, max_i16)); - const auto clamped_b = BitCast(di16, Min(b, max_i16)); - return ReorderDemote2To(dn, clamped_a, clamped_b); -} -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -// For already range-limited input [0, 255]. -template -HWY_API Vec128 U8FromU32(const Vec128 v) { - const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); - return Vec128{ - wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; -} - -// ------------------------------ Truncations - -template -HWY_API VFromD TruncateTo(DTo /* tag */, Vec128 v) { - // BitCast requires the same size; DTo might be u8x1 and v u16x1. 
- const Repartition, DFromV> dto; - return VFromD{BitCast(dto, v).raw}; -} - -template -HWY_API Vec16 TruncateTo(D /* tag */, Vec128 v) { - const Full128 d; - const auto v1 = BitCast(d, v); - const auto v2 = ConcatEven(d, v1, v1); - const auto v4 = ConcatEven(d, v2, v2); - return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4)))); -} - -template -HWY_API Vec32 TruncateTo(D /* tag */, Vec128 v) { - const Full128 d; - const auto v1 = BitCast(d, v); - const auto v2 = ConcatEven(d, v1, v1); - return LowerHalf(LowerHalf(ConcatEven(d, v2, v2))); -} - -template -HWY_API Vec64 TruncateTo(D /* tag */, Vec128 v) { - const Full128 d; - const auto v1 = BitCast(d, v); - return LowerHalf(ConcatEven(d, v1, v1)); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const Repartition> d; - const auto v1 = Vec128{v.raw}; - const auto v2 = ConcatEven(d, v1, v1); - const auto v3 = ConcatEven(d, v2, v2); - return VFromD{v3.raw}; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const Repartition> d; - const auto v1 = Vec128{v.raw}; - const auto v2 = ConcatEven(d, v1, v1); - return VFromD{v2.raw}; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const Repartition> d; - const auto v1 = Vec128{v.raw}; - const auto v2 = ConcatEven(d, v1, v1); - return VFromD{v2.raw}; -} - -// ------------------------------ Demotions to/from i64 - -namespace detail { -template -HWY_INLINE VFromD> DemoteFromU64MaskOutResult( - D /*dn*/, VFromD> v) { - return v; -} - -template -HWY_INLINE VFromD> DemoteFromU64MaskOutResult( - D /*dn*/, VFromD> v) { - const DFromV du64; - return And(v, - Set(du64, static_cast(hwy::HighestValue>()))); -} - -template -HWY_INLINE VFromD> DemoteFromU64Saturate( - D dn, VFromD> v) { - const Rebind du64; - const RebindToSigned di64; - constexpr int kShiftAmt = static_cast(sizeof(TFromD) * 8) - - static_cast(hwy::IsSigned>()); - - const auto too_big = BitCast( - du64, VecFromMask( - di64, Gt(BitCast(di64, ShiftRight(v)), Zero(di64)))); - return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); -} - -template -HWY_INLINE VFromD ReorderDemote2From64To32Combine(D dn, V a, V b) { - return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); -} - -} // namespace detail - -template -HWY_API VFromD DemoteTo(D dn, VFromD> v) { - const DFromV di64; - const RebindToUnsigned du64; - const RebindToUnsigned dn_u; - - // Negative values are saturated by first saturating their bitwise inverse - // and then inverting the saturation result - const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); - const auto saturated_vals = Xor( - invert_mask, - detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); - return BitCast(dn, TruncateTo(dn_u, saturated_vals)); -} - -template -HWY_API VFromD DemoteTo(D dn, VFromD> v) { - const DFromV di64; - const RebindToUnsigned du64; - - const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); - return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); -} - -template -HWY_API VFromD DemoteTo(D dn, VFromD> v) { - return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); -} - -template )> -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const DFromV di64; - 
const RebindToUnsigned du64; - const Half dnh; - - // Negative values are saturated by first saturating their bitwise inverse - // and then inverting the saturation result - const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); - const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); - const auto saturated_a = Xor( - invert_mask_a, - detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); - const auto saturated_b = Xor( - invert_mask_b, - detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); - - return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const DFromV di64; - const RebindToUnsigned du64; - const Half dnh; - - const auto saturated_a = detail::DemoteFromU64Saturate( - dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); - const auto saturated_b = detail::DemoteFromU64Saturate( - dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); - - return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const Half dnh; - - const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); - const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); - - return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); -} - -template ), class V, - HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - return ReorderDemote2To(d, a, b); -} - -template >> -HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) { - const RebindToUnsigned du16; - return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); -} - -// ------------------------------ ConvertTo - -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD{wasm_f32x4_convert_i32x4(v.raw)}; -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD{wasm_f32x4_convert_u32x4(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D dd, VFromD> v) { - // Based on wim's approach (https://stackoverflow.com/questions/41144668/) - const Repartition d32; - const Repartition d64; - - // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 - const auto k84_63 = Set(d64, 0x4530000080000000ULL); - const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); - - // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) - const auto k52 = Set(d32, 0x43300000); - const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); - - const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); - return (v_upper - k84_63_52) + v_lower; // order matters! 
-} - -namespace detail { -template -HWY_INLINE VFromD>> U64ToF64VecFast(VW w) { - const DFromV d64; - const RebindToFloat dd; - const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 - return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; -} -} // namespace detail - -template -HWY_API VFromD ConvertTo(D dd, VFromD> v) { - // Based on wim's approach (https://stackoverflow.com/questions/41144668/) - const RebindToUnsigned d64; - using VU = VFromD; - - const VU msk_lo = Set(d64, 0xFFFFFFFF); - const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 - - // Extract the 32 lowest/highest significant bits of v - const VU v_lo = And(v, msk_lo); - const VU v_hi = ShiftRight<32>(v); - - const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); - return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); -} - -// Truncates (rounds toward zero). -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD{wasm_i32x4_trunc_sat_f32x4(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(DI di, VFromD> v) { - using VI = VFromD; - using MI = MFromD; - const RebindToUnsigned du; - using VU = VFromD; - const Repartition du16; - const VI k1075 = Set(di, 1075); // biased exponent of 2^52 - - // Exponent indicates whether the number can be represented as int64_t. - const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF); - const MI in_range = BitCast(di, biased_exp) < Set(di, 1086); - - // If we were to cap the exponent at 51 and add 2^52, the number would be in - // [2^52, 2^53) and mantissa bits could be read out directly. We need to - // round-to-0 (truncate). - // Use 16-bit saturated unsigned subtraction to compute shift_mnt and - // shift_int since biased_exp[i] is a non-negative integer that is less than - // or equal to 2047. - // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be - // zero as the upper 48 bits of both k1075 and biased_exp are zero. - - const VU shift_mnt = BitCast( - du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); - const VU shift_int = BitCast( - du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); - const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1); - // Include implicit 1-bit - VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; - // WASM clamps shift count; zero if greater. - const MI tiny = BitCast(di, shift_mnt) > Set(di, 63); - int53 = IfThenZeroElse(RebindMask(du, tiny), int53); - - // For inputs larger than 2^53 - 1, insert zeros at the bottom. - // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be - // shifted out of the left shift result below as shift_int[i] <= 10 is true - // for any inputs that are less than 2^63. - const VU shifted = int53 << shift_int; - - // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. - const VI sign_mask = BroadcastSignBit(BitCast(di, v)); - const VI limit = Set(di, LimitsMax()) - sign_mask; - const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit); - - // If the input was negative, negate the integer (two's complement). 
- return (magnitude ^ sign_mask) - sign_mask; -} - -// ------------------------------ NearestInt (Round) -template -HWY_API Vec128 NearestInt(const Vec128 v) { - return ConvertTo(RebindToSigned>(), Round(v)); -} - -// ================================================== MISC - -// ------------------------------ SumsOf8 (ShiftRight, Add) -template -HWY_API Vec128 SumsOf8(const Vec128 v) { - const DFromV du8; - const RepartitionToWide du16; - const RepartitionToWide du32; - const RepartitionToWide du64; - using VU16 = VFromD; - - const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); - const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF)); - const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); - - const VU16 szz_FE_zz_BA_zz_76_zz_32 = - BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); - const VU16 sxx_FC_xx_B8_xx_74_xx_30 = - Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); - const VU16 szz_zz_xx_FC_zz_zz_xx_74 = - BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); - const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = - Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); - return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF)); -} - -// ------------------------------ LoadMaskBits (TestBit) - -namespace detail { - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { - const RebindToUnsigned du; - // Easier than Set(), which would require an >8-bit type, which would not - // compile for T=uint8_t, N=1. - const VFromD vbits{wasm_i32x4_splat(static_cast(bits))}; - - // Replicate bytes 8x such that each byte contains the bit that governs it. - alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1}; - const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); - - alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, - 1, 2, 4, 8, 16, 32, 64, 128}; - return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - return RebindMask( - d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; - return RebindMask( - d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; - return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); -} - -} // namespace detail - -// `p` points to at least 8 readable bytes, not all of which need be valid. 
-template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - uint64_t mask_bits = 0; - CopyBytes<(MaxLanes(d) + 7) / 8>(bits, &mask_bits); - return detail::LoadMaskBits(d, mask_bits); -} - -// ------------------------------ Mask - -namespace detail { - -// Full -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { - alignas(16) uint64_t lanes[2]; - wasm_v128_store(lanes, mask.raw); - - constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - const uint64_t lo = ((lanes[0] * kMagic) >> 56); - const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; - return (hi + lo); -} - -// 64-bit -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { - constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * - kMagic) >> - 56; -} - -// 32-bit or less: need masking -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { - uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); - // Clear potentially undefined bytes. - bytes &= (1ULL << (N * 8)) - 1; - constexpr uint64_t kMagic = 0x103070F1F3F80ULL; - return (bytes * kMagic) >> 56; -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, - const Mask128 mask) { - // Remove useless lower half of each u16 while preserving the sign bit. - const __i16x8 zero = wasm_i16x8_splat(0); - const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; - return BitsFromMask(hwy::SizeTag<1>(), mask8); -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, - const Mask128 mask) { - const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); - const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); - const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); - alignas(16) uint32_t lanes[4]; - wasm_v128_store(lanes, sliced_mask); - return lanes[0] | lanes[1] | lanes[2] | lanes[3]; -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, - const Mask128 mask) { - const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); - const __i64x2 slice = wasm_i64x2_make(1, 2); - const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); - alignas(16) uint64_t lanes[2]; - wasm_v128_store(lanes, sliced_mask); - return lanes[0] | lanes[1]; -} - -// Returns the lowest N bits for the BitsFromMask result. -template -constexpr uint64_t OnlyActive(uint64_t bits) { - return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); -} - -// Returns 0xFF for bytes with index >= N, otherwise 0. -template -constexpr __i8x16 BytesAbove() { - return /**/ - (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) - : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) - : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) - : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) - : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) - : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) - : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) - : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) - : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) - : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1) - : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1) - : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1) - : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, - -1, -1, -1) - : (N == 9) ? 
wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, - -1, -1, -1) - : (N == 11) - ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) - : (N == 13) - ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) - : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); -} - -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { - return PopCount(BitsFromMask(tag, m)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { - return PopCount(BitsFromMask(tag, m)); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { - const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); - const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); - alignas(16) uint64_t lanes[2]; - wasm_v128_store(lanes, shifted_bits); - return PopCount(lanes[0] | lanes[1]); -} - -template -HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { - alignas(16) int64_t lanes[2]; - wasm_v128_store(lanes, m.raw); - return static_cast(-(lanes[0] + lanes[1])); -} - -} // namespace detail - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, const MFromD mask, uint8_t* bits) { - const uint64_t mask_bits = detail::BitsFromMask(mask); - const size_t kNumBytes = (d.MaxLanes() + 7) / 8; - CopyBytes(&mask_bits, bits); - return kNumBytes; -} - -template -HWY_API size_t CountTrue(D /* tag */, const MFromD m) { - return detail::CountTrue(hwy::SizeTag)>(), m); -} - -// Partial -template , HWY_IF_V_SIZE_LE_D(D, 8)> -HWY_API size_t CountTrue(D d, MFromD m) { - // Ensure all undefined bytes are 0. - const MFromD mask{detail::BytesAbove()}; - const Full128 dfull; - return CountTrue(dfull, Mask128{AndNot(mask, m).raw}); -} - -// Full vector -template -HWY_API bool AllFalse(D d, const MFromD m) { - const auto v8 = BitCast(Full128(), VecFromMask(d, m)); - return !wasm_v128_any_true(v8.raw); -} - -// Full vector -namespace detail { -template -HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { - return wasm_i8x16_all_true(m.raw); -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { - return wasm_i16x8_all_true(m.raw); -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { - return wasm_i32x4_all_true(m.raw); -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { - return wasm_i64x2_all_true(m.raw); -} - -} // namespace detail - -template > -HWY_API bool AllTrue(D /* tag */, const Mask128 m) { - return detail::AllTrue(hwy::SizeTag(), m); -} - -// Partial vectors - -template , HWY_IF_V_SIZE_LE_D(D, 8)> -HWY_API bool AllFalse(D d, const MFromD m) { - // Ensure all undefined bytes are 0. - const MFromD mask{detail::BytesAbove()}; - return AllFalse(Full128(), Mask128{AndNot(mask, m).raw}); -} - -template , HWY_IF_V_SIZE_LE_D(D, 8)> -HWY_API bool AllTrue(D d, const MFromD m) { - // Ensure all undefined bytes are FF. 
- const MFromD mask{detail::BytesAbove()}; - return AllTrue(Full128(), Mask128{Or(mask, m).raw}); -} - -template -HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); - return Num0BitsBelowLS1Bit_Nonzero32(bits); -} - -template -HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); - return bits ? static_cast(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1; -} - -template -HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); - return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits); -} - -template -HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD mask) { - const uint32_t bits = static_cast(detail::BitsFromMask(mask)); - return bits - ? (31 - static_cast(Num0BitsAboveMS1Bit_Nonzero32(bits))) - : -1; -} - -// ------------------------------ Compress - -namespace detail { - -template -HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Simd d; - const Rebind d8; - const Simd du; - - // We need byte indices for TableLookupBytes (one vector's worth for each of - // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We - // can instead store lane indices and convert to byte indices (2*lane + 0..1), - // with the doubling baked into the table. Unpacking nibbles is likely more - // costly than the higher cache footprint from storing bytes. - alignas(16) static constexpr uint8_t table[256 * 8] = { - // PrintCompress16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // - 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // - 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // - 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // - 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // - 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // - 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // - 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // - 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // - 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // - 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // - 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // - 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // - 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // - 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // - 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // - 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // - 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // - 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // - 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // - 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // - 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // - 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // - 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // - 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // - 
4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // - 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // - 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // - 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // - 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // - 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // - 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // - 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // - 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // - 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // - 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // - 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // - 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // - 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // - 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // - 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // - 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // - 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // - 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // - 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // - 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // - 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // - 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // - 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // - 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // - 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // - 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // - 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // - 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // - 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // - 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // - 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // - 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // - 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // - 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // - 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // - 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // - 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // - 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // - 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // - 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // - 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // - 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // - 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // - 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // - 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // - 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // - 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // - 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // - 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // - 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // - 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // - 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // - 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 
2, 6, 10, 14, 4, 8, 12, // - 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // - 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // - 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // - 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // - 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // - 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // - 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // - 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // - 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // - 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // - 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // - 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // - 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // - 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // - 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // - 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // - 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // - 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // - 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // - 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // - 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // - 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // - 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // - 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // - 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // - 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // - 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // - 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // - 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // - 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // - 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // - 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // - 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // - 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // - 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // - 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // - 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // - 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // - 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // - 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // - 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // - 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; - const Vec128 pairs = ZipLower(byte_idx, byte_idx); - return BitCast(d, pairs + Set(du, 0x0100)); -} - -template -HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Simd d; - const Rebind d8; - const Simd du; - - // We need byte indices for TableLookupBytes (one vector's worth for each of - // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We - // can instead store lane indices and convert to byte indices (2*lane + 0..1), - // with the doubling baked into the table. Unpacking nibbles is likely more - // costly than the higher cache footprint from storing bytes. 
- alignas(16) static constexpr uint8_t table[256 * 8] = { - // PrintCompressNot16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // - 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // - 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // - 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // - 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // - 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // - 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // - 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // - 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // - 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // - 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // - 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // - 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // - 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // - 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // - 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // - 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // - 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // - 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // - 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // - 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // - 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // - 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // - 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // - 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // - 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // - 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // - 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // - 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // - 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // - 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // - 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // - 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // - 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // - 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // - 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // - 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // - 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // - 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // - 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // - 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // - 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // - 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // - 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // - 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // - 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // - 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // - 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // - 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // - 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // - 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // - 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // - 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // - 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // - 0, 2, 
8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // - 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // - 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // - 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // - 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // - 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // - 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // - 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // - 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // - 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // - 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // - 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // - 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // - 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // - 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // - 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // - 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // - 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // - 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // - 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // - 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // - 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // - 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // - 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // - 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // - 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // - 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // - 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // - 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // - 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // - 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // - 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // - 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // - 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // - 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // - 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // - 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // - 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // - 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // - 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // - 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // - 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // - 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // - 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // - 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // - 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // - 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // - 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // - 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // - 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // - 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // - 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // - 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // - 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 
0, 2, 6, 8, 12, 14, // - 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // - 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // - 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // - 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // - 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // - 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // - 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // - 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // - 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // - 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // - 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // - 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // - 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; - const Vec128 pairs = ZipLower(byte_idx, byte_idx); - return BitCast(d, pairs + Set(du, 0x0100)); -} - -template -HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { - // PrintCompress32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // - 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // - 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // - 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // - 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. 
- alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { - // PrintCompressNot32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, - 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15}; - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { - // PrintCompress64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { - // PrintCompressNot64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Simd d; - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -// Helper functions called by both Compress and CompressStore - avoids a -// redundant BitsFromMask in the latter. - -template -HWY_INLINE Vec128 Compress(Vec128 v, const uint64_t mask_bits) { - const auto idx = detail::IdxFromBits(mask_bits); - const DFromV d; - const RebindToSigned di; - return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); -} - -template -HWY_INLINE Vec128 CompressNot(Vec128 v, const uint64_t mask_bits) { - const auto idx = detail::IdxFromNotBits(mask_bits); - const DFromV d; - const RebindToSigned di; - return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); -} - -} // namespace detail - -template -struct CompressIsPartition { -#if HWY_TARGET == HWY_WASM_EMU256 - enum { value = 0 }; -#else - enum { value = (sizeof(T) != 1) }; -#endif -}; - -// Single lane: no-op -template -HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
- const Full128 d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskL, maskH); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -// General case, 2 or 4 byte lanes -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::Compress(v, detail::BitsFromMask(mask)); -} - -// Single lane: no-op -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. - const Full128 d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskH, maskL); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -// General case, 2 or 4 byte lanes -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // For partial vectors, we cannot pull the Not() into the table because - // BitsFromMask clears the upper bits. - if (N < 16 / sizeof(T)) { - return detail::Compress(v, detail::BitsFromMask(Not(mask))); - } - return detail::CompressNot(v, detail::BitsFromMask(mask)); -} - -// ------------------------------ CompressBlocksNot -HWY_API Vec128 CompressBlocksNot(Vec128 v, - Mask128 /* m */) { - return v; -} - -// ------------------------------ CompressBits -template -HWY_API Vec128 CompressBits(Vec128 v, - const uint8_t* HWY_RESTRICT bits) { - uint64_t mask_bits = 0; - constexpr size_t kNumBytes = (N + 7) / 8; - CopyBytes(bits, &mask_bits); - if (N < 8) { - mask_bits &= (1ull << N) - 1; - } - - return detail::Compress(v, mask_bits); -} - -// ------------------------------ CompressStore -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(mask); - const auto c = detail::Compress(v, mask_bits); - StoreU(c, d, unaligned); - return PopCount(mask_bits); -} - -// ------------------------------ CompressBlendedStore -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; // so we can support fp16/bf16 - const uint64_t mask_bits = detail::BitsFromMask(m); - const size_t count = PopCount(mask_bits); - const VFromD compressed = - detail::Compress(BitCast(du, v), mask_bits); - const MFromD store_mask = RebindMask(d, FirstN(du, count)); - BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); - return count; -} - -// ------------------------------ CompressBitsStore - -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - uint64_t mask_bits = 0; - constexpr size_t kN = MaxLanes(d); - CopyBytes<(kN + 7) / 8>(bits, &mask_bits); - if (kN < 8) { - mask_bits &= (1ull << kN) - 1; - } - - const auto c = detail::Compress(v, mask_bits); - StoreU(c, d, unaligned); - return PopCount(mask_bits); -} - -// ------------------------------ StoreInterleaved2/3/4 - -// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in -// generic_ops-inl.h. 
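The Compress/CompressStore family above is table-driven: BitsFromMask packs the mask
into an integer, which then indexes precomputed shuffle bytes for TableLookupBytes.
As a reference for the intended semantics only, here is a minimal scalar sketch (the
name CompressStoreScalar is hypothetical, not part of Highway): lanes whose mask bit
is set are packed toward lane 0, and the number of lanes written is returned, which
matches the PopCount of the mask bits.

// Editor's sketch of the semantics, not Highway code.
#include <stddef.h>
#include <stdint.h>

template <typename T, size_t N>
size_t CompressStoreScalar(const T (&lanes)[N], uint64_t mask_bits, T* out) {
  size_t count = 0;
  for (size_t i = 0; i < N; ++i) {
    if ((mask_bits >> i) & 1) {  // bit i governs lane i
      out[count++] = lanes[i];
    }
  }
  return count;  // equals PopCount(mask_bits) restricted to the N lanes
}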
- -// ------------------------------ Additional mask logical operations -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const FixedTag d; - const auto vmask = VecFromMask(d, mask); - return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Simd d; - const auto vmask = VecFromMask(d, mask); - const auto neg_vmask = - ResizeBitCast(d, Neg(ResizeBitCast(Full64(), vmask))); - return MaskFromVec(Or(vmask, neg_vmask)); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Full128 d; - const Repartition di64; - - auto vmask = BitCast(di64, VecFromMask(d, mask)); - vmask = Or(vmask, Neg(vmask)); - - // Copy the sign bit of the first int64_t lane to the second int64_t lane - const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask)); - return MaskFromVec(BitCast(d, Or(vmask, vmask2))); -} - -template -HWY_API Mask128 SetBeforeFirst(Mask128 mask) { - return Not(SetAtOrAfterFirst(mask)); -} - -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const FixedTag d; - const RebindToSigned di; - - const auto vmask = BitCast(di, VecFromMask(d, mask)); - const auto zero = Zero(di); - const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); - return MaskFromVec(BitCast(d, And(vmask, vmask2))); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Simd d; - const RebindToSigned di; - - const auto vmask = ResizeBitCast(Full64(), VecFromMask(d, mask)); - const auto only_first_vmask = - BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); - return MaskFromVec(only_first_vmask); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Full128 d; - const RebindToSigned di; - const Repartition di64; - - const auto zero = Zero(di64); - const auto vmask = BitCast(di64, VecFromMask(d, mask)); - const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); - const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); - return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); -} - -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { - const FixedTag d; - const RebindToSigned di; - using TI = MakeSigned; - - return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); -} -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { - const Simd d; - return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); -} - -// ------------------------------ MulEven/Odd (Load) - -HWY_INLINE Vec128 MulEven(const Vec128 a, - const Vec128 b) { - alignas(16) uint64_t mul[2]; - mul[0] = - Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 0)), - static_cast(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); - return Load(Full128(), mul); -} - -HWY_INLINE Vec128 MulOdd(const Vec128 a, - const Vec128 b) { - alignas(16) uint64_t mul[2]; - mul[0] = - Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 1)), - static_cast(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); - return Load(Full128(), mul); -} - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) - -// Generic for all vector lengths. 
-template >> -HWY_API VFromD WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { - const Rebind du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 - // Using shift/and instead of Zip leads to the odd/even order that - // RearrangeToOddPlusEven prefers. - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - return Mul(BitCast(df32, ae), BitCast(df32, be)) + - Mul(BitCast(df32, ao), BitCast(df32, bo)); -} - -template >> -HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, - const VFromD sum0, - VFromD& sum1) { - const Rebind du32; - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 - // Using shift/and instead of Zip leads to the odd/even order that - // RearrangeToOddPlusEven prefers. - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); -} - -// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is -// safe. -template >> -HWY_API VFromD WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { - return VFromD{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; -} - -template >> -HWY_API VFromD WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { - const auto lo16_mask = Set(du32, 0x0000FFFFu); - - const auto a0 = And(BitCast(du32, a), lo16_mask); - const auto b0 = And(BitCast(du32, b), lo16_mask); - - const auto a1 = ShiftRight<16>(BitCast(du32, a)); - const auto b1 = ShiftRight<16>(BitCast(du32, b)); - - return MulAdd(a1, b1, a0 * b0); -} - -// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is -// safe. -template >> -HWY_API VFromD ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, - const VFromD sum0, - VFromD& /*sum1*/) { - return sum0 + WidenMulPairwiseAdd(d, a, b); -} - -// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is -// safe. 
-template >> -HWY_API VFromD ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b, - const VFromD sum0, - VFromD& /*sum1*/) { - return sum0 + WidenMulPairwiseAdd(d, a, b); -} - -// ------------------------------ RearrangeToOddPlusEven -template -HWY_API Vec128 RearrangeToOddPlusEven( - const Vec128 sum0, const Vec128 /*sum1*/) { - return sum0; // invariant already holds -} - -template -HWY_API Vec128 RearrangeToOddPlusEven( - const Vec128 sum0, const Vec128 /*sum1*/) { - return sum0; // invariant already holds -} - -template -HWY_API Vec128 RearrangeToOddPlusEven(const Vec128 sum0, - const Vec128 sum1) { - return Add(sum0, sum1); -} - -// ------------------------------ Reductions - -namespace detail { - -// N=1: no-op -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v) { - return v; -} - -// N=2 -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v10) { - const DFromV d; - return Add(v10, Reverse2(d, v10)); -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v10) { - const DFromV d; - return Min(v10, Reverse2(d, v10)); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v10) { - const DFromV d; - return Max(v10, Reverse2(d, v10)); -} - -// N=4 (only 16/32-bit, else >128-bit) -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Add(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Add(v03_12_12_03, v12_03_03_12); -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Min(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Min(v03_12_12_03, v12_03_03_12); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Max(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Max(v03_12_12_03, v12_03_03_12); -} - -// N=8 (only 16-bit, else >128-bit) -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. - const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. - const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. 
- const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} - -template -HWY_INLINE T ReduceSum(Vec128 v) { - return GetLane(SumOfLanes(v)); -} - -} // namespace detail - -template -HWY_API VFromD SumOfLanes(D /* tag */, VFromD v) { - return detail::SumOfLanes(v); -} -template -HWY_API TFromD ReduceSum(D /* tag */, VFromD v) { - return detail::ReduceSum(v); -} -template -HWY_API VFromD MinOfLanes(D /* tag */, VFromD v) { - return detail::MinOfLanes(v); -} -template -HWY_API VFromD MaxOfLanes(D /* tag */, VFromD v) { - return detail::MaxOfLanes(v); -} - -// ------------------------------ Lt128 - -template -HWY_INLINE MFromD Lt128(D d, VFromD a, VFromD b) { - // Truth table of Eq and Lt for Hi and Lo u64. - // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) - // =H =L cH cL | out = cH | (=H & cL) - // 0 0 0 0 | 0 - // 0 0 0 1 | 0 - // 0 0 1 0 | 1 - // 0 0 1 1 | 1 - // 0 1 0 0 | 0 - // 0 1 0 1 | 0 - // 0 1 1 0 | 1 - // 1 0 0 0 | 0 - // 1 0 0 1 | 1 - // 1 1 0 0 | 0 - const MFromD eqHL = Eq(a, b); - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - // We need to bring cL to the upper lane/bit corresponding to cH. Comparing - // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the - // comparison result leftwards requires only 4. IfThenElse compiles to the - // same code as OrAnd(). - const VFromD ltLx = DupEven(ltHL); - const VFromD outHx = IfThenElse(eqHL, ltLx, ltHL); - return MaskFromVec(DupOdd(outHx)); -} - -template -HWY_INLINE MFromD Lt128Upper(D d, VFromD a, VFromD b) { - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); -} - -// ------------------------------ Eq128 - -template -HWY_INLINE MFromD Eq128(D d, VFromD a, VFromD b) { - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); -} - -template -HWY_INLINE MFromD Eq128Upper(D d, VFromD a, VFromD b) { - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); -} - -// ------------------------------ Ne128 - -template -HWY_INLINE MFromD Ne128(D d, VFromD a, VFromD b) { - const VFromD neHL = VecFromMask(d, Ne(a, b)); - return MaskFromVec(Or(Reverse2(d, neHL), neHL)); -} - -template -HWY_INLINE MFromD Ne128Upper(D d, VFromD a, VFromD b) { - const VFromD neHL = VecFromMask(d, Ne(a, b)); - return MaskFromVec(InterleaveUpper(d, neHL, neHL)); -} - -// ------------------------------ Min128, Max128 (Lt128) - -// Without a native OddEven, it seems infeasible to go faster than Lt128. 
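The Lt128 implementation above encodes the usual two-word unsigned comparison,
cH | (=H & cL), using vector masks; the Min128/Max128 wrappers that follow then
merely select on that predicate. A scalar restatement for reference (the U128 struct
and function names are illustrative, not Highway API):

#include <stdint.h>

struct U128 {  // one 128-bit lane as two 64-bit halves
  uint64_t hi;
  uint64_t lo;
};

// a < b, unsigned: the upper half decides unless equal, then the lower half decides.
inline bool Lt128Scalar(U128 a, U128 b) {
  return (a.hi < b.hi) || (a.hi == b.hi && a.lo < b.lo);
}

// Min128/Max128 below reduce to a select on this predicate.
inline U128 Min128Scalar(U128 a, U128 b) { return Lt128Scalar(a, b) ? a : b; }
inline U128 Max128Scalar(U128 a, U128 b) { return Lt128Scalar(b, a) ? a : b; }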
-template -HWY_INLINE VFromD Min128(D d, const VFromD a, const VFromD b) { - return IfThenElse(Lt128(d, a, b), a, b); -} - -template -HWY_INLINE VFromD Max128(D d, const VFromD a, const VFromD b) { - return IfThenElse(Lt128(d, b, a), a, b); -} - -template -HWY_INLINE VFromD Min128Upper(D d, const VFromD a, const VFromD b) { - return IfThenElse(Lt128Upper(d, a, b), a, b); -} - -template -HWY_INLINE VFromD Max128Upper(D d, const VFromD a, const VFromD b) { - return IfThenElse(Lt128Upper(d, b, a), a, b); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); diff --git a/deps/highway/include/hwy/ops/x86_128-inl.h b/deps/highway/include/hwy/ops/x86_128-inl.h deleted file mode 100644 index fd944ffe..00000000 --- a/deps/highway/include/hwy/ops/x86_128-inl.h +++ /dev/null @@ -1,10756 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL -// operations when compiling for those targets. -// External include guard in highway.h - see comment there. - -// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL -#include "hwy/base.h" - -// Avoid uninitialized warnings in GCC's emmintrin.h - see -// https://github.com/google/highway/issues/710 and pull/902 -HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_GCC_ACTUAL -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") -HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, - ignored "-Wmaybe-uninitialized") -#endif - -#include -#include -#if HWY_TARGET == HWY_SSSE3 -#include // SSSE3 -#elif HWY_TARGET <= HWY_SSE4 -#include // SSE4 -#ifndef HWY_DISABLE_PCLMUL_AES -#include // CLMUL -#endif -#endif - -#include "hwy/ops/shared-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { -namespace detail { - -template -struct Raw128 { - using type = __m128i; -}; -#if HWY_HAVE_FLOAT16 -template <> -struct Raw128 { - using type = __m128h; -}; -#endif // HWY_HAVE_FLOAT16 -template <> -struct Raw128 { - using type = __m128; -}; -template <> -struct Raw128 { - using type = __m128d; -}; - -} // namespace detail - -template -class Vec128 { - using Raw = typename detail::Raw128::type; - - public: - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = N; // only for DFromV - - // Compound assignment. Only usable if there is a corresponding non-member - // binary operator overload. For example, only f32 and f64 support division. 
- HWY_INLINE Vec128& operator*=(const Vec128 other) { - return *this = (*this * other); - } - HWY_INLINE Vec128& operator/=(const Vec128 other) { - return *this = (*this / other); - } - HWY_INLINE Vec128& operator+=(const Vec128 other) { - return *this = (*this + other); - } - HWY_INLINE Vec128& operator-=(const Vec128 other) { - return *this = (*this - other); - } - HWY_INLINE Vec128& operator&=(const Vec128 other) { - return *this = (*this & other); - } - HWY_INLINE Vec128& operator|=(const Vec128 other) { - return *this = (*this | other); - } - HWY_INLINE Vec128& operator^=(const Vec128 other) { - return *this = (*this ^ other); - } - - Raw raw; -}; - -template -using Vec64 = Vec128; - -template -using Vec32 = Vec128; - -template -using Vec16 = Vec128; - -#if HWY_TARGET <= HWY_AVX3 - -namespace detail { - -// Template arg: sizeof(lane type) -template -struct RawMask128 {}; -template <> -struct RawMask128<1> { - using type = __mmask16; -}; -template <> -struct RawMask128<2> { - using type = __mmask8; -}; -template <> -struct RawMask128<4> { - using type = __mmask8; -}; -template <> -struct RawMask128<8> { - using type = __mmask8; -}; - -} // namespace detail - -template -struct Mask128 { - using Raw = typename detail::RawMask128::type; - - static Mask128 FromBits(uint64_t mask_bits) { - return Mask128{static_cast(mask_bits)}; - } - - Raw raw; -}; - -#else // AVX2 or below - -// FF..FF or 0. -template -struct Mask128 { - typename detail::Raw128::type raw; -}; - -#endif // AVX2 or below - -namespace detail { - -// Returns the lowest N of the _mm_movemask* bits. -template -constexpr uint64_t OnlyActive(uint64_t mask_bits) { - return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); -} - -} // namespace detail - -#if HWY_TARGET <= HWY_AVX3 -namespace detail { - -// Used by Expand() emulation, which is required for both AVX3 and AVX2. -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(mask.raw); -} - -} // namespace detail -#endif // HWY_TARGET <= HWY_AVX3 - -template -using DFromV = Simd; - -template -using TFromV = typename V::PrivateT; - -// ------------------------------ Zero - -// Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. -template -HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { - return Vec128, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()}; -} -template -HWY_API Vec128 Zero(D /* tag */) { - return Vec128{_mm_setzero_si128()}; -} -template -HWY_API Vec128 Zero(D /* tag */) { -#if HWY_HAVE_FLOAT16 - return Vec128{_mm_setzero_ph()}; -#else - return Vec128{_mm_setzero_si128()}; -#endif -} -template -HWY_API Vec128 Zero(D /* tag */) { - return Vec128{_mm_setzero_ps()}; -} -template -HWY_API Vec128 Zero(D /* tag */) { - return Vec128{_mm_setzero_pd()}; -} - -// Using the existing Zero function instead of a dedicated function for -// deduction avoids having to forward-declare Vec256 here. 
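The comment above describes the deduction idiom used for the alias on the next line:
because Zero already has one overload per descriptor, the vector type can be derived
from its return type instead of being forward-declared. A standalone sketch of the
idiom (the DescU32/VecU32/ZeroSketch names are placeholders, not Highway types):

#include <stdint.h>

struct DescU32 {};                      // stand-in for a Highway descriptor tag
struct VecU32 { uint32_t raw[4]; };     // stand-in for the corresponding vector

inline VecU32 ZeroSketch(DescU32) { return VecU32{}; }  // one overload per type

template <class D>
using VFromDSketch = decltype(ZeroSketch(D()));  // deduced, e.g. VecU32

static_assert(sizeof(VFromDSketch<DescU32>) == 16, "vector type deduced from Zero");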
-template -using VFromD = decltype(Zero(D())); - -// ------------------------------ Tuple (VFromD) -#include "hwy/ops/tuple-inl.h" - -// ------------------------------ BitCast - -namespace detail { - -HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; } -#if HWY_HAVE_FLOAT16 -HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); } -#endif // HWY_HAVE_FLOAT16 -HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); } -HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); } - -template -HWY_INLINE Vec128 BitCastToByte(Vec128 v) { - return Vec128{BitCastToInteger(v.raw)}; -} - -// Cannot rely on function overloading because return types differ. -template -struct BitCastFromInteger128 { - HWY_INLINE __m128i operator()(__m128i v) { return v; } -}; -#if HWY_HAVE_FLOAT16 -template <> -struct BitCastFromInteger128 { - HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); } -}; -#endif // HWY_HAVE_FLOAT16 -template <> -struct BitCastFromInteger128 { - HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); } -}; -template <> -struct BitCastFromInteger128 { - HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); } -}; - -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, - Vec128 v) { - return VFromD{BitCastFromInteger128>()(v.raw)}; -} - -} // namespace detail - -template -HWY_API VFromD BitCast(D d, - Vec128().MaxLanes()> v) { - return detail::BitCastFromByte(d, detail::BitCastToByte(v)); -} - -// ------------------------------ Set - -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm_set1_epi8(static_cast(t))}; // NOLINT -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm_set1_epi16(static_cast(t))}; // NOLINT -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm_set1_epi32(static_cast(t))}; -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm_set1_epi64x(static_cast(t))}; // NOLINT -} -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD Set(D /* tag */, float16_t t) { - return VFromD{_mm_set1_ph(t)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API VFromD Set(D /* tag */, float t) { - return VFromD{_mm_set1_ps(t)}; -} -template -HWY_API VFromD Set(D /* tag */, double t) { - return VFromD{_mm_set1_pd(t)}; -} - -// Generic for all vector lengths. -template -HWY_API VFromD Set(D df, TFromD t) { - const RebindToUnsigned du; - static_assert(sizeof(TFromD) == 2, "Expecting [b]f16"); - uint16_t bits; - CopyBytes<2>(&t, &bits); - return BitCast(df, Set(du, bits)); -} - -// ------------------------------ Undefined - -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") - -// Returns a vector with uninitialized elements. -template -HWY_API VFromD Undefined(D /* tag */) { - // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC - // generate an XOR instruction. 
- return VFromD{_mm_undefined_si128()}; -} -template -HWY_API VFromD Undefined(D /* tag */) { - return VFromD{_mm_undefined_si128()}; -} -template -HWY_API VFromD Undefined(D /* tag */) { -#if HWY_HAVE_FLOAT16 - return VFromD{_mm_undefined_ph()}; -#else - return VFromD{_mm_undefined_si128()}; -#endif -} -template -HWY_API VFromD Undefined(D /* tag */) { - return VFromD{_mm_undefined_ps()}; -} -template -HWY_API VFromD Undefined(D /* tag */) { - return VFromD{_mm_undefined_pd()}; -} - -HWY_DIAGNOSTICS(pop) - -// ------------------------------ GetLane - -template -HWY_API T GetLane(const Vec128 v) { - return static_cast(_mm_cvtsi128_si32(v.raw) & 0xFF); -} -template -HWY_API T GetLane(const Vec128 v) { - return static_cast(_mm_cvtsi128_si32(v.raw) & 0xFFFF); -} -template -HWY_API T GetLane(const Vec128 v) { - return static_cast(_mm_cvtsi128_si32(v.raw)); -} -template -HWY_API float GetLane(const Vec128 v) { - return _mm_cvtss_f32(v.raw); -} -template -HWY_API T GetLane(const Vec128 v) { -#if HWY_ARCH_X86_32 - const DFromV d; - alignas(16) T lanes[2]; - Store(v, d, lanes); - return lanes[0]; -#else - return static_cast(_mm_cvtsi128_si64(v.raw)); -#endif -} -template -HWY_API double GetLane(const Vec128 v) { - return _mm_cvtsd_f64(v.raw); -} - -// ------------------------------ ResizeBitCast - -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - const Repartition du8; - return BitCast(d, VFromD{detail::BitCastToInteger(v.raw)}); -} - -// ================================================== LOGICAL - -// ------------------------------ And - -template -HWY_API Vec128 And(Vec128 a, Vec128 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm_and_si128(a.raw, b.raw)}); -} -template -HWY_API Vec128 And(Vec128 a, Vec128 b) { - return Vec128{_mm_and_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 And(Vec128 a, Vec128 b) { - return Vec128{_mm_and_pd(a.raw, b.raw)}; -} - -// ------------------------------ AndNot - -// Returns ~not_mask & mask. 
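// Argument-order sketch for AndNot (illustrative, not authoritative; assumes a
// HWY_NAMESPACE target region with unsigned 32-bit lanes):
//   const Full128<uint32_t> d;
//   const auto lo8 = Set(d, 0xFFu);
//   const auto v = Iota(d, 0x1234u);
//   const auto cleared = AndNot(lo8, v);  // ~0xFF & v: clears the low byte
//   const auto kept = And(lo8, v);        // keeps only the low byte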
-template -HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast( - d, VFromD{_mm_andnot_si128(not_mask.raw, mask.raw)}); -} -template -HWY_API Vec128 AndNot(Vec128 not_mask, - Vec128 mask) { - return Vec128{_mm_andnot_ps(not_mask.raw, mask.raw)}; -} -template -HWY_API Vec128 AndNot(Vec128 not_mask, - Vec128 mask) { - return Vec128{_mm_andnot_pd(not_mask.raw, mask.raw)}; -} - -// ------------------------------ Or - -template -HWY_API Vec128 Or(Vec128 a, Vec128 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm_or_si128(a.raw, b.raw)}); -} - -template -HWY_API Vec128 Or(Vec128 a, Vec128 b) { - return Vec128{_mm_or_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 Or(Vec128 a, Vec128 b) { - return Vec128{_mm_or_pd(a.raw, b.raw)}; -} - -// ------------------------------ Xor - -template -HWY_API Vec128 Xor(Vec128 a, Vec128 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm_xor_si128(a.raw, b.raw)}); -} - -template -HWY_API Vec128 Xor(Vec128 a, Vec128 b) { - return Vec128{_mm_xor_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 Xor(Vec128 a, Vec128 b) { - return Vec128{_mm_xor_pd(a.raw, b.raw)}; -} - -// ------------------------------ Not -template -HWY_API Vec128 Not(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; -#if HWY_TARGET <= HWY_AVX3 - const __m128i vu = BitCast(du, v).raw; - return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)}); -#else - return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)})); -#endif -} - -// ------------------------------ Xor3 -template -HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m128i ret = _mm_ternarylogic_epi64( - BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); - return BitCast(d, VU{ret}); -#else - return Xor(x1, Xor(x2, x3)); -#endif -} - -// ------------------------------ Or3 -template -HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m128i ret = _mm_ternarylogic_epi64( - BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); - return BitCast(d, VU{ret}); -#else - return Or(o1, Or(o2, o3)); -#endif -} - -// ------------------------------ OrAnd -template -HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m128i ret = _mm_ternarylogic_epi64( - BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); - return BitCast(d, VU{ret}); -#else - return Or(o, And(a1, a2)); -#endif -} - -// ------------------------------ IfVecThenElse -template -HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, - Vec128 no) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast( - d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw, - BitCast(du, no).raw, 0xCA)}); -#else - return IfThenElse(MaskFromVec(mask), yes, no); -#endif -} - -// ------------------------------ BitwiseIfThenElse -#if HWY_TARGET <= HWY_AVX3 - -#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE -#else -#define HWY_NATIVE_BITWISE_IF_THEN_ELSE -#endif - -template -HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { - 
return IfVecThenElse(mask, yes, no); -} - -#endif - -// ------------------------------ Operator overloads (internal-only if float) - -template -HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { - return And(a, b); -} - -template -HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { - return Or(a, b); -} - -template -HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { - return Xor(a, b); -} - -// ------------------------------ PopulationCount - -// 8/16 require BITALG, 32/64 require VPOPCNTDQ. -#if HWY_TARGET <= HWY_AVX3_DL - -#ifdef HWY_NATIVE_POPCNT -#undef HWY_NATIVE_POPCNT -#else -#define HWY_NATIVE_POPCNT -#endif - -namespace detail { - -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, - Vec128 v) { - return Vec128{_mm_popcnt_epi8(v.raw)}; -} -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, - Vec128 v) { - return Vec128{_mm_popcnt_epi16(v.raw)}; -} -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, - Vec128 v) { - return Vec128{_mm_popcnt_epi32(v.raw)}; -} -template -HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, - Vec128 v) { - return Vec128{_mm_popcnt_epi64(v.raw)}; -} - -} // namespace detail - -template -HWY_API Vec128 PopulationCount(Vec128 v) { - return detail::PopulationCount(hwy::SizeTag(), v); -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -// ================================================== SIGN - -// ------------------------------ Neg - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_INLINE Vec128 Neg(hwy::FloatTag /*tag*/, const Vec128 v) { - return Xor(v, SignBit(DFromV())); -} - -template -HWY_INLINE Vec128 Neg(hwy::SpecialTag /*tag*/, const Vec128 v) { - return Xor(v, SignBit(DFromV())); -} - -template -HWY_INLINE Vec128 Neg(hwy::SignedTag /*tag*/, const Vec128 v) { - return Zero(DFromV()) - v; -} - -} // namespace detail - -template -HWY_INLINE Vec128 Neg(const Vec128 v) { - return detail::Neg(hwy::TypeTag(), v); -} - -// ------------------------------ Floating-point Abs -template -HWY_API Vec128 Abs(const Vec128 v) { - const DFromV d; - const RebindToSigned di; - using TI = TFromD; - return v & BitCast(d, Set(di, static_cast(~SignMask()))); -} - -// ------------------------------ CopySign -// Generic for all vector lengths. -template -HWY_API V CopySign(const V magn, const V sign) { - static_assert(IsFloat>(), "Only makes sense for floating-point"); - - const DFromV d; - const auto msb = SignBit(d); - - // Truth table for msb, magn, sign | bitwise msb ? sign : mag - // 0 0 0 | 0 - // 0 0 1 | 0 - // 0 1 0 | 1 - // 0 1 1 | 1 - // 1 0 0 | 0 - // 1 0 1 | 1 - // 1 1 0 | 0 - // 1 1 1 | 1 - return BitwiseIfThenElse(msb, sign, magn); -} - -// ------------------------------ CopySignToAbs -// Generic for all vector lengths. -template -HWY_API V CopySignToAbs(const V abs, const V sign) { - const DFromV d; - return OrAnd(abs, SignBit(d), sign); -} - -// ================================================== MASK - -#if HWY_TARGET <= HWY_AVX3 - -// ------------------------------ IfThenElse - -// Returns mask ? b : a. - -namespace detail { - -// Templates for signed/unsigned integer of a particular size. 
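// Illustrative user-side sketch of mask-based selection with the ops above
// (not authoritative; assumes a HWY_NAMESPACE target region with signed
// 32-bit lanes; values are only examples):
//   const Full128<int32_t> d;
//   const auto v = Iota(d, -2);                       // -2, -1, 0, 1
//   const auto m = v < Zero(d);                       // true for negative lanes
//   const auto magnitude = IfThenElse(m, Neg(v), v);  // 2, 1, 0, 1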
-template -HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<1> /* tag */, - Mask128 mask, Vec128 yes, - Vec128 no) { - return Vec128{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<2> /* tag */, - Mask128 mask, Vec128 yes, - Vec128 no) { - return Vec128{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<4> /* tag */, - Mask128 mask, Vec128 yes, - Vec128 no) { - return Vec128{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<8> /* tag */, - Mask128 mask, Vec128 yes, - Vec128 no) { - return Vec128{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)}; -} - -} // namespace detail - -template -HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, - Vec128 no) { - return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); -} - -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 IfThenElse(Mask128 mask, - Vec128 yes, - Vec128 no) { - return Vec128{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API Vec128 IfThenElse(Mask128 mask, - Vec128 yes, Vec128 no) { - return Vec128{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)}; -} - -template -HWY_API Vec128 IfThenElse(Mask128 mask, - Vec128 yes, - Vec128 no) { - return Vec128{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)}; -} - -namespace detail { - -template -HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<1> /* tag */, - Mask128 mask, Vec128 yes) { - return Vec128{_mm_maskz_mov_epi8(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<2> /* tag */, - Mask128 mask, Vec128 yes) { - return Vec128{_mm_maskz_mov_epi16(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<4> /* tag */, - Mask128 mask, Vec128 yes) { - return Vec128{_mm_maskz_mov_epi32(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<8> /* tag */, - Mask128 mask, Vec128 yes) { - return Vec128{_mm_maskz_mov_epi64(mask.raw, yes.raw)}; -} - -} // namespace detail - -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { - return detail::IfThenElseZero(hwy::SizeTag(), mask, yes); -} - -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, - Vec128 yes) { - return Vec128{_mm_maskz_mov_ps(mask.raw, yes.raw)}; -} - -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, - Vec128 yes) { - return Vec128{_mm_maskz_mov_pd(mask.raw, yes.raw)}; -} - -namespace detail { - -template -HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<1> /* tag */, - Mask128 mask, Vec128 no) { - // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. 
- return Vec128{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<2> /* tag */, - Mask128 mask, Vec128 no) { - return Vec128{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<4> /* tag */, - Mask128 mask, Vec128 no) { - return Vec128{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<8> /* tag */, - Mask128 mask, Vec128 no) { - return Vec128{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; -} - -} // namespace detail - -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { - return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); -} - -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, - Vec128 no) { - return Vec128{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; -} - -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, - Vec128 no) { - return Vec128{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; -} - -// ------------------------------ Mask logical - -// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently. -#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) -#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \ - HWY_COMPILER_CLANG >= 800 -#define HWY_COMPILER_HAS_MASK_INTRINSICS 1 -#else -#define HWY_COMPILER_HAS_MASK_INTRINSICS 0 -#endif -#endif // HWY_COMPILER_HAS_MASK_INTRINSICS - -namespace detail { - -template -HWY_INLINE Mask128 And(hwy::SizeTag<1> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kand_mask16(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask16>(a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask128 And(hwy::SizeTag<2> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kand_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask128 And(hwy::SizeTag<4> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kand_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask128 And(hwy::SizeTag<8> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kand_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; -#endif -} - -template -HWY_INLINE Mask128 AndNot(hwy::SizeTag<1> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kandn_mask16(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask16>(~a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask128 AndNot(hwy::SizeTag<2> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kandn_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask128 AndNot(hwy::SizeTag<4> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kandn_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask128 AndNot(hwy::SizeTag<8> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kandn_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; -#endif -} - -template 
-HWY_INLINE Mask128 Or(hwy::SizeTag<1> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kor_mask16(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask16>(a.raw | b.raw)}; -#endif -} -template -HWY_INLINE Mask128 Or(hwy::SizeTag<2> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kor_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; -#endif -} -template -HWY_INLINE Mask128 Or(hwy::SizeTag<4> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kor_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; -#endif -} -template -HWY_INLINE Mask128 Or(hwy::SizeTag<8> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kor_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; -#endif -} - -template -HWY_INLINE Mask128 Xor(hwy::SizeTag<1> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kxor_mask16(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask16>(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask128 Xor(hwy::SizeTag<2> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kxor_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask128 Xor(hwy::SizeTag<4> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kxor_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask128 Xor(hwy::SizeTag<8> /*tag*/, const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kxor_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; -#endif -} - -template -HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, - const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kxnor_mask16(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; -#endif -} -template -HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<2> /*tag*/, - const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{_kxnor_mask8(a.raw, b.raw)}; -#else - return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; -#endif -} -template -HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, - const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; -#else - return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; -#endif -} -template -HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, - const Mask128 a, - const Mask128 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask128{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)}; -#else - return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)}; -#endif -} - -} // namespace detail - -template -HWY_API Mask128 And(const Mask128 a, Mask128 b) { - return detail::And(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { - return detail::AndNot(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask128 Or(const Mask128 a, Mask128 b) { - return detail::Or(hwy::SizeTag(), a, 
b); -} - -template -HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { - return detail::Xor(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask128 Not(const Mask128 m) { - // Flip only the valid bits. - // TODO(janwas): use _knot intrinsics if N >= 8. - return Xor(m, Mask128::FromBits((1ull << N) - 1)); -} - -template -HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { - return detail::ExclusiveNeither(hwy::SizeTag(), a, b); -} - -#else // AVX2 or below - -// ------------------------------ Mask - -// Mask and Vec are the same (true = FF..FF). -template -HWY_API Mask128 MaskFromVec(const Vec128 v) { - return Mask128{v.raw}; -} - -template -using MFromD = decltype(MaskFromVec(VFromD())); - -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{v.raw}; -} - -// Generic for all vector lengths. -template -HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { - return VecFromMask(v); -} - -#if HWY_TARGET >= HWY_SSSE3 - -// mask ? yes : no -template -HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, - Vec128 no) { - const auto vmask = VecFromMask(DFromV(), mask); - return Or(And(vmask, yes), AndNot(vmask, no)); -} - -#else // HWY_TARGET < HWY_SSSE3 - -// mask ? yes : no -template -HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, - Vec128 no) { - return Vec128{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)}; -} -template -HWY_API Vec128 IfThenElse(Mask128 mask, - Vec128 yes, Vec128 no) { - return Vec128{_mm_blendv_ps(no.raw, yes.raw, mask.raw)}; -} -template -HWY_API Vec128 IfThenElse(Mask128 mask, - Vec128 yes, - Vec128 no) { - return Vec128{_mm_blendv_pd(no.raw, yes.raw, mask.raw)}; -} - -#endif // HWY_TARGET >= HWY_SSSE3 - -// mask ? yes : 0 -template -HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { - return yes & VecFromMask(DFromV(), mask); -} - -// mask ? 
0 : no -template -HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { - return AndNot(VecFromMask(DFromV(), mask), no); -} - -// ------------------------------ Mask logical - -template -HWY_API Mask128 Not(const Mask128 m) { - const Simd d; - return MaskFromVec(Not(VecFromMask(d, m))); -} - -template -HWY_API Mask128 And(const Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Or(const Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { - const Simd d; - return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ ShiftLeft - -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{_mm_slli_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{_mm_slli_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{_mm_slli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{_mm_slli_epi16(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{_mm_slli_epi32(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - return Vec128{_mm_slli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec128 ShiftLeft(const Vec128 v) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. - const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; - return kBits == 1 - ? (v + v) - : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); -} - -// ------------------------------ ShiftRight - -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{_mm_srli_epi16(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{_mm_srli_epi32(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{_mm_srli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. - const Vec128 shifted{ - ShiftRight(Vec128{v.raw}).raw}; - return shifted & Set(d8, 0xFF >> kBits); -} - -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{_mm_srai_epi16(v.raw, kBits)}; -} -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - return Vec128{_mm_srai_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec128 ShiftRight(const Vec128 v) { - const DFromV di; - const RebindToUnsigned du; - const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); - const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// i64 is implemented after BroadcastSignBit. - -// ================================================== MEMORY (1) - -// Clang static analysis claims the memory immediately after a partial vector -// store is uninitialized, and also flags the input to partial loads (at least -// for loadl_pd) as "garbage". This is a false alarm because msan does not -// raise errors. 
We work around this by using CopyBytes instead of intrinsics, -// but only for the analyzer to avoid potentially bad code generation. -// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7. -#ifndef HWY_SAFE_PARTIAL_LOAD_STORE -#if defined(__clang_analyzer__) || \ - (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) -#define HWY_SAFE_PARTIAL_LOAD_STORE 1 -#else -#define HWY_SAFE_PARTIAL_LOAD_STORE 0 -#endif -#endif // HWY_SAFE_PARTIAL_LOAD_STORE - -// ------------------------------ Load - -template -HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) { - return VFromD{_mm_load_si128(reinterpret_cast(aligned))}; -} -// Generic for all vector lengths greater than or equal to 16 bytes. -template -HWY_API VFromD Load(D d, const bfloat16_t* HWY_RESTRICT aligned) { - const RebindToUnsigned du; - return BitCast(d, Load(du, reinterpret_cast(aligned))); -} -template -HWY_API Vec128 Load(D d, const float16_t* HWY_RESTRICT aligned) { -#if HWY_HAVE_FLOAT16 - return Vec128{_mm_load_ph(aligned)}; -#else - const RebindToUnsigned du; - return BitCast(d, Load(du, reinterpret_cast(aligned))); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec128 Load(D /* tag */, const float* HWY_RESTRICT aligned) { - return Vec128{_mm_load_ps(aligned)}; -} -template -HWY_API Vec128 Load(D /* tag */, const double* HWY_RESTRICT aligned) { - return Vec128{_mm_load_pd(aligned)}; -} - -template -HWY_API VFromD LoadU(D /* tag */, const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_loadu_si128(reinterpret_cast(p))}; -} -// Generic for all vector lengths greater than or equal to 16 bytes. -template -HWY_API VFromD LoadU(D d, const bfloat16_t* HWY_RESTRICT p) { - const RebindToUnsigned du; - return BitCast(d, LoadU(du, reinterpret_cast(p))); -} -template -HWY_API Vec128 LoadU(D d, const float16_t* HWY_RESTRICT p) { -#if HWY_HAVE_FLOAT16 - (void)d; - return Vec128{_mm_loadu_ph(p)}; -#else - const RebindToUnsigned du; - return BitCast(d, LoadU(du, reinterpret_cast(p))); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec128 LoadU(D /* tag */, const float* HWY_RESTRICT p) { - return Vec128{_mm_loadu_ps(p)}; -} -template -HWY_API Vec128 LoadU(D /* tag */, const double* HWY_RESTRICT p) { - return Vec128{_mm_loadu_pd(p)}; -} - -template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t -#if HWY_SAFE_PARTIAL_LOAD_STORE - __m128i v = _mm_setzero_si128(); - CopyBytes<8>(p, &v); // not same size -#else - const __m128i v = _mm_loadl_epi64(reinterpret_cast(p)); -#endif - return BitCast(d, VFromD{v}); -} - -template -HWY_API Vec64 Load(D /* tag */, const float* HWY_RESTRICT p) { -#if HWY_SAFE_PARTIAL_LOAD_STORE - __m128 v = _mm_setzero_ps(); - CopyBytes<8>(p, &v); // not same size - return Vec64{v}; -#else - const __m128 hi = _mm_setzero_ps(); - return Vec64{_mm_loadl_pi(hi, reinterpret_cast(p))}; -#endif -} - -template -HWY_API Vec64 Load(D /* tag */, const double* HWY_RESTRICT p) { -#if HWY_SAFE_PARTIAL_LOAD_STORE - __m128d v = _mm_setzero_pd(); - CopyBytes<8>(p, &v); // not same size - return Vec64{v}; -#else - return Vec64{_mm_load_sd(p)}; -#endif -} - -template -HWY_API Vec32 Load(D /* tag */, const float* HWY_RESTRICT p) { -#if HWY_SAFE_PARTIAL_LOAD_STORE - __m128 v = _mm_setzero_ps(); - CopyBytes<4>(p, &v); // not same size - return Vec32{v}; -#else - return Vec32{_mm_load_ss(p)}; -#endif -} - -// Any <= 32 bit except -template -HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - // 
Clang ArgumentPromotionPass seems to break this code. We can unpoison - // before SetTableIndices -> LoadU -> Load and the memory is poisoned again. - detail::MaybeUnpoison(p, Lanes(d)); - -#if HWY_SAFE_PARTIAL_LOAD_STORE - __m128i v = Zero(Full128>()).raw; - CopyBytes(p, &v); // not same size as VFromD -#else - int32_t bits = 0; - CopyBytes(p, &bits); // not same size as VFromD - const __m128i v = _mm_cvtsi32_si128(bits); -#endif - return BitCast(d, VFromD{v}); -} - -// For < 128 bit, LoadU == Load. -template -HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { - return Load(d, p); -} - -// 128-bit SIMD => nothing to duplicate, same as an unaligned load. -template -HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { - return LoadU(d, p); -} - -// ------------------------------ Store - -template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { - _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); -} -// Generic for all vector lengths greater than or equal to 16 bytes. -template -HWY_API void Store(VFromD v, D d, bfloat16_t* HWY_RESTRICT aligned) { - const RebindToUnsigned du; - Store(BitCast(du, v), du, reinterpret_cast(aligned)); -} -template -HWY_API void Store(Vec128 v, D d, float16_t* HWY_RESTRICT aligned) { -#if HWY_HAVE_FLOAT16 - (void)d; - _mm_store_ph(aligned, v.raw); -#else - const RebindToUnsigned du; - Store(BitCast(du, v), du, reinterpret_cast(aligned)); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API void Store(Vec128 v, D /* tag */, float* HWY_RESTRICT aligned) { - _mm_store_ps(aligned, v.raw); -} -template -HWY_API void Store(Vec128 v, D /* tag */, - double* HWY_RESTRICT aligned) { - _mm_store_pd(aligned, v.raw); -} - -template -HWY_API void StoreU(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw); -} -// Generic for all vector lengths greater than or equal to 16 bytes. 
-template -HWY_API void StoreU(VFromD v, D d, bfloat16_t* HWY_RESTRICT p) { - const RebindToUnsigned du; - StoreU(BitCast(du, v), du, reinterpret_cast(p)); -} -template -HWY_API void StoreU(Vec128 v, D d, float16_t* HWY_RESTRICT p) { -#if HWY_HAVE_FLOAT16 - (void)d; - _mm_storeu_ph(p, v.raw); -#else - const RebindToUnsigned du; - StoreU(BitCast(du, v), du, reinterpret_cast(p)); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, float* HWY_RESTRICT p) { - _mm_storeu_ps(p, v.raw); -} -template -HWY_API void StoreU(Vec128 v, D /* tag */, double* HWY_RESTRICT p) { - _mm_storeu_pd(p, v.raw); -} - -template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { -#if HWY_SAFE_PARTIAL_LOAD_STORE - (void)d; - CopyBytes<8>(&v, p); // not same size -#else - const RebindToUnsigned du; // for float16_t - _mm_storel_epi64(reinterpret_cast<__m128i*>(p), BitCast(du, v).raw); -#endif -} -template -HWY_API void Store(Vec64 v, D /* tag */, float* HWY_RESTRICT p) { -#if HWY_SAFE_PARTIAL_LOAD_STORE - CopyBytes<8>(&v, p); // not same size -#else - _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw); -#endif -} -template -HWY_API void Store(Vec64 v, D /* tag */, double* HWY_RESTRICT p) { -#if HWY_SAFE_PARTIAL_LOAD_STORE - CopyBytes<8>(&v, p); // not same size -#else - _mm_storel_pd(p, v.raw); -#endif -} - -// Any <= 32 bit except -template -HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { - CopyBytes(&v, p); // not same size -} -template -HWY_API void Store(Vec32 v, D /* tag */, float* HWY_RESTRICT p) { -#if HWY_SAFE_PARTIAL_LOAD_STORE - CopyBytes<4>(&v, p); // not same size -#else - _mm_store_ss(p, v.raw); -#endif -} - -// For < 128 bit, StoreU == Store. -template -HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { - Store(v, d, p); -} - -// ================================================== SWIZZLE (1) - -// ------------------------------ TableLookupBytes -template -HWY_API Vec128 TableLookupBytes(const Vec128 bytes, - const Vec128 from) { -#if HWY_TARGET == HWY_SSE2 -#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) - typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16))); - return Vec128{reinterpret_cast::type>( - __builtin_shuffle(reinterpret_cast(bytes.raw), - reinterpret_cast(from.raw)))}; -#else - const DFromV d; - const Repartition du8; - const Full128 du8_full; - - const DFromV d_bytes; - const Repartition du8_bytes; - - alignas(16) uint8_t result_bytes[16]; - alignas(16) uint8_t u8_bytes[16]; - alignas(16) uint8_t from_bytes[16]; - - Store(Vec128{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes); - Store(Vec128{BitCast(du8, from).raw}, du8_full, from_bytes); - - for (int i = 0; i < 16; i++) { - result_bytes[i] = u8_bytes[from_bytes[i] & 15]; - } - - return BitCast(d, VFromD{Load(du8_full, result_bytes).raw}); -#endif -#else // SSSE3 or newer - return Vec128{_mm_shuffle_epi8(bytes.raw, from.raw)}; -#endif -} - -// ------------------------------ TableLookupBytesOr0 -// For all vector widths; x86 anyway zeroes if >= 0x80 on SSSE3/SSE4/AVX2/AVX3 -template -HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { -#if HWY_TARGET == HWY_SSE2 - const DFromV d; - const Repartition di8; - - const auto di8_from = BitCast(di8, from); - return BitCast(d, IfThenZeroElse(di8_from < Zero(di8), - TableLookupBytes(bytes, di8_from))); -#else - return TableLookupBytes(bytes, from); -#endif -} - -// ------------------------------ Shuffles (ShiftRight, TableLookupBytes) - -// Notation: let Vec128 have lanes 3,2,1,0 (0 is 
least-significant). -// Shuffle0321 rotates one lane to the right (the previous least-significant -// lane is now most-significant). These could also be implemented via -// CombineShiftRightBytes but the shuffle_abcd notation is more convenient. - -// Swap 32-bit halves in 64-bit halves. -template -HWY_API Vec128 Shuffle2301(const Vec128 v) { - static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{_mm_shuffle_epi32(v.raw, 0xB1)}; -} -template -HWY_API Vec128 Shuffle2301(const Vec128 v) { - static_assert(N == 2 || N == 4, "Does not make sense for N=1"); - return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0xB1)}; -} - -// These are used by generic_ops-inl to implement LoadInterleaved3. As with -// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output -// comes from the first argument. -namespace detail { - -template -HWY_API Vec32 ShuffleTwo2301(const Vec32 a, const Vec32 b) { - const DFromV d; - const Twice d2; - const auto ba = Combine(d2, b, a); -#if HWY_TARGET == HWY_SSE2 - Vec32 ba_shuffled{ - _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; - return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled))); -#else - alignas(16) const T kShuffle[8] = {1, 0, 7, 6}; - return Vec32{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; -#endif -} -template -HWY_API Vec64 ShuffleTwo2301(const Vec64 a, const Vec64 b) { - const DFromV d; - const Twice d2; - const auto ba = Combine(d2, b, a); -#if HWY_TARGET == HWY_SSE2 - Vec64 ba_shuffled{ - _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; - return Vec64{ - _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))}; -#else - alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c}; - return Vec64{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; -#endif -} -template -HWY_API Vec128 ShuffleTwo2301(const Vec128 a, const Vec128 b) { - const DFromV d; - const RebindToFloat df; - constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); - return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, - BitCast(df, b).raw, m)}); -} - -template -HWY_API Vec32 ShuffleTwo1230(const Vec32 a, const Vec32 b) { - const DFromV d; -#if HWY_TARGET == HWY_SSE2 - const auto zero = Zero(d); - const Rebind di16; - const Vec32 a_shuffled{_mm_shufflelo_epi16( - _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; - const Vec32 b_shuffled{_mm_shufflelo_epi16( - _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; - const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); - return Vec32{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; -#else - const Twice d2; - const auto ba = Combine(d2, b, a); - alignas(16) const T kShuffle[8] = {0, 3, 6, 5}; - return Vec32{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; -#endif -} -template -HWY_API Vec64 ShuffleTwo1230(const Vec64 a, const Vec64 b) { - const DFromV d; -#if HWY_TARGET == HWY_SSE2 - const Vec32 a_shuffled{ - _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))}; - const Vec32 b_shuffled{ - _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))}; - return Combine(d, b_shuffled, a_shuffled); -#else - const Twice d2; - const auto ba = Combine(d2, b, a); - alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a}; - return Vec64{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; -#endif -} -template -HWY_API Vec128 ShuffleTwo1230(const Vec128 a, const Vec128 b) { - const DFromV d; - const RebindToFloat df; - constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); - return BitCast(d, 
Vec128{_mm_shuffle_ps(BitCast(df, a).raw, - BitCast(df, b).raw, m)}); -} - -template -HWY_API Vec32 ShuffleTwo3012(const Vec32 a, const Vec32 b) { - const DFromV d; -#if HWY_TARGET == HWY_SSE2 - const auto zero = Zero(d); - const Rebind di16; - const Vec32 a_shuffled{_mm_shufflelo_epi16( - _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; - const Vec32 b_shuffled{_mm_shufflelo_epi16( - _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; - const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); - return Vec32{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; -#else - const Twice d2; - const auto ba = Combine(d2, b, a); - alignas(16) const T kShuffle[8] = {2, 1, 4, 7}; - return Vec32{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; -#endif -} -template -HWY_API Vec64 ShuffleTwo3012(const Vec64 a, const Vec64 b) { - const DFromV d; -#if HWY_TARGET == HWY_SSE2 - const Vec32 a_shuffled{ - _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))}; - const Vec32 b_shuffled{ - _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))}; - return Combine(d, b_shuffled, a_shuffled); -#else - const Twice d2; - const auto ba = Combine(d2, b, a); - alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e}; - return Vec64{TableLookupBytes(ba, Load(d2, kShuffle)).raw}; -#endif -} -template -HWY_API Vec128 ShuffleTwo3012(const Vec128 a, const Vec128 b) { - const DFromV d; - const RebindToFloat df; - constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); - return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, - BitCast(df, b).raw, m)}); -} - -} // namespace detail - -// Swap 64-bit halves -HWY_API Vec128 Shuffle1032(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec128 Shuffle1032(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec128 Shuffle1032(const Vec128 v) { - return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x4E)}; -} -HWY_API Vec128 Shuffle01(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec128 Shuffle01(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec128 Shuffle01(const Vec128 v) { - return Vec128{_mm_shuffle_pd(v.raw, v.raw, 1)}; -} - -// Rotate right 32 bits -HWY_API Vec128 Shuffle0321(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x39)}; -} -HWY_API Vec128 Shuffle0321(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x39)}; -} -HWY_API Vec128 Shuffle0321(const Vec128 v) { - return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x39)}; -} -// Rotate left 32 bits -HWY_API Vec128 Shuffle2103(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x93)}; -} -HWY_API Vec128 Shuffle2103(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x93)}; -} -HWY_API Vec128 Shuffle2103(const Vec128 v) { - return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x93)}; -} - -// Reverse -HWY_API Vec128 Shuffle0123(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x1B)}; -} -HWY_API Vec128 Shuffle0123(const Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, 0x1B)}; -} -HWY_API Vec128 Shuffle0123(const Vec128 v) { - return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x1B)}; -} - -// ================================================== COMPARE - -#if HWY_TARGET <= HWY_AVX3 - -// Comparisons set a mask bit to 1 if the condition is true, else 0. 
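// Illustrative sketch: on AVX3 targets a Mask128 is a compact bit mask
// (__mmask8/__mmask16) rather than a full vector, but user code stays
// target-agnostic (not authoritative; assumes a HWY_NAMESPACE target region;
// values are only examples):
//   const Full128<float> d;
//   const auto v = Iota(d, 0.0f);             // 0, 1, 2, 3
//   const auto m = v > Set(d, 1.5f);          // true in lanes 2 and 3
//   const size_t n = CountTrue(d, m);         // 2
//   const auto clamped = IfThenElse(m, Set(d, 1.5f), v);  // 0, 1, 1.5, 1.5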
- -// ------------------------------ MaskFromVec - -namespace detail { - -template -HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<1> /*tag*/, - const Vec128 v) { - return Mask128{_mm_movepi8_mask(v.raw)}; -} -template -HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<2> /*tag*/, - const Vec128 v) { - return Mask128{_mm_movepi16_mask(v.raw)}; -} -template -HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<4> /*tag*/, - const Vec128 v) { - return Mask128{_mm_movepi32_mask(v.raw)}; -} -template -HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<8> /*tag*/, - const Vec128 v) { - return Mask128{_mm_movepi64_mask(v.raw)}; -} - -} // namespace detail - -template -HWY_API Mask128 MaskFromVec(const Vec128 v) { - return detail::MaskFromVec(hwy::SizeTag(), v); -} -// There do not seem to be native floating-point versions of these instructions. -template -HWY_API Mask128 MaskFromVec(const Vec128 v) { - const RebindToSigned> di; - return Mask128{MaskFromVec(BitCast(di, v)).raw}; -} -template -HWY_API Mask128 MaskFromVec(const Vec128 v) { - const RebindToSigned> di; - return Mask128{MaskFromVec(BitCast(di, v)).raw}; -} - -template -using MFromD = decltype(MaskFromVec(VFromD())); - -// ------------------------------ VecFromMask - -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{_mm_movm_epi8(v.raw)}; -} - -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{_mm_movm_epi16(v.raw)}; -} - -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{_mm_movm_epi32(v.raw)}; -} - -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{_mm_movm_epi64(v.raw)}; -} - -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{_mm_castsi128_ph(_mm_movm_epi16(v.raw))}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{_mm_castsi128_ps(_mm_movm_epi32(v.raw))}; -} - -template -HWY_API Vec128 VecFromMask(const Mask128 v) { - return Vec128{_mm_castsi128_pd(_mm_movm_epi64(v.raw))}; -} - -// Generic for all vector lengths. 
-template -HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { - return VecFromMask(v); -} - -// ------------------------------ RebindMask (MaskFromVec) - -template -HWY_API MFromD RebindMask(DTo /* tag */, Mask128 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - return MFromD{m.raw}; -} - -// ------------------------------ TestBit - -namespace detail { - -template -HWY_INLINE Mask128 TestBit(hwy::SizeTag<1> /*tag*/, const Vec128 v, - const Vec128 bit) { - return Mask128{_mm_test_epi8_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask128 TestBit(hwy::SizeTag<2> /*tag*/, const Vec128 v, - const Vec128 bit) { - return Mask128{_mm_test_epi16_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask128 TestBit(hwy::SizeTag<4> /*tag*/, const Vec128 v, - const Vec128 bit) { - return Mask128{_mm_test_epi32_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask128 TestBit(hwy::SizeTag<8> /*tag*/, const Vec128 v, - const Vec128 bit) { - return Mask128{_mm_test_epi64_mask(v.raw, bit.raw)}; -} - -} // namespace detail - -template -HWY_API Mask128 TestBit(const Vec128 v, const Vec128 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return detail::TestBit(hwy::SizeTag(), v, bit); -} - -// ------------------------------ Equality - -template -HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpeq_epi8_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpeq_epi16_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpeq_epi32_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpeq_epi64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator==(Vec128 a, Vec128 b) { - return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -// ------------------------------ Inequality - -template -HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpneq_epi8_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpneq_epi16_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpneq_epi32_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { - return Mask128{_mm_cmpneq_epi64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { - return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} - -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} - -// ------------------------------ Strict inequality - -// Signed/float < -template -HWY_API Mask128 operator>(Vec128 a, Vec128 b) { - return Mask128{_mm_cmpgt_epi8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epi16_mask(a.raw, 
b.raw)}; -} -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epi32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epi64_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epu8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epu16_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epu32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epu64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator>(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator>(Vec128 a, Vec128 b) { - return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} -template -HWY_API Mask128 operator>(Vec128 a, Vec128 b) { - return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} - -// ------------------------------ Weak inequality - -#if HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { - return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} - -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epi8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epi16_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epi32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epi64_mask(a.raw, b.raw)}; -} - -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epu8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epu16_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epu32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator>=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_epu64_mask(a.raw, b.raw)}; -} - -#else // AVX2 or below - -// Comparisons fill a lane with 1-bits if the condition is true, else 0. 
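// Illustrative sketch: on SSE2..AVX2, a mask is stored as a vector whose true
// lanes are all-ones, so MaskFromVec/VecFromMask are essentially free (not
// authoritative; assumes a HWY_NAMESPACE target region; names and values are
// only examples):
//   const Full128<uint8_t> d;
//   const auto v = Iota(d, uint8_t{'0' - 1});
//   const auto m = And(v >= Set(d, uint8_t{'0'}), v <= Set(d, uint8_t{'9'}));
//   const auto ones_where_digit = VecFromMask(d, m);  // 0xFF in digit lanes
//   const auto digits_or_zero = IfThenElseZero(m, v);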
- -template -HWY_API MFromD RebindMask(DTo dto, Mask128 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - const Simd d; - return MaskFromVec(BitCast(dto, VecFromMask(d, m))); -} - -template -HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return (v & bit) == bit; -} - -// ------------------------------ Equality - -// Unsigned -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpeq_epi8(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpeq_epi16(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpeq_epi32(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - const DFromV d64; - const RepartitionToNarrow d32; - const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); - const auto cmp64 = cmp32 & Shuffle2301(cmp32); - return MaskFromVec(BitCast(d64, cmp64)); -#else - return Mask128{_mm_cmpeq_epi64(a.raw, b.raw)}; -#endif -} - -// Signed -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpeq_epi8(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpeq_epi16(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpeq_epi32(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(const Vec128 a, - const Vec128 b) { - // Same as signed ==; avoid duplicating the SSSE3 version. - const DFromV d; - RebindToUnsigned du; - return RebindMask(d, BitCast(du, a) == BitCast(du, b)); -} - -// Float -template -HWY_API Mask128 operator==(Vec128 a, Vec128 b) { - return Mask128{_mm_cmpeq_ps(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator==(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpeq_pd(a.raw, b.raw)}; -} - -// ------------------------------ Inequality - -// This cannot have T as a template argument, otherwise it is not more -// specialized than rewritten operator== in C++20, leading to compile -// errors: https://gcc.godbolt.org/z/xsrPhPvPT. 
-template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Not(a == b); -} - -template -HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { - return Mask128{_mm_cmpneq_ps(a.raw, b.raw)}; -} -template -HWY_API Mask128 operator!=(Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpneq_pd(a.raw, b.raw)}; -} - -// ------------------------------ Strict inequality - -namespace detail { - -template -HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epi8(a.raw, b.raw)}; -} -template -HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epi16(a.raw, b.raw)}; -} -template -HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_epi32(a.raw, b.raw)}; -} - -template -HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, - const Vec128 a, - const Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - // See https://stackoverflow.com/questions/65166174/: - const DFromV d; - const RepartitionToNarrow d32; - const Vec128 m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw}; - const Vec128 m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw}; - // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper: - // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0. - const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw; - // Duplicate upper to lower half. 
- return Mask128{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))}; -#else - return Mask128{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2 -#endif -} - -template -HWY_INLINE Mask128 Gt(hwy::UnsignedTag /*tag*/, Vec128 a, - Vec128 b) { - const DFromV du; - const RebindToSigned di; - const Vec128 msb = Set(du, (LimitsMax() >> 1) + 1); - const auto sa = BitCast(di, Xor(a, msb)); - const auto sb = BitCast(di, Xor(b, msb)); - return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); -} - -template -HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_ps(a.raw, b.raw)}; -} -template -HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpgt_pd(a.raw, b.raw)}; -} - -} // namespace detail - -template -HWY_INLINE Mask128 operator>(Vec128 a, Vec128 b) { - return detail::Gt(hwy::TypeTag(), a, b); -} - -// ------------------------------ Weak inequality - -namespace detail { -template -HWY_INLINE Mask128 Ge(hwy::SignedTag tag, Vec128 a, - Vec128 b) { - return Not(Gt(tag, b, a)); -} - -template -HWY_INLINE Mask128 Ge(hwy::UnsignedTag tag, Vec128 a, - Vec128 b) { - return Not(Gt(tag, b, a)); -} - -template -HWY_INLINE Mask128 Ge(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_ps(a.raw, b.raw)}; -} -template -HWY_INLINE Mask128 Ge(hwy::FloatTag /*tag*/, Vec128 a, - Vec128 b) { - return Mask128{_mm_cmpge_pd(a.raw, b.raw)}; -} - -} // namespace detail - -template -HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { - return detail::Ge(hwy::TypeTag(), a, b); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Reversed comparisons - -template -HWY_API Mask128 operator<(Vec128 a, Vec128 b) { - return b > a; -} - -template -HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { - return b >= a; -} - -// ------------------------------ Iota (Load) - -namespace detail { - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm_set_epi8( - static_cast(15), static_cast(14), static_cast(13), - static_cast(12), static_cast(11), static_cast(10), - static_cast(9), static_cast(8), static_cast(7), - static_cast(6), static_cast(5), static_cast(4), - static_cast(3), static_cast(2), static_cast(1), - static_cast(0))}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4}, - int16_t{3}, int16_t{2}, int16_t{1}, - int16_t{0})}; -} - -#if HWY_HAVE_FLOAT16 -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5}, - float16_t{4}, float16_t{3}, float16_t{2}, - float16_t{1}, float16_t{0})}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{ - _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm_set_epi64x(int64_t{1}, int64_t{0})}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm_set_pd(1.0, 0.0)}; -} - -#if HWY_COMPILER_MSVC -template -static HWY_INLINE V MaskOutVec128Iota(V v) { - const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)}; - return v & mask_out_mask; -} -template -static HWY_INLINE V MaskOutVec128Iota(V v) { -#if HWY_TARGET <= HWY_SSE4 - return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)}; -#else - const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)}; - return v & mask_out_mask; -#endif -} -template -static 
HWY_INLINE V MaskOutVec128Iota(V v) { - const DFromV d; - const Repartition df; - using VF = VFromD; - return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)}); -} -template -static HWY_INLINE V MaskOutVec128Iota(V v) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)}); -} -template -static HWY_INLINE V MaskOutVec128Iota(V v) { - return v; -} -#endif - -} // namespace detail - -template -HWY_API VFromD Iota(D d, const T2 first) { - const auto result_iota = - detail::Iota0(d) + Set(d, static_cast>(first)); -#if HWY_COMPILER_MSVC - return detail::MaskOutVec128Iota(result_iota); -#else - return result_iota; -#endif -} - -// ------------------------------ FirstN (Iota, Lt) - -template , HWY_IF_V_SIZE_LE_D(D, 16)> -HWY_API M FirstN(D d, size_t num) { -#if HWY_TARGET <= HWY_AVX3 - constexpr size_t kN = MaxLanes(d); -#if HWY_ARCH_X86_64 - const uint64_t all = (1ull << kN) - 1; - // BZHI only looks at the lower 8 bits of n! - return M::FromBits((num > 255) ? all : _bzhi_u64(all, num)); -#else - const uint32_t all = static_cast((1ull << kN) - 1); - // BZHI only looks at the lower 8 bits of n! - return M::FromBits((num > 255) ? all - : _bzhi_u32(all, static_cast(num))); -#endif // HWY_ARCH_X86_64 -#else // HWY_TARGET > HWY_AVX3 - const RebindToSigned di; // Signed comparisons are cheaper. - using TI = TFromD; - return RebindMask(d, detail::Iota0(di) < Set(di, static_cast(num))); -#endif // HWY_TARGET <= HWY_AVX3 -} - -// ================================================== MEMORY (2) - -// ------------------------------ MaskedLoad - -#if HWY_TARGET <= HWY_AVX3 - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_maskz_loadu_epi8(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - return BitCast(d, VFromD{_mm_maskz_loadu_epi16(m.raw, p)}); -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_maskz_loadu_epi32(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_maskz_loadu_epi64(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const float* HWY_RESTRICT p) { - return VFromD{_mm_maskz_loadu_ps(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const double* HWY_RESTRICT p) { - return VFromD{_mm_maskz_loadu_pd(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_mask_loadu_epi8(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_mask_loadu_epi16(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_mask_loadu_epi32(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm_mask_loadu_epi64(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const float* HWY_RESTRICT p) { - return VFromD{_mm_mask_loadu_ps(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const double* HWY_RESTRICT p) { - return 
VFromD{_mm_mask_loadu_pd(v.raw, m.raw, p)}; -} - -#elif HWY_TARGET == HWY_AVX2 - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - auto p_p = reinterpret_cast(p); // NOLINT - return VFromD{_mm_maskload_epi32(p_p, m.raw)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - auto p_p = reinterpret_cast(p); // NOLINT - return VFromD{_mm_maskload_epi64(p_p, m.raw)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D d, const float* HWY_RESTRICT p) { - const RebindToSigned di; - return VFromD{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D d, const double* HWY_RESTRICT p) { - const RebindToSigned di; - return VFromD{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)}; -} - -// There is no maskload_epi8/16, so blend instead. -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - return IfThenElseZero(m, LoadU(d, p)); -} - -#else // <= SSE4 - -// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - return IfThenElseZero(m, LoadU(d, p)); -} - -#endif - -// ------------------------------ MaskedLoadOr - -#if HWY_TARGET > HWY_AVX3 // else: native - -// Generic for all vector lengths. -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - return IfThenElse(m, LoadU(d, p), v); -} - -#endif // HWY_TARGET > HWY_AVX3 - -// ------------------------------ LoadN - -#if HWY_TARGET <= HWY_AVX2 -#ifdef HWY_NATIVE_LOAD_N -#undef HWY_NATIVE_LOAD_N -#else -#define HWY_NATIVE_LOAD_N -#endif - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, - size_t max_lanes_to_load) { - const size_t num_of_lanes_to_load = - HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)); - const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> - d_full; - return ResizeBitCast( - d, MaskedLoad(FirstN(d_full, num_of_lanes_to_load), d_full, p)); -} - -#if HWY_TARGET > HWY_AVX3 -namespace detail { - -template > -HWY_INLINE VFromD AVX2UIF8Or16LoadLeadingN(VFromD /*load_mask*/, D /*d*/, - const T* HWY_RESTRICT /*p*/, - VFromD v_trailing) { - return v_trailing; -} - -template > -HWY_INLINE VFromD AVX2UIF8Or16LoadLeadingN(VFromD load_mask, D d, - const T* HWY_RESTRICT p, - VFromD v_trailing) { - using DI32 = Repartition; - const FixedTag di32_full; - - // ResizeBitCast of load_mask to di32 is okay below if - // d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past - // the first (lowest-index) lanes of load_mask.raw will have already been - // zeroed out - return ResizeBitCast( - d, IfNegativeThenElse( - ResizeBitCast(di32_full, load_mask), - MaskedLoad(MaskFromVec(ResizeBitCast(di32_full, load_mask)), - di32_full, reinterpret_cast(p)), - ResizeBitCast(di32_full, v_trailing))); -} - -template > -HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD /*load_mask*/, D d, - const T* HWY_RESTRICT p, - size_t num_of_lanes_to_load) { - return (num_of_lanes_to_load > 0) ? LoadU(d, p) : Zero(d); -} - -template > -HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD /*load_mask*/, D d, - const T* HWY_RESTRICT p, - size_t num_of_lanes_to_load) { - if (num_of_lanes_to_load > 1) { - return LoadU(d, p); - } else { - const FixedTag, 1> d1; - return (num_of_lanes_to_load == 1) ? 
ResizeBitCast(d, LoadU(d1, p)) - : Zero(d); - } -} - -template > -HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD load_mask, D d, - const T* HWY_RESTRICT p, - size_t num_of_lanes_to_load) { - const size_t trailing_n = num_of_lanes_to_load & 3; - if (trailing_n != 0) { - VFromD v_trailing = And(load_mask, Set(d, p[num_of_lanes_to_load - 1])); - - if ((trailing_n & 2) != 0) { - const Repartition di16; - int16_t i16_bits; - CopyBytes(p + num_of_lanes_to_load - trailing_n, - &i16_bits); - v_trailing = BitCast( - d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits), - BitCast(di16, v_trailing))); - } - - return v_trailing; - } else { - return Zero(d); - } -} - -template > -HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD load_mask, D d, - const T* HWY_RESTRICT p, - size_t num_of_lanes_to_load) { - if ((num_of_lanes_to_load & 1) != 0) { - return And(load_mask, Set(d, p[num_of_lanes_to_load - 1])); - } else { - return Zero(d); - } -} - -} // namespace detail - -template > -HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, size_t N) { - const size_t num_of_lanes_to_load = HWY_MIN(N, HWY_MAX_LANES_D(D)); - const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> - d_full; - - const auto load_mask = ResizeBitCast( - d, VecFromMask(d_full, FirstN(d_full, num_of_lanes_to_load))); - const auto v_trailing = - detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_of_lanes_to_load); - -#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD - if (__builtin_constant_p(num_of_lanes_to_load < (4 / sizeof(TFromD))) && - num_of_lanes_to_load < (4 / sizeof(TFromD))) { - return v_trailing; - } -#endif - - return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing); -} - -#endif // HWY_TARGET > HWY_AVX3 -#endif // HWY_TARGET <= HWY_AVX2 - -// ------------------------------ BlendedStore - -namespace detail { - -// There is no maskload_epi8/16 with which we could safely implement -// BlendedStore. Manual blending is also unsafe because loading a full vector -// that crosses the array end causes asan faults. Resort to scalar code; the -// caller should instead use memcpy, assuming m is FirstN(d, n). -template -HWY_API void ScalarMaskedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - const RebindToSigned di; // for testing mask if T=bfloat16_t. 
- using TI = TFromD; - alignas(16) TI buf[MaxLanes(d)]; - alignas(16) TI mask[MaxLanes(d)]; - Store(BitCast(di, v), di, buf); - Store(BitCast(di, VecFromMask(d, m)), di, mask); - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (mask[i]) { - CopySameSize(buf + i, p + i); - } - } -} -} // namespace detail - -#if HWY_TARGET <= HWY_AVX3 - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - _mm_mask_storeu_epi8(p, m.raw, v.raw); -} -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - _mm_mask_storeu_epi16(reinterpret_cast(p), RebindMask(du, m).raw, - BitCast(du, v).raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - _mm_mask_storeu_epi32(pi, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - _mm_mask_storeu_epi64(pi, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D, float* HWY_RESTRICT p) { - _mm_mask_storeu_ps(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D, double* HWY_RESTRICT p) { - _mm_mask_storeu_pd(p, m.raw, v.raw); -} - -#elif HWY_TARGET == HWY_AVX2 - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - detail::ScalarMaskedStore(v, m, d, p); -} - -namespace detail { - -template -HWY_INLINE void NativeBlendedStore(V v, M m, TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - _mm_maskstore_epi32(pi, m.raw, v.raw); -} - -template -HWY_INLINE void NativeBlendedStore(V v, M m, TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - _mm_maskstore_epi64(pi, m.raw, v.raw); -} - -template -HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) { - _mm_maskstore_ps(p, m.raw, v.raw); -} - -template -HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) { - _mm_maskstore_pd(p, m.raw, v.raw); -} - -} // namespace detail - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - const RebindToSigned di; - // For partial vectors, avoid writing other lanes by zeroing their mask. - if (d.MaxBytes() < 16) { - const Full128> dfull; - const Mask128> mfull{m.raw}; - m = MFromD{And(mfull, FirstN(dfull, MaxLanes(d))).raw}; - } - - // Float/double require, and unsigned ints tolerate, signed int masks. - detail::NativeBlendedStore(v, RebindMask(di, m), p); -} - -#else // <= SSE4 - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). 
- detail::ScalarMaskedStore(v, m, d, p); -} - -#endif // SSE4 - -// ================================================== ARITHMETIC - -// ------------------------------ Addition - -// Unsigned -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi16(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi32(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi64(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi16(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi32(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator+(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_add_pd(a.raw, b.raw)}; -} - -// ------------------------------ Subtraction - -// Unsigned -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(Vec128 a, - Vec128 b) { - return Vec128{_mm_sub_epi16(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_epi32(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_epi64(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_epi16(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_epi32(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator-(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sub_pd(a.raw, b.raw)}; -} - -// ------------------------------ SumsOf8 -template -HWY_API Vec128 SumsOf8(const Vec128 v) { - return Vec128{_mm_sad_epu8(v.raw, _mm_setzero_si128())}; -} - -#ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF -#undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF -#else -#define HWY_NATIVE_SUMS_OF_8_ABS_DIFF -#endif - -template -HWY_API Vec128 SumsOf8AbsDiff(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_sad_epu8(a.raw, b.raw)}; -} - -// ------------------------------ SaturatedAdd - -// Returns a + b clamped to the destination range. 
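// A minimal scalar sketch of that contract for the 8- and 16-bit lane types
// handled below (the helper is illustrative only, not part of the API): widen,
// add, then clamp to the lane type's range. For example, int8_t{100} plus
// int8_t{100} saturates to 127, and uint8_t{200} plus uint8_t{100} to 255.
template <typename T>
static inline T ScalarSaturatedAddSketch(T a, T b) {
  const int32_t wide = static_cast<int32_t>(a) + static_cast<int32_t>(b);
  const int32_t lo = static_cast<int32_t>(LimitsMin<T>());
  const int32_t hi = static_cast<int32_t>(LimitsMax<T>());
  return static_cast<T>(HWY_MIN(HWY_MAX(wide, lo), hi));
}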
- -// Unsigned -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_adds_epu8(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_adds_epu16(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_adds_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedAdd(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_adds_epi16(a.raw, b.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB -#undef HWY_NATIVE_I32_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I32_SATURATED_ADDSUB -#endif - -#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB -#undef HWY_NATIVE_I64_SATURATED_ADDSUB -#else -#define HWY_NATIVE_I64_SATURATED_ADDSUB -#endif - -template -HWY_API Vec128 SaturatedAdd(Vec128 a, - Vec128 b) { - const DFromV d; - const auto sum = a + b; - const auto overflow_mask = MaskFromVec( - Vec128{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); - const auto i32_max = Set(d, LimitsMax()); - const Vec128 overflow_result{_mm_mask_ternarylogic_epi32( - i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, sum); -} - -template -HWY_API Vec128 SaturatedAdd(Vec128 a, - Vec128 b) { - const DFromV d; - const auto sum = a + b; - const auto overflow_mask = MaskFromVec( - Vec128{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); - const auto i64_max = Set(d, LimitsMax()); - const Vec128 overflow_result{_mm_mask_ternarylogic_epi64( - i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, sum); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ SaturatedSub - -// Returns a - b clamped to the destination range. 
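// The i32/i64 overloads above (and their SaturatedSub counterparts below)
// detect overflow with a ternary-logic test on the sign bits: imm 0x42 is
// ~(a ^ b) & (a ^ sum), i.e. addition overflows iff the inputs share a sign
// that the sum does not, and imm 0x18 is (a ^ b) & (a ^ diff), i.e.
// subtraction overflows iff the signs differ and the difference takes b's
// sign. A scalar sketch of the same tests (helper names are illustrative):
static inline bool ScalarI32AddOverflows(int32_t a, int32_t b) {
  const uint32_t ua = static_cast<uint32_t>(a);
  const uint32_t ub = static_cast<uint32_t>(b);
  const uint32_t sum = ua + ub;  // wraps, matching the unsaturated vector add
  return ((~(ua ^ ub) & (ua ^ sum)) >> 31) != 0;
}
static inline bool ScalarI32SubOverflows(int32_t a, int32_t b) {
  const uint32_t ua = static_cast<uint32_t>(a);
  const uint32_t ub = static_cast<uint32_t>(b);
  const uint32_t diff = ua - ub;
  return (((ua ^ ub) & (ua ^ diff)) >> 31) != 0;
}
// On overflow the saturated result is LimitsMax() for non-negative a and
// LimitsMin() for negative a, which matches the masked ternary-logic above:
// lanes whose a is negative get the bitwise NOT (imm 0x55) of the max.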
- -// Unsigned -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_subs_epu8(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_subs_epu16(a.raw, b.raw)}; -} - -// Signed -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_subs_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec128 SaturatedSub(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_subs_epi16(a.raw, b.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API Vec128 SaturatedSub(Vec128 a, - Vec128 b) { - const DFromV d; - const auto diff = a - b; - const auto overflow_mask = MaskFromVec( - Vec128{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); - const auto i32_max = Set(d, LimitsMax()); - const Vec128 overflow_result{_mm_mask_ternarylogic_epi32( - i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, diff); -} - -template -HWY_API Vec128 SaturatedSub(Vec128 a, - Vec128 b) { - const DFromV d; - const auto diff = a - b; - const auto overflow_mask = MaskFromVec( - Vec128{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); - const auto i64_max = Set(d, LimitsMax()); - const Vec128 overflow_result{_mm_mask_ternarylogic_epi64( - i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, diff); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ AverageRound - -// Returns (a + b + 1) / 2 - -// Unsigned -template -HWY_API Vec128 AverageRound(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_avg_epu8(a.raw, b.raw)}; -} -template -HWY_API Vec128 AverageRound(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_avg_epu16(a.raw, b.raw)}; -} - -// ------------------------------ Integer multiplication - -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mullo_epi16(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mullo_epi16(a.raw, b.raw)}; -} - -// Returns the upper 16 bits of a * b in each lane. -template -HWY_API Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mulhi_epu16(a.raw, b.raw)}; -} -template -HWY_API Vec128 MulHigh(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mulhi_epi16(a.raw, b.raw)}; -} - -// Multiplies even lanes (0, 2 ..) and places the double-wide result into -// even and the upper half into its odd neighbor lane. 
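// A minimal scalar model of that lane mapping (helper name and fixed lane
// count are illustrative only): wide result lane i is the widened product of
// input lanes 2*i; MulOdd, further below, uses lanes 2*i + 1 instead.
static inline void ScalarMulEvenU32Sketch(const uint32_t a[4],
                                          const uint32_t b[4],
                                          uint64_t out[2]) {
  for (int i = 0; i < 2; ++i) {
    out[i] = static_cast<uint64_t>(a[2 * i]) * b[2 * i];
  }
}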
-template )> -HWY_API VFromD>> MulEven(V a, V b) { - const DFromV d; - const RepartitionToWide dw; - const auto lo8_mask = Set(dw, uint16_t{0x00FF}); - return And(ResizeBitCast(dw, a), lo8_mask) * - And(ResizeBitCast(dw, b), lo8_mask); -} - -template )> -HWY_API VFromD>> MulEven(V a, V b) { - const DFromV d; - const RepartitionToWide dw; - return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) * - ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b))); -} - -template )> -HWY_API VFromD>> MulEven(V a, V b) { - const DFromV d; - const RepartitionToWide dw; - const RepartitionToNarrow dw_as_d16; - - const auto lo = ResizeBitCast(dw, a * b); - const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b))); - return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); -} - -template -HWY_API Vec128 MulEven(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mul_epu32(a.raw, b.raw)}; -} - -template -HWY_API Vec128 MulEven(const Vec128 a, - const Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - const DFromV d; - const RepartitionToWide dw; - const RebindToUnsigned du; - - // p[i] = (((a[i] >> 31) * (a[i] >> 31)) << 64) + - // (((a[i] >> 31) * b[i]) << 32) + - // (((b[i] >> 31) * a[i]) << 32) + - // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) - - // ((a[i] >> 31) * (a[i] >> 31)) << 64 does not need to be computed as the - // lower 64 bits of ((a[i] >> 31) * (a[i] >> 31)) << 64 is zero. - - // (((a[i] >> 31) * b[i]) << 32) + (((b[i] >> 31) * a[i]) << 32) == - // -((((a[i] >> 31) & b[i]) + ((b[i] >> 31) & a[i])) << 32) - - // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) can be - // computed using MulEven(BitCast(du, a), BitCast(du, b)) - - const auto neg_p_hi = ShiftLeft<32>( - ResizeBitCast(dw, And(ShiftRight<31>(a), b) + And(ShiftRight<31>(b), a))); - const auto p_lo = BitCast(dw, MulEven(BitCast(du, a), BitCast(du, b))); - return p_lo - neg_p_hi; -#else - return Vec128{_mm_mul_epi32(a.raw, b.raw)}; -#endif -} - -template -HWY_API VFromD>> MulOdd(V a, V b) { - const DFromV d; - const RepartitionToWide dw; - return ShiftRight<8>(ResizeBitCast(dw, a)) * - ShiftRight<8>(ResizeBitCast(dw, b)); -} - -template )> -HWY_API VFromD>> MulOdd(V a, V b) { - const DFromV d; - const RepartitionToWide dw; - const RebindToUnsigned dw_u; - const RepartitionToNarrow dw_as_d16; - - const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b))); - const auto hi = ResizeBitCast(dw, MulHigh(a, b)); - return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); -} - -template )> -HWY_API VFromD>> MulOdd(V a, V b) { - return MulEven(DupOdd(a), DupOdd(b)); -} - -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency. - // 64-bit right shift would also work but also needs port 5, so no benefit. - // Notation: x=don't care, z=0. - const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1)); - const auto mullo_x2x0 = MulEven(a, b); - const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1)); - const auto mullo_x3x1 = - MulEven(Vec128{a_x3x1}, Vec128{b_x3x1}); - // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating - // the latter requires one more instruction or a constant. 
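  // At this point mullo_x2x0 holds the 64-bit products of lanes 0 and 2, and
  // mullo_x3x1 those of lanes 1 and 3. The (2, 0, 2, 0) shuffles below keep
  // only the low 32 bits of each product, and the final unpacklo interleaves
  // them back into lane order {p0, p1, p2, p3}.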
- const __m128i mul_20 = - _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0)); - const __m128i mul_31 = - _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0)); - return Vec128{_mm_unpacklo_epi32(mul_20, mul_31)}; -#else - return Vec128{_mm_mullo_epi32(a.raw, b.raw)}; -#endif -} - -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - // Same as unsigned; avoid duplicating the SSSE3 code. - const DFromV d; - const RebindToUnsigned du; - return BitCast(d, BitCast(du, a) * BitCast(du, b)); -} - -// ------------------------------ RotateRight (ShiftRight, Or) - -template -HWY_API Vec128 RotateRight(const Vec128 v) { - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - // AVX3 does not support 8/16-bit. - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -template -HWY_API Vec128 RotateRight(const Vec128 v) { - static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_ror_epi32(v.raw, kBits)}; -#else - if (kBits == 0) return v; - return Or(ShiftRight(v), ShiftLeft(v)); -#endif -} - -template -HWY_API Vec128 RotateRight(const Vec128 v) { - static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_ror_epi64(v.raw, kBits)}; -#else - if (kBits == 0) return v; - return Or(ShiftRight(v), ShiftLeft(v)); -#endif -} - -// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) - -template -HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - const DFromV d; - return VecFromMask(v < Zero(d)); -} - -template -HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - return ShiftRight<15>(v); -} - -template -HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - return ShiftRight<31>(v); -} - -template -HWY_API Vec128 BroadcastSignBit(const Vec128 v) { - const DFromV d; -#if HWY_TARGET <= HWY_AVX3 - (void)d; - return Vec128{_mm_srai_epi64(v.raw, 63)}; -#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4 - return VecFromMask(v < Zero(d)); -#else - // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift - // avoids generating a zero. - const RepartitionToNarrow d32; - const auto sign = ShiftRight<31>(BitCast(d32, v)); - return Vec128{ - _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; -#endif -} - -// ------------------------------ Integer Abs - -// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. 
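// The SSE2/MSVC fallback for int8_t below relies on an unsigned-min identity;
// a scalar sketch of it (helper name is illustrative): reinterpret as
// unsigned, then |x| == min(u, 0 - u). For x == -128 both candidates are 0x80,
// so the result wraps back to -128, the LimitsMax() + 1 case noted above.
static inline int8_t ScalarAbsViaUnsignedMin(int8_t x) {
  const uint8_t u = static_cast<uint8_t>(x);
  const uint8_t neg = static_cast<uint8_t>(0u - u);
  return static_cast<int8_t>(u < neg ? u : neg);
}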
-template -HWY_API Vec128 Abs(const Vec128 v) { -#if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2 - const DFromV d; - const RebindToUnsigned du; - const auto zero = Zero(du); - const auto v_as_u8 = BitCast(du, v); - return BitCast(d, Min(v_as_u8, zero - v_as_u8)); -#else - return Vec128{_mm_abs_epi8(v.raw)}; -#endif -} - -template -HWY_API Vec128 Abs(const Vec128 v) { -#if HWY_TARGET == HWY_SSE2 - const auto zero = Zero(DFromV()); - return Max(v, zero - v); -#else - return Vec128{_mm_abs_epi16(v.raw)}; -#endif -} - -template -HWY_API Vec128 Abs(const Vec128 v) { -#if HWY_TARGET <= HWY_SSSE3 - return Vec128{_mm_abs_epi32(v.raw)}; -#else - const auto zero = Zero(DFromV()); - return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -#endif -} - -template -HWY_API Vec128 Abs(const Vec128 v) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_abs_epi64(v.raw)}; -#else - const auto zero = Zero(DFromV()); - return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -#endif -} - -// GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL -// srli_epi64: the count should be unsigned int. Note that this is not the same -// as the Shift3264Count in x86_512-inl.h (GCC also requires int). -#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \ - (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) -using Shift64Count = int; -#else -// Assume documented behavior. Clang 12, GCC 14 and MSVC 14.28.29910 match this. -using Shift64Count = unsigned int; -#endif - -template -HWY_API Vec128 ShiftRight(const Vec128 v) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{ - _mm_srai_epi64(v.raw, static_cast(kBits))}; -#else - const DFromV di; - const RebindToUnsigned du; - const auto right = BitCast(di, ShiftRight(BitCast(du, v))); - const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); - return right | sign; -#endif -} - -// ------------------------------ ZeroIfNegative (BroadcastSignBit) -template -HWY_API Vec128 ZeroIfNegative(Vec128 v) { - static_assert(IsFloat(), "Only works for float"); - const DFromV d; -#if HWY_TARGET >= HWY_SSSE3 - const RebindToSigned di; - const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); -#else - const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS -#endif - return IfThenElse(mask, Zero(d), v); -} - -// ------------------------------ IfNegativeThenElse -template -HWY_API Vec128 IfNegativeThenElse(const Vec128 v, - const Vec128 yes, - const Vec128 no) { -// int8: IfThenElse only looks at the MSB on SSE4 or newer -#if HWY_TARGET <= HWY_SSE4 - const auto mask = MaskFromVec(v); -#else - const DFromV d; - const RebindToSigned di; - const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); -#endif - - return IfThenElse(mask, yes, no); -} - -template -HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, - Vec128 no) { - static_assert(IsSigned(), "Only works for signed/float"); - -// 16-bit: no native blendv on AVX2 or earlier, so copy sign to lower byte's -// MSB. 
-#if HWY_TARGET <= HWY_AVX3 - const auto mask = MaskFromVec(v); -#else - const DFromV d; - const RebindToSigned di; - const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); -#endif - - return IfThenElse(mask, yes, no); -} - -template -HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, - Vec128 no) { - static_assert(IsSigned(), "Only works for signed/float"); - const DFromV d; - -#if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 - // 32/64-bit: use float IfThenElse on SSE4/AVX2, which only looks at the MSB - // on SSE4 or later. - const RebindToFloat df; - const auto mask = MaskFromVec(BitCast(df, v)); - return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no))); -#else // SSE2, SSSE3, or AVX3 - -#if HWY_TARGET <= HWY_AVX3 - // No need to cast to float or broadcast sign bit on AVX3 as IfThenElse only - // looks at the MSB on AVX3 - (void)d; - const auto mask = MaskFromVec(v); -#else - const RebindToSigned di; - const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); -#endif - - return IfThenElse(mask, yes, no); -#endif -} - -// ------------------------------ ShiftLeftSame - -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_slli_epi16(v.raw, bits)}; - } -#endif - return Vec128{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_slli_epi32(v.raw, bits)}; - } -#endif - return Vec128{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_slli_epi64(v.raw, bits)}; - } -#endif - return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_slli_epi16(v.raw, bits)}; - } -#endif - return Vec128{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_slli_epi32(v.raw, bits)}; - } -#endif - return Vec128{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_slli_epi64(v.raw, bits)}; - } -#endif - return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. 
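  // There is no 8-bit shift instruction, so shift within 16-bit lanes and then
  // clear the bits that crossed over from the neighboring byte: for bits == 3,
  // the mask (0xFF << 3) & 0xFF == 0xF8 keeps exactly the surviving high bits.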
- const Vec128 shifted{ - ShiftLeftSame(Vec128>{v.raw}, bits).raw}; - return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); -} - -// ------------------------------ ShiftRightSame (BroadcastSignBit) - -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_srli_epi16(v.raw, bits)}; - } -#endif - return Vec128{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_srli_epi32(v.raw, bits)}; - } -#endif - return Vec128{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_srli_epi64(v.raw, bits)}; - } -#endif - return Vec128{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec128 ShiftRightSame(Vec128 v, - const int bits) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. - const Vec128 shifted{ - ShiftRightSame(Vec128{v.raw}, bits).raw}; - return shifted & Set(d8, static_cast(0xFF >> bits)); -} - -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_srai_epi16(v.raw, bits)}; - } -#endif - return Vec128{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{_mm_srai_epi32(v.raw, bits)}; - } -#endif - return Vec128{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -template -HWY_API Vec128 ShiftRightSame(const Vec128 v, - const int bits) { -#if HWY_TARGET <= HWY_AVX3 -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec128{ - _mm_srai_epi64(v.raw, static_cast(bits))}; - } -#endif - return Vec128{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -#else - const DFromV di; - const RebindToUnsigned du; - const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); - const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); - return right | sign; -#endif -} - -template -HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { - const DFromV di; - const RebindToUnsigned du; - const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); - const auto shifted_sign = - BitCast(di, Set(du, static_cast(0x80 >> bits))); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// ------------------------------ Floating-point mul / div - -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator*(Vec128 a, - Vec128 b) { - return Vec128{_mm_mul_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator*(Vec128 a, Vec128 b) { - return Vec128{_mm_mul_ps(a.raw, b.raw)}; -} -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mul_ss(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mul_pd(a.raw, b.raw)}; -} -HWY_API Vec64 operator*(const Vec64 a, const Vec64 b) { - return Vec64{_mm_mul_sd(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator/(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_div_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 operator/(const Vec128 a, - const Vec128 b) { - return 
Vec128{_mm_div_ps(a.raw, b.raw)}; -} -HWY_API Vec128 operator/(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_div_ss(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator/(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_div_pd(a.raw, b.raw)}; -} -HWY_API Vec64 operator/(const Vec64 a, const Vec64 b) { - return Vec64{_mm_div_sd(a.raw, b.raw)}; -} - -// Approximate reciprocal -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 ApproximateReciprocal( - const Vec128 v) { - return Vec128{_mm_rcp_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { - return Vec128{_mm_rcp_ps(v.raw)}; -} -HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { - return Vec128{_mm_rcp_ss(v.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -#ifdef HWY_NATIVE_F64_APPROX_RECIP -#undef HWY_NATIVE_F64_APPROX_RECIP -#else -#define HWY_NATIVE_F64_APPROX_RECIP -#endif - -HWY_API Vec128 ApproximateReciprocal(Vec128 v) { - return Vec128{_mm_rcp14_pd(v.raw)}; -} -HWY_API Vec64 ApproximateReciprocal(Vec64 v) { - return Vec64{_mm_rcp14_sd(v.raw, v.raw)}; -} -#endif - -// Generic for all vector lengths. -template -HWY_API V AbsDiff(V a, V b) { - return Abs(a - b); -} - -// ------------------------------ Floating-point multiply-add variants - -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 MulAdd(Vec128 mul, - Vec128 x, - Vec128 add) { - return Vec128{_mm_fmadd_ph(mul.raw, x.raw, add.raw)}; -} - -template -HWY_API Vec128 NegMulAdd(Vec128 mul, - Vec128 x, - Vec128 add) { - return Vec128{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)}; -} - -template -HWY_API Vec128 MulSub(Vec128 mul, - Vec128 x, - Vec128 sub) { - return Vec128{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)}; -} - -template -HWY_API Vec128 NegMulSub(Vec128 mul, - Vec128 x, - Vec128 sub) { - return Vec128{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)}; -} - -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { -#if HWY_TARGET >= HWY_SSE4 - return mul * x + add; -#else - return Vec128{_mm_fmadd_ps(mul.raw, x.raw, add.raw)}; -#endif -} -template -HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, - Vec128 add) { -#if HWY_TARGET >= HWY_SSE4 - return mul * x + add; -#else - return Vec128{_mm_fmadd_pd(mul.raw, x.raw, add.raw)}; -#endif -} - -// Returns add - mul * x -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { -#if HWY_TARGET >= HWY_SSE4 - return add - mul * x; -#else - return Vec128{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)}; -#endif -} -template -HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, - Vec128 add) { -#if HWY_TARGET >= HWY_SSE4 - return add - mul * x; -#else - return Vec128{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)}; -#endif -} - -// Returns mul * x - sub -template -HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, - Vec128 sub) { -#if HWY_TARGET >= HWY_SSE4 - return mul * x - sub; -#else - return Vec128{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)}; -#endif -} -template -HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, - Vec128 sub) { -#if HWY_TARGET >= HWY_SSE4 - return mul * x - sub; -#else - return Vec128{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)}; -#endif -} - -// Returns -mul * x - sub -template -HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, - Vec128 sub) { -#if HWY_TARGET >= HWY_SSE4 - return Neg(mul) * x - sub; -#else - return Vec128{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)}; -#endif -} -template -HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, - Vec128 sub) { -#if HWY_TARGET >= HWY_SSE4 - return Neg(mul) * x - sub; -#else - return Vec128{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)}; 
-#endif -} - -// ------------------------------ Floating-point square root - -// Full precision square root -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Sqrt(Vec128 v) { - return Vec128{_mm_sqrt_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Sqrt(Vec128 v) { - return Vec128{_mm_sqrt_ps(v.raw)}; -} -HWY_API Vec128 Sqrt(Vec128 v) { - return Vec128{_mm_sqrt_ss(v.raw)}; -} -template -HWY_API Vec128 Sqrt(Vec128 v) { - return Vec128{_mm_sqrt_pd(v.raw)}; -} -HWY_API Vec64 Sqrt(Vec64 v) { - return Vec64{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)}; -} - -// Approximate reciprocal square root -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { - return Vec128{_mm_rsqrt_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { - return Vec128{_mm_rsqrt_ps(v.raw)}; -} -HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { - return Vec128{_mm_rsqrt_ss(v.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -#ifdef HWY_NATIVE_F64_APPROX_RSQRT -#undef HWY_NATIVE_F64_APPROX_RSQRT -#else -#define HWY_NATIVE_F64_APPROX_RSQRT -#endif - -HWY_API Vec64 ApproximateReciprocalSqrt(Vec64 v) { - return Vec64{_mm_rsqrt14_sd(v.raw, v.raw)}; -} -HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { -#if HWY_COMPILER_MSVC - const DFromV d; - return Vec128{_mm_mask_rsqrt14_pd( - Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)}; -#else - return Vec128{_mm_rsqrt14_pd(v.raw)}; -#endif -} -#endif - -// ------------------------------ Min (Gt, IfThenElse) - -namespace detail { - -template -HWY_INLINE HWY_MAYBE_UNUSED Vec128 MinU(const Vec128 a, - const Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; - const auto msb = Set(du, static_cast(T(1) << (sizeof(T) * 8 - 1))); - const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); - return IfThenElse(gt, b, a); -} - -} // namespace detail - -// Unsigned -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{_mm_min_epu8(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return detail::MinU(a, b); -#else - return Vec128{_mm_min_epu16(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return detail::MinU(a, b); -#else - return Vec128{_mm_min_epu32(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_min_epu64(a.raw, b.raw)}; -#else - return detail::MinU(a, b); -#endif -} - -// Signed -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return IfThenElse(a < b, a, b); -#else - return Vec128{_mm_min_epi8(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{_mm_min_epi16(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return IfThenElse(a < b, a, b); -#else - return Vec128{_mm_min_epi32(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_min_epi64(a.raw, b.raw)}; -#else - return IfThenElse(a < b, a, b); -#endif -} - -// Float -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Min(Vec128 a, - Vec128 b) { - return Vec128{_mm_min_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return Vec128{_mm_min_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 Min(Vec128 a, Vec128 b) { - return 
Vec128{_mm_min_pd(a.raw, b.raw)}; -} - -// ------------------------------ Max (Gt, IfThenElse) - -namespace detail { -template -HWY_INLINE HWY_MAYBE_UNUSED Vec128 MaxU(const Vec128 a, - const Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; - const auto msb = Set(du, static_cast(T(1) << (sizeof(T) * 8 - 1))); - const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); - return IfThenElse(gt, a, b); -} - -} // namespace detail - -// Unsigned -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{_mm_max_epu8(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return detail::MaxU(a, b); -#else - return Vec128{_mm_max_epu16(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return detail::MaxU(a, b); -#else - return Vec128{_mm_max_epu32(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_max_epu64(a.raw, b.raw)}; -#else - return detail::MaxU(a, b); -#endif -} - -// Signed -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return IfThenElse(a < b, b, a); -#else - return Vec128{_mm_max_epi8(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{_mm_max_epi16(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - return IfThenElse(a < b, b, a); -#else - return Vec128{_mm_max_epi32(a.raw, b.raw)}; -#endif -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_max_epi64(a.raw, b.raw)}; -#else - return IfThenElse(a < b, b, a); -#endif -} - -// Float -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Max(Vec128 a, - Vec128 b) { - return Vec128{_mm_max_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{_mm_max_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 Max(Vec128 a, Vec128 b) { - return Vec128{_mm_max_pd(a.raw, b.raw)}; -} - -// ================================================== MEMORY (3) - -// ------------------------------ Non-temporal stores - -// On clang6, we see incorrect code generated for _mm_stream_pi, so -// round even partial vectors up to 16 bytes. -template -HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - const RebindToUnsigned du; // for float16_t - _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), BitCast(du, v).raw); -} -template -HWY_API void Stream(VFromD v, D /* tag */, float* HWY_RESTRICT aligned) { - _mm_stream_ps(aligned, v.raw); -} -template -HWY_API void Stream(VFromD v, D /* tag */, double* HWY_RESTRICT aligned) { - _mm_stream_pd(aligned, v.raw); -} - -// ------------------------------ Scatter - -// Work around warnings in the intrinsic definitions (passing -1 as a mask). -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") - -// Unfortunately the GCC/Clang intrinsics do not accept int64_t*. 
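// (They take long long const* for the 64-bit gathers and void* for the
// scatters, hence the long long alias below.)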
-using GatherIndex64 = long long int; // NOLINT(runtime/int) -static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type"); - -#if HWY_TARGET <= HWY_AVX3 - -#ifdef HWY_NATIVE_SCATTER -#undef HWY_NATIVE_SCATTER -#else -#define HWY_NATIVE_SCATTER -#endif - -namespace detail { - -template -HWY_INLINE void NativeScatter128(VFromD v, D d, TFromD* HWY_RESTRICT base, - VI index) { - if (d.MaxBytes() == 16) { - _mm_i32scatter_epi32(base, index.raw, v.raw, kScale); - } else { - const __mmask8 mask = (1u << MaxLanes(d)) - 1; - _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale); - } -} - -template -HWY_INLINE void NativeScatter128(VFromD v, D d, TFromD* HWY_RESTRICT base, - VI index) { - if (d.MaxBytes() == 16) { - _mm_i64scatter_epi64(base, index.raw, v.raw, kScale); - } else { - const __mmask8 mask = (1u << MaxLanes(d)) - 1; - _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale); - } -} - -template -HWY_INLINE void NativeScatter128(VFromD v, D d, float* HWY_RESTRICT base, - VI index) { - if (d.MaxBytes() == 16) { - _mm_i32scatter_ps(base, index.raw, v.raw, kScale); - } else { - const __mmask8 mask = (1u << MaxLanes(d)) - 1; - _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale); - } -} - -template -HWY_INLINE void NativeScatter128(VFromD v, D d, double* HWY_RESTRICT base, - VI index) { - if (d.MaxBytes() == 16) { - _mm_i64scatter_pd(base, index.raw, v.raw, kScale); - } else { - const __mmask8 mask = (1u << MaxLanes(d)) - 1; - _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale); - } -} - -template -HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT base, VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. - if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); - _mm_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, kScale); -} - -template -HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT base, VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. - if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); - _mm_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, kScale); -} - -template -HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, - float* HWY_RESTRICT base, VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. - if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); - _mm_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, kScale); -} - -template -HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, - double* HWY_RESTRICT base, VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. 
- if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); - _mm_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, kScale); -} - -} // namespace detail - -template -HWY_API void ScatterOffset(VFromD v, D d, TFromD* HWY_RESTRICT base, - VFromD> offset) { - return detail::NativeScatter128<1>(v, d, base, offset); -} -template -HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, - VFromD> index) { - return detail::NativeScatter128)>(v, d, base, index); -} -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT base, - VFromD> index) { - return detail::NativeMaskedScatter128)>(v, m, d, base, - index); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Gather (Load/Store) - -#if HWY_TARGET <= HWY_AVX2 - -#ifdef HWY_NATIVE_GATHER -#undef HWY_NATIVE_GATHER -#else -#define HWY_NATIVE_GATHER -#endif - -namespace detail { - -template -HWY_INLINE VFromD NativeGather128(D /* tag */, - const TFromD* HWY_RESTRICT base, - VI index) { - return VFromD{_mm_i32gather_epi32(reinterpret_cast(base), - index.raw, kScale)}; -} - -template -HWY_INLINE VFromD NativeGather128(D /* tag */, - const TFromD* HWY_RESTRICT base, - VI index) { - return VFromD{_mm_i64gather_epi64( - reinterpret_cast(base), index.raw, kScale)}; -} - -template -HWY_INLINE VFromD NativeGather128(D /* tag */, - const float* HWY_RESTRICT base, VI index) { - return VFromD{_mm_i32gather_ps(base, index.raw, kScale)}; -} - -template -HWY_INLINE VFromD NativeGather128(D /* tag */, - const double* HWY_RESTRICT base, - VI index) { - return VFromD{_mm_i64gather_pd(base, index.raw, kScale)}; -} - -template -HWY_INLINE VFromD NativeMaskedGather128(MFromD m, D d, - const TFromD* HWY_RESTRICT base, - VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. - if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); -#if HWY_TARGET <= HWY_AVX3 - return VFromD{_mm_mmask_i32gather_epi32( - Zero(d).raw, m.raw, index.raw, reinterpret_cast(base), - kScale)}; -#else - return VFromD{_mm_mask_i32gather_epi32( - Zero(d).raw, reinterpret_cast(base), index.raw, m.raw, - kScale)}; -#endif -} - -template -HWY_INLINE VFromD NativeMaskedGather128(MFromD m, D d, - const TFromD* HWY_RESTRICT base, - VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. - if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); -#if HWY_TARGET <= HWY_AVX3 - return VFromD{_mm_mmask_i64gather_epi64( - Zero(d).raw, m.raw, index.raw, - reinterpret_cast(base), kScale)}; -#else - return VFromD{_mm_mask_i64gather_epi64( - Zero(d).raw, reinterpret_cast(base), index.raw, - m.raw, kScale)}; -#endif -} - -template -HWY_INLINE VFromD NativeMaskedGather128(MFromD m, D d, - const float* HWY_RESTRICT base, - VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. - if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); -#if HWY_TARGET <= HWY_AVX3 - return VFromD{ - _mm_mmask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, kScale)}; -#else - return VFromD{ - _mm_mask_i32gather_ps(Zero(d).raw, base, index.raw, m.raw, kScale)}; -#endif -} - -template -HWY_INLINE VFromD NativeMaskedGather128(MFromD m, D d, - const double* HWY_RESTRICT base, - VI index) { - // For partial vectors, ensure upper mask lanes are zero to prevent faults. 
- if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); -#if HWY_TARGET <= HWY_AVX3 - return VFromD{ - _mm_mmask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, kScale)}; -#else - return VFromD{ - _mm_mask_i64gather_pd(Zero(d).raw, base, index.raw, m.raw, kScale)}; -#endif -} - -} // namespace detail - -template , class VI> -HWY_API VFromD GatherOffset(D d, const T* HWY_RESTRICT base, VI offset) { - static_assert(sizeof(T) == sizeof(TFromV), "Index/lane size must match"); - return detail::NativeGather128<1>(d, base, offset); -} -template , class VI> -HWY_API VFromD GatherIndex(D d, const T* HWY_RESTRICT base, VI index) { - static_assert(sizeof(T) == sizeof(TFromV), "Index/lane size must match"); - return detail::NativeGather128(d, base, index); -} -template , class VI> -HWY_API VFromD MaskedGatherIndex(MFromD m, D d, - const T* HWY_RESTRICT base, VI index) { - static_assert(sizeof(T) == sizeof(TFromV), "Index/lane size must match"); - return detail::NativeMaskedGather128(m, d, base, index); -} - -#endif // HWY_TARGET <= HWY_AVX2 - -HWY_DIAGNOSTICS(pop) - -// ================================================== SWIZZLE (2) - -// ------------------------------ LowerHalf - -template -HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { - return VFromD{v.raw}; -} -template -HWY_API Vec128 LowerHalf(Vec128 v) { - return Vec128{v.raw}; -} - -// ------------------------------ ShiftLeftBytes - -template -HWY_API VFromD ShiftLeftBytes(D d, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - const RebindToUnsigned du; - return BitCast( - d, VFromD{_mm_slli_si128(BitCast(du, v).raw, kBytes)}); -} - -// Generic for all vector lengths. -template -HWY_API V ShiftLeftBytes(const V v) { - return ShiftLeftBytes(DFromV(), v); -} - -// ------------------------------ ShiftLeftLanes - -// Generic for all vector lengths. -template -HWY_API VFromD ShiftLeftLanes(D d, const VFromD v) { - const Repartition d8; - return BitCast(d, ShiftLeftBytes)>(BitCast(d8, v))); -} - -// Generic for all vector lengths. -template -HWY_API V ShiftLeftLanes(const V v) { - return ShiftLeftLanes(DFromV(), v); -} - -// ------------------------------ ShiftRightBytes -template -HWY_API VFromD ShiftRightBytes(D d, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - const RebindToUnsigned du; - // For partial vectors, clear upper lanes so we shift in zeros. - if (d.MaxBytes() != 16) { - const Full128> dfull; - const VFromD vfull{v.raw}; - v = VFromD{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; - } - return BitCast( - d, VFromD{_mm_srli_si128(BitCast(du, v).raw, kBytes)}); -} - -// ------------------------------ ShiftRightLanes -// Generic for all vector lengths. -template -HWY_API VFromD ShiftRightLanes(D d, const VFromD v) { - const Repartition d8; - constexpr size_t kBytes = kLanes * sizeof(TFromD); - return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); -} - -// ------------------------------ UpperHalf (ShiftRightBytes) - -// Full input: copy hi into lo (smaller instruction encoding than shifts). 
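// (unpackhi_epi64(v, v) broadcasts the upper 64 bits into both halves, and
// LowerHalf of the result is then just a re-tag of the same register, with no
// shift needed.)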
-template -HWY_API VFromD UpperHalf(D d, VFromD> v) { - const Twice> dut; - using VUT = VFromD; // for float16_t - const VUT vut = BitCast(dut, v); - return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)})); -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64{_mm_movehl_ps(v.raw, v.raw)}; -} -template -HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { - return Vec64{_mm_unpackhi_pd(v.raw, v.raw)}; -} - -// Partial -template -HWY_API VFromD UpperHalf(D d, VFromD> v) { - return LowerHalf(d, ShiftRightBytes(Twice(), v)); -} - -// ------------------------------ ExtractLane (UpperHalf) - -namespace detail { - -template -HWY_INLINE T ExtractLane(const Vec128 v) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_TARGET >= HWY_SSSE3 - const int pair = _mm_extract_epi16(v.raw, kLane / 2); - constexpr int kShift = kLane & 1 ? 8 : 0; - return static_cast((pair >> kShift) & 0xFF); -#else - return static_cast(_mm_extract_epi8(v.raw, kLane) & 0xFF); -#endif -} - -template -HWY_INLINE T ExtractLane(const Vec128 v) { - static_assert(kLane < N, "Lane index out of bounds"); - const DFromV d; - const RebindToUnsigned du; - const uint16_t lane = static_cast( - _mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF); - T ret; - CopySameSize(&lane, &ret); // for float16_t - return ret; -} - -template -HWY_INLINE T ExtractLane(const Vec128 v) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_TARGET >= HWY_SSSE3 - return static_cast(_mm_cvtsi128_si32( - (kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane))); -#else - return static_cast(_mm_extract_epi32(v.raw, kLane)); -#endif -} - -template -HWY_INLINE T ExtractLane(const Vec128 v) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_ARCH_X86_32 - alignas(16) T lanes[2]; - Store(v, DFromV(), lanes); - return lanes[kLane]; -#elif HWY_TARGET >= HWY_SSSE3 - return static_cast( - _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE))); -#else - return static_cast(_mm_extract_epi64(v.raw, kLane)); -#endif -} - -template -HWY_INLINE float ExtractLane(const Vec128 v) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_TARGET >= HWY_SSSE3 - return _mm_cvtss_f32((kLane == 0) ? v.raw - : _mm_shuffle_ps(v.raw, v.raw, kLane)); -#else - // Bug in the intrinsic, returns int but should be float. - const int32_t bits = _mm_extract_ps(v.raw, kLane); - float ret; - CopySameSize(&bits, &ret); - return ret; -#endif -} - -// There is no extract_pd; two overloads because there is no UpperHalf for N=1. -template -HWY_INLINE double ExtractLane(const Vec64 v) { - static_assert(kLane == 0, "Lane index out of bounds"); - return GetLane(v); -} - -template -HWY_INLINE double ExtractLane(const Vec128 v) { - static_assert(kLane < 2, "Lane index out of bounds"); - const Half> dh; - return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v)); -} - -} // namespace detail - -// Requires one overload per vector length because ExtractLane<3> may be a -// compile error if it calls _mm_extract_epi64. 
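// The runtime-index overloads below exist because the extract intrinsics take
// an immediate lane number; a condensed sketch of the dispatch pattern they
// all share (the SSE2-only int32 helper is illustrative): promote an index the
// compiler can prove constant to the immediate path, otherwise spill to the
// stack and index the array.
static inline int32_t ExtractLaneDispatchSketch(__m128i raw, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return _mm_cvtsi128_si32(raw);
      case 1:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(raw, 1));
      case 2:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(raw, 2));
      case 3:
        return _mm_cvtsi128_si32(_mm_shuffle_epi32(raw, 3));
    }
  }
#endif
  alignas(16) int32_t lanes[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes), raw);
  return lanes[i];
}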
-template -HWY_API T ExtractLane(const Vec128 v, size_t i) { - HWY_DASSERT(i == 0); - (void)i; - return GetLane(v); -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - } - } -#endif - alignas(16) T lanes[2]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - case 2: - return detail::ExtractLane<2>(v); - case 3: - return detail::ExtractLane<3>(v); - } - } -#endif - alignas(16) T lanes[4]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - case 2: - return detail::ExtractLane<2>(v); - case 3: - return detail::ExtractLane<3>(v); - case 4: - return detail::ExtractLane<4>(v); - case 5: - return detail::ExtractLane<5>(v); - case 6: - return detail::ExtractLane<6>(v); - case 7: - return detail::ExtractLane<7>(v); - } - } -#endif - alignas(16) T lanes[8]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -template -HWY_API T ExtractLane(const Vec128 v, size_t i) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::ExtractLane<0>(v); - case 1: - return detail::ExtractLane<1>(v); - case 2: - return detail::ExtractLane<2>(v); - case 3: - return detail::ExtractLane<3>(v); - case 4: - return detail::ExtractLane<4>(v); - case 5: - return detail::ExtractLane<5>(v); - case 6: - return detail::ExtractLane<6>(v); - case 7: - return detail::ExtractLane<7>(v); - case 8: - return detail::ExtractLane<8>(v); - case 9: - return detail::ExtractLane<9>(v); - case 10: - return detail::ExtractLane<10>(v); - case 11: - return detail::ExtractLane<11>(v); - case 12: - return detail::ExtractLane<12>(v); - case 13: - return detail::ExtractLane<13>(v); - case 14: - return detail::ExtractLane<14>(v); - case 15: - return detail::ExtractLane<15>(v); - } - } -#endif - alignas(16) T lanes[16]; - Store(v, DFromV(), lanes); - return lanes[i]; -} - -// ------------------------------ InsertLane (UpperHalf) - -namespace detail { - -template -HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV t) { - const DFromV d; - -#if HWY_TARGET <= HWY_AVX3 - using RawMask = decltype(MaskFromVec(VFromD()).raw); - const auto mask = MFromD{static_cast(uint64_t{1} << i)}; -#else - const RebindToUnsigned du; - using TU = TFromD; - const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast(i))); -#endif - - return IfThenElse(mask, Set(d, t), v); -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_TARGET >= HWY_SSSE3 - return InsertLaneUsingBroadcastAndBlend(v, kLane, t); -#else - return Vec128{_mm_insert_epi8(v.raw, t, kLane)}; -#endif -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); - const DFromV d; - const RebindToUnsigned du; - uint16_t 
bits; - CopySameSize(&t, &bits); // for float16_t - return BitCast(d, VFromD{ - _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)}); -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_TARGET >= HWY_SSSE3 - return InsertLaneUsingBroadcastAndBlend(v, kLane, t); -#else - MakeSigned ti; - CopySameSize(&t, &ti); // don't just cast because T might be float. - return Vec128{_mm_insert_epi32(v.raw, ti, kLane)}; -#endif -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32 - const DFromV d; - const RebindToFloat df; - const auto vt = BitCast(df, Set(d, t)); - if (kLane == 0) { - return BitCast( - d, Vec128{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)}); - } - return BitCast( - d, Vec128{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)}); -#else - MakeSigned ti; - CopySameSize(&t, &ti); // don't just cast because T might be float. - return Vec128{_mm_insert_epi64(v.raw, ti, kLane)}; -#endif -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { - static_assert(kLane < N, "Lane index out of bounds"); -#if HWY_TARGET >= HWY_SSSE3 - return InsertLaneUsingBroadcastAndBlend(v, kLane, t); -#else - return Vec128{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)}; -#endif -} - -// There is no insert_pd; two overloads because there is no UpperHalf for N=1. -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { - static_assert(kLane == 0, "Lane index out of bounds"); - return Set(DFromV(), t); -} - -template -HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { - static_assert(kLane < 2, "Lane index out of bounds"); - const DFromV d; - const Vec128 vt = Set(d, t); - if (kLane == 0) { - return Vec128{_mm_shuffle_pd(vt.raw, v.raw, 2)}; - } - return Vec128{_mm_shuffle_pd(v.raw, vt.raw, 0)}; -} - -} // namespace detail - -// Requires one overload per vector length because InsertLane<3> may be a -// compile error if it calls _mm_insert_epi64. 
- -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { - HWY_DASSERT(i == 0); - (void)i; - return Set(DFromV(), t); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - } - } -#endif - return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - } - } -#endif - return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - case 4: - return detail::InsertLane<4>(v, t); - case 5: - return detail::InsertLane<5>(v, t); - case 6: - return detail::InsertLane<6>(v, t); - case 7: - return detail::InsertLane<7>(v, t); - } - } -#endif - return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); -} - -template -HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(i)) { - switch (i) { - case 0: - return detail::InsertLane<0>(v, t); - case 1: - return detail::InsertLane<1>(v, t); - case 2: - return detail::InsertLane<2>(v, t); - case 3: - return detail::InsertLane<3>(v, t); - case 4: - return detail::InsertLane<4>(v, t); - case 5: - return detail::InsertLane<5>(v, t); - case 6: - return detail::InsertLane<6>(v, t); - case 7: - return detail::InsertLane<7>(v, t); - case 8: - return detail::InsertLane<8>(v, t); - case 9: - return detail::InsertLane<9>(v, t); - case 10: - return detail::InsertLane<10>(v, t); - case 11: - return detail::InsertLane<11>(v, t); - case 12: - return detail::InsertLane<12>(v, t); - case 13: - return detail::InsertLane<13>(v, t); - case 14: - return detail::InsertLane<14>(v, t); - case 15: - return detail::InsertLane<15>(v, t); - } - } -#endif - return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); -} - -// ------------------------------ CombineShiftRightBytes - -#if HWY_TARGET == HWY_SSE2 -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - static_assert(0 < kBytes && kBytes < 16, "kBytes invalid"); - return Or(ShiftRightBytes(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi)); -} -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - constexpr size_t kSize = d.MaxBytes(); - static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); - - const Twice dt; - return VFromD{ShiftRightBytes(dt, Combine(dt, hi, lo)).raw}; -} -#else -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - const Repartition d8; - return BitCast(d, Vec128{_mm_alignr_epi8( - BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); -} - -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - constexpr size_t kSize = d.MaxBytes(); - 
static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); - const Repartition d8; - using V8 = Vec128; - const DFromV dfull8; - const Repartition, decltype(dfull8)> dfull; - const V8 hi8{BitCast(d8, hi).raw}; - // Move into most-significant bytes - const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); - const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); - return VFromD{BitCast(dfull, r).raw}; -} -#endif - -// ------------------------------ Broadcast/splat any lane - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const VU vu = BitCast(du, v); // for float16_t - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - if (kLane < 4) { - const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF); - return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)}); - } else { - const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF); - return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)}); - } -} - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; -} - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; -} - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; -} - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - return Vec128{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)}; -} - -// ------------------------------ TableLookupLanes (Shuffle01) - -// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
-template -struct Indices128 { - __m128i raw; -}; - -template , typename TI, size_t kN, - HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)> -HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const Rebind di; - HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && - AllTrue(di, Lt(vec, Set(di, kN * 2)))); -#endif - - // No change as byte indices are always used for 8-bit lane types - (void)d; - return Indices128{vec.raw}; -} - -template , typename TI, size_t kN, - HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)> -HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const Rebind di; - HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && - AllTrue(di, Lt(vec, Set(di, kN * 2)))); -#endif - -#if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 - (void)d; - return Indices128{vec.raw}; -#else // SSSE3, SSE4, or AVX2 - const Repartition d8; - using V8 = VFromD; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - - // Broadcast each lane index to all 4 bytes of T - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); - - // Shift to bytes - const Repartition d16; - const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); - - return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; -#endif // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 -} - -template , typename TI, size_t kN, - HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)> -HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const Rebind di; - HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && - AllTrue(di, Lt(vec, Set(di, kN * 2)))); -#endif - -#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 - (void)d; - return Indices128{vec.raw}; -#else - const Repartition d8; - using V8 = VFromD; - alignas(16) static constexpr uint8_t kByteOffsets[16] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; - - // Broadcast each lane index to all 4 bytes of T - alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { - 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; - const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); - - // Shift to bytes - const Repartition d16; - const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); - - return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; -#endif -} - -template , typename TI, size_t kN, - HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)> -HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const Rebind di; - HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && - AllTrue(di, Lt(vec, Set(di, static_cast(kN * 2))))); -#else - (void)d; -#endif - - // No change - even without AVX3, we can shuffle+blend. 
- return Indices128{vec.raw}; -} - -template -HWY_API Indices128, HWY_MAX_LANES_D(D)> SetTableIndices( - D d, const TI* idx) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); - const Rebind di; - return IndicesFromVec(d, LoadU(di, idx)); -} - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { - return TableLookupBytes(v, Vec128{idx.raw}); -} - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { -#if HWY_TARGET <= HWY_AVX3 - return {_mm_permutexvar_epi16(idx.raw, v.raw)}; -#elif HWY_TARGET == HWY_SSE2 -#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) - typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16))); - return Vec128{reinterpret_cast::type>( - __builtin_shuffle(reinterpret_cast(v.raw), - reinterpret_cast(idx.raw)))}; -#else - const Full128 d_full; - alignas(16) T src_lanes[8]; - alignas(16) uint16_t indices[8]; - alignas(16) T result_lanes[8]; - - Store(Vec128{v.raw}, d_full, src_lanes); - _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); - - for (int i = 0; i < 8; i++) { - result_lanes[i] = src_lanes[indices[i] & 7u]; - } - - return Vec128{Load(d_full, result_lanes).raw}; -#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) -#else - return TableLookupBytes(v, Vec128{idx.raw}); -#endif -} - -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 TableLookupLanes(Vec128 v, - Indices128 idx) { - return {_mm_permutexvar_ph(idx.raw, v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { -#if HWY_TARGET <= HWY_AVX2 - const DFromV d; - const RebindToFloat df; - const Vec128 perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)}; - return BitCast(d, perm); -#elif HWY_TARGET == HWY_SSE2 -#if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) - typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); - return Vec128{reinterpret_cast::type>( - __builtin_shuffle(reinterpret_cast(v.raw), - reinterpret_cast(idx.raw)))}; -#else - const Full128 d_full; - alignas(16) T src_lanes[4]; - alignas(16) uint32_t indices[4]; - alignas(16) T result_lanes[4]; - - Store(Vec128{v.raw}, d_full, src_lanes); - _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); - - for (int i = 0; i < 4; i++) { - result_lanes[i] = src_lanes[indices[i] & 3u]; - } - - return Vec128{Load(d_full, result_lanes).raw}; -#endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) -#else // SSSE3 or SSE4 - return TableLookupBytes(v, Vec128{idx.raw}); -#endif -} - -#if HWY_TARGET <= HWY_SSSE3 -template -HWY_API Vec128 TableLookupLanes(Vec128 v, - Indices128 idx) { -#if HWY_TARGET <= HWY_AVX2 - return Vec128{_mm_permutevar_ps(v.raw, idx.raw)}; -#else // SSSE3 or SSE4 - const DFromV df; - const RebindToSigned di; - return BitCast(df, - TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); -#endif // HWY_TARGET <= HWY_AVX2 -} -#endif // HWY_TARGET <= HWY_SSSE3 - -// Single lane: no change -template -HWY_API Vec128 TableLookupLanes(Vec128 v, - Indices128 /* idx */) { - return v; -} - -template -HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { - const DFromV d; - Vec128 vidx{idx.raw}; -#if HWY_TARGET <= HWY_AVX2 - // There is no _mm_permute[x]var_epi64. - vidx += vidx; // bit1 is the decider (unusual) - const RebindToFloat df; - return BitCast( - d, Vec128{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)}); -#else - // Only 2 lanes: can swap+blend. Choose v if vidx == iota. 
To avoid a 64-bit - // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 - // to obtain an all-zero or all-one mask. - const RebindToSigned di; - const Vec128 same = (vidx ^ Iota(di, 0)) - Set(di, 1); - const Mask128 mask_same = RebindMask(d, MaskFromVec(same)); - return IfThenElse(mask_same, v, Shuffle01(v)); -#endif -} - -HWY_API Vec128 TableLookupLanes(Vec128 v, - Indices128 idx) { - Vec128 vidx{idx.raw}; -#if HWY_TARGET <= HWY_AVX2 - vidx += vidx; // bit1 is the decider (unusual) - return Vec128{_mm_permutevar_pd(v.raw, vidx.raw)}; -#else - // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit - // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 - // to obtain an all-zero or all-one mask. - const DFromV d; - const RebindToSigned di; - const Vec128 same = (vidx ^ Iota(di, 0)) - Set(di, 1); - const Mask128 mask_same = RebindMask(d, MaskFromVec(same)); - return IfThenElse(mask_same, v, Shuffle01(v)); -#endif -} - -// ------------------------------ ReverseBlocks - -// Single block: no change -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return v; -} - -// ------------------------------ Reverse (Shuffle0123, Shuffle2301) - -// Single lane: no change -template -HWY_API VFromD Reverse(D /* tag */, VFromD v) { - return v; -} - -// 32-bit x2: shuffle -template -HWY_API VFromD Reverse(D /* tag */, const VFromD v) { - return VFromD{Shuffle2301(Vec128>{v.raw}).raw}; -} - -// 64-bit x2: shuffle -template -HWY_API VFromD Reverse(D /* tag */, const VFromD v) { - return Shuffle01(v); -} - -// 32-bit x4: shuffle -template -HWY_API VFromD Reverse(D /* tag */, const VFromD v) { - return Shuffle0123(v); -} - -// 16-bit -template -HWY_API VFromD Reverse(D d, const VFromD v) { - const RebindToUnsigned du; - using VU = VFromD; - const VU vu = BitCast(du, v); // for float16_t - constexpr size_t kN = MaxLanes(d); - if (kN == 1) return v; - if (kN == 2) { - return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))}); - } - if (kN == 4) { - return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); - } - -#if HWY_TARGET == HWY_SSE2 - const VU rev4{ - _mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), - _MM_SHUFFLE(0, 1, 2, 3))}; - return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))}); -#else - const RebindToSigned di; - alignas(16) static constexpr int16_t kShuffle[8] = { - 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; - return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); -#endif -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { - constexpr int kN = static_cast(MaxLanes(d)); - if (kN == 1) return v; -#if HWY_TARGET <= HWY_SSSE3 - // NOTE: Lanes with negative shuffle control mask values are set to zero. - alignas(16) static constexpr int8_t kReverse[16] = { - kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, - kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16}; - const RebindToSigned di; - const VFromD idx = Load(di, kReverse); - return VFromD{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)}; -#else - const RepartitionToWide d16; - return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v)))); -#endif -} - -// ------------------------------ Reverse2 - -// Single lane: no change -template -HWY_API VFromD Reverse2(D /* tag */, VFromD v) { - return v; -} - -// Generic for all vector lengths (128-bit sufficient if SSE2). 
-template -HWY_API VFromD Reverse2(D d, VFromD v) { -#if HWY_TARGET <= HWY_AVX3 - const Repartition du32; - return BitCast(d, RotateRight<16>(BitCast(du32, v))); -#elif HWY_TARGET == HWY_SSE2 - const RebindToUnsigned du; - using VU = VFromD; - const VU vu = BitCast(du, v); // for float16_t - constexpr size_t kN = MaxLanes(d); - __m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1)); - if (kN > 4) { - shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1)); - } - return BitCast(d, VU{shuf_result}); -#else - const RebindToSigned di; - alignas(16) static constexpr int16_t kShuffle[8] = { - 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C}; - return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); -#endif -} - -// Generic for all vector lengths. -template -HWY_API VFromD Reverse2(D /* tag */, VFromD v) { - return Shuffle2301(v); -} - -// Generic for all vector lengths. -template -HWY_API VFromD Reverse2(D /* tag */, VFromD v) { - return Shuffle01(v); -} - -// ------------------------------ Reverse4 - -template -HWY_API VFromD Reverse4(D d, VFromD v) { - const RebindToUnsigned du; - using VU = VFromD; - const VU vu = BitCast(du, v); // for float16_t - // 4x 16-bit: a single shufflelo suffices. - constexpr size_t kN = MaxLanes(d); - if (kN <= 4) { - return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); - } - -#if HWY_TARGET == HWY_SSE2 - return BitCast(d, VU{_mm_shufflehi_epi16( - _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), - _MM_SHUFFLE(0, 1, 2, 3))}); -#else - const RebindToSigned di; - alignas(16) static constexpr int16_t kShuffle[8] = { - 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908}; - return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); -#endif -} - -// Generic for all vector lengths. -template -HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { - return Shuffle0123(v); -} - -template -HWY_API VFromD Reverse4(D /* tag */, VFromD /* v */) { - HWY_ASSERT(0); // don't have 4 u64 lanes -} - -// ------------------------------ Reverse8 - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { -#if HWY_TARGET == HWY_SSE2 - const RepartitionToWide dw; - return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); -#else - const RebindToSigned di; - alignas(16) static constexpr int16_t kShuffle[8] = { - 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; - return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); -#endif -} - -template -HWY_API VFromD Reverse8(D /* tag */, VFromD /* v */) { - HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit -} - -// ------------------------------ ReverseBits - -#if HWY_TARGET <= HWY_AVX3_DL - -#ifdef HWY_NATIVE_REVERSE_BITS_UI8 -#undef HWY_NATIVE_REVERSE_BITS_UI8 -#else -#define HWY_NATIVE_REVERSE_BITS_UI8 -#endif - -template , 16)> -HWY_API V ReverseBits(V v) { - const Full128 du64_full; - const auto affine_matrix = Set(du64_full, 0x8040201008040201u); - return V{_mm_gf2p8affine_epi64_epi8(v.raw, affine_matrix.raw, 0)}; -} -#endif // HWY_TARGET <= HWY_AVX3_DL - -// ------------------------------ InterleaveLower - -// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides -// the least-significant lane) and "b". To concatenate two half-width integers -// into one, use ZipLower/Upper instead (also works with scalar). 
- -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - return Vec128{_mm_unpacklo_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; // for float16_t - return BitCast( - d, VU{_mm_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - return Vec128{_mm_unpacklo_epi32(a.raw, b.raw)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { - return Vec128{_mm_unpacklo_epi64(a.raw, b.raw)}; -} - -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{_mm_unpacklo_ps(a.raw, b.raw)}; -} -template -HWY_API Vec128 InterleaveLower(Vec128 a, - Vec128 b) { - return Vec128{_mm_unpacklo_pd(a.raw, b.raw)}; -} - -// Generic for all vector lengths. -template -HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { - return InterleaveLower(a, b); -} - -// ------------------------------ InterleaveUpper (UpperHalf) - -// Full -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm_unpackhi_epi8(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; // for float16_t - return BitCast( - d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm_unpackhi_epi32(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm_unpackhi_epi64(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm_unpackhi_ps(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm_unpackhi_pd(a.raw, b.raw)}; -} - -// Partial -template -HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { - const Half d2; - return InterleaveLower(d, VFromD{UpperHalf(d2, a).raw}, - VFromD{UpperHalf(d2, b).raw}); -} - -// -------------------------- I8/U8 Broadcast (InterleaveLower, InterleaveUpper) - -template -HWY_API Vec128 Broadcast(const Vec128 v) { - static_assert(0 <= kLane && kLane < N, "Invalid lane"); - const DFromV d; - -#if HWY_TARGET == HWY_SSE2 - const Full128 d_full; - const Vec128 v_full{v.raw}; - const auto v_interleaved = (kLane < 8) - ? InterleaveLower(d_full, v_full, v_full) - : InterleaveUpper(d_full, v_full, v_full); - return ResizeBitCast( - d, Broadcast(BitCast(Full128(), v_interleaved))); -#else - return TableLookupBytes(v, Set(d, static_cast(kLane))); -#endif -} - -// ------------------------------ ZipLower/ZipUpper (InterleaveLower) - -// Same as Interleave*, except that the return lanes are double-width integers; -// this is necessary because the single-lane scalar cannot return two values. -// Generic for all vector lengths. 
-template >> -HWY_API VFromD ZipLower(V a, V b) { - return BitCast(DW(), InterleaveLower(a, b)); -} -template , class DW = RepartitionToWide> -HWY_API VFromD ZipLower(DW dw, V a, V b) { - return BitCast(dw, InterleaveLower(D(), a, b)); -} - -template , class DW = RepartitionToWide> -HWY_API VFromD ZipUpper(DW dw, V a, V b) { - return BitCast(dw, InterleaveUpper(D(), a, b)); -} - -// ------------------------------ Per4LaneBlockShuffle -namespace detail { - -#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#else -#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 -#endif - -template -HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, - const uint32_t x2, - const uint32_t x1, - const uint32_t x0) { - return ResizeBitCast( - d, Vec128{_mm_set_epi32( - static_cast(x3), static_cast(x2), - static_cast(x1), static_cast(x0))}); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<2> /*lane_size_tag*/, - hwy::SizeTag<8> /*vect_size_tag*/, V v) { - return V{_mm_shufflelo_epi16(v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -#if HWY_TARGET == HWY_SSE2 -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<2> /*lane_size_tag*/, - hwy::SizeTag<16> /*vect_size_tag*/, V v) { - constexpr int kShuffle = static_cast(kIdx3210 & 0xFF); - return V{_mm_shufflehi_epi16(_mm_shufflelo_epi16(v.raw, kShuffle), kShuffle)}; -} - -template * = nullptr> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag idx_3210_tag, - hwy::SizeTag<1> /*lane_size_tag*/, - hwy::SizeTag /*vect_size_tag*/, - V v) { - const DFromV d; - const RebindToUnsigned du; - const Rebind du16; - const RebindToSigned di16; - - const auto vu16 = PromoteTo(du16, BitCast(du, v)); - const auto shuf16_result = Per4LaneBlockShuffle( - idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag(), vu16); - return BitCast(d, DemoteTo(du, BitCast(di16, shuf16_result))); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag idx_3210_tag, - hwy::SizeTag<1> /*lane_size_tag*/, - hwy::SizeTag<16> /*vect_size_tag*/, V v) { - const DFromV d; - const RebindToUnsigned du; - const Repartition du16; - const RebindToSigned di16; - - const auto zero = Zero(d); - const auto v_lo16 = BitCast(du16, InterleaveLower(d, v, zero)); - const auto v_hi16 = BitCast(du16, InterleaveUpper(d, v, zero)); - - const auto lo_shuf_result = Per4LaneBlockShuffle( - idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_lo16); - const auto hi_shuf_result = Per4LaneBlockShuffle( - idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_hi16); - - return BitCast(d, OrderedDemote2To(du, BitCast(di16, lo_shuf_result), - BitCast(di16, hi_shuf_result))); -} -#endif - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<4> /*lane_size_tag*/, - hwy::SizeTag<16> /*vect_size_tag*/, V v) { - return V{_mm_shuffle_epi32(v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<4> /*lane_size_tag*/, - hwy::SizeTag<16> /*vect_size_tag*/, V v) { - return V{_mm_shuffle_ps(v.raw, v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -} // namespace detail - -// ------------------------------ SlideUpLanes - -namespace detail { - -template -HWY_INLINE V SlideUpLanes(V v, size_t amt) { - const DFromV d; - const Full64 du64; - const auto vu64 = ResizeBitCast(du64, v); - return ResizeBitCast( - d, ShiftLeftSame(vu64, static_cast(amt * sizeof(TFromV) * 8))); -} - -#if HWY_TARGET <= HWY_SSSE3 -template 
-HWY_INLINE V SlideUpLanes(V v, size_t amt) { - const DFromV d; - const Repartition du8; - const auto idx = - Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromV))); - return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); -} -#else -template -HWY_INLINE V SlideUpLanes(V v, size_t amt) { - const DFromV d; - const Repartition di32; - const Repartition du64; - constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV); - - const auto vu64 = BitCast(du64, v); - const auto v_hi = IfVecThenElse( - BitCast(du64, Set(di32, -static_cast(amt >= kNumOfLanesPerU64))), - BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64); - const auto v_lo = ShiftLeftBytes<8>(du64, v_hi); - - const int shl_amt = static_cast((amt * sizeof(TFromV) * 8) & 63); - return BitCast( - d, Or(ShiftLeftSame(v_hi, shl_amt), ShiftRightSame(v_lo, 64 - shl_amt))); -} -#endif - -} // namespace detail - -template -HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - case 4: - return ShiftLeftLanes<4>(d, v); - case 5: - return ShiftLeftLanes<5>(d, v); - case 6: - return ShiftLeftLanes<6>(d, v); - case 7: - return ShiftLeftLanes<7>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftLeftLanes<1>(d, v); - case 2: - return ShiftLeftLanes<2>(d, v); - case 3: - return ShiftLeftLanes<3>(d, v); - case 4: - return ShiftLeftLanes<4>(d, v); - case 5: - return ShiftLeftLanes<5>(d, v); - case 6: - return ShiftLeftLanes<6>(d, v); - case 7: - return ShiftLeftLanes<7>(d, v); - case 8: - return ShiftLeftLanes<8>(d, v); - case 9: - return ShiftLeftLanes<9>(d, v); - case 10: - return ShiftLeftLanes<10>(d, v); - case 11: - return ShiftLeftLanes<11>(d, v); - case 12: - return ShiftLeftLanes<12>(d, v); - case 13: - return ShiftLeftLanes<13>(d, v); - case 14: - return ShiftLeftLanes<14>(d, v); - case 15: - return ShiftLeftLanes<15>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideUpLanes(v, amt); -} - -// ------------------------------ SlideDownLanes - -namespace detail { - -template -HWY_INLINE V SlideDownLanes(V v, size_t amt) { - const DFromV d; - const Repartition, decltype(d)> dv; - return BitCast(d, - ShiftRightSame(BitCast(dv, v), - static_cast(amt * sizeof(TFromV) * 8))); -} 
- -#if HWY_TARGET <= HWY_SSSE3 -template -HWY_INLINE V SlideDownLanes(V v, size_t amt) { - const DFromV d; - const Repartition di8; - auto idx = Iota(di8, static_cast(amt * sizeof(TFromV))); - idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); - return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); -} -#else -template -HWY_INLINE V SlideDownLanes(V v, size_t amt) { - const DFromV d; - const Repartition di32; - const Repartition du64; - constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV); - - const auto vu64 = BitCast(du64, v); - const auto v_lo = IfVecThenElse( - BitCast(du64, Set(di32, -static_cast(amt >= kNumOfLanesPerU64))), - BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64); - const auto v_hi = ShiftRightBytes<8>(du64, v_lo); - - const int shr_amt = static_cast((amt * sizeof(TFromV) * 8) & 63); - return BitCast( - d, Or(ShiftRightSame(v_lo, shr_amt), ShiftLeftSame(v_hi, 64 - shr_amt))); -} -#endif - -} // namespace detail - -template -HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { - return v; -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return ShiftRightLanes<3>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return ShiftRightLanes<3>(d, v); - case 4: - return ShiftRightLanes<4>(d, v); - case 5: - return ShiftRightLanes<5>(d, v); - case 6: - return ShiftRightLanes<6>(d, v); - case 7: - return ShiftRightLanes<7>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return ShiftRightLanes<1>(d, v); - case 2: - return ShiftRightLanes<2>(d, v); - case 3: - return ShiftRightLanes<3>(d, v); - case 4: - return ShiftRightLanes<4>(d, v); - case 5: - return ShiftRightLanes<5>(d, v); - case 6: - return ShiftRightLanes<6>(d, v); - case 7: - return ShiftRightLanes<7>(d, v); - case 8: - return ShiftRightLanes<8>(d, v); - case 9: - return ShiftRightLanes<9>(d, v); - case 10: - return ShiftRightLanes<10>(d, v); - case 11: - return ShiftRightLanes<11>(d, v); - case 12: - return ShiftRightLanes<12>(d, v); - case 13: - return ShiftRightLanes<13>(d, v); - case 14: - return ShiftRightLanes<14>(d, v); - case 15: - return ShiftRightLanes<15>(d, v); - } - } -#else - (void)d; -#endif - - return detail::SlideDownLanes(v, amt); -} - -// ================================================== MEMORY (4) - -// ------------------------------ StoreN (ExtractLane) - -#if HWY_TARGET <= HWY_AVX2 
- -#ifdef HWY_NATIVE_STORE_N -#undef HWY_NATIVE_STORE_N -#else -#define HWY_NATIVE_STORE_N -#endif - -template -HWY_API void StoreN(VFromD v, D d, TFromD* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const size_t num_of_lanes_to_store = - HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); - -#if HWY_COMPILER_MSVC - // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore - HWY_FENCE; -#endif - - BlendedStore(v, FirstN(d, num_of_lanes_to_store), d, p); - -#if HWY_COMPILER_MSVC - // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore - HWY_FENCE; -#endif -} - -#if HWY_TARGET > HWY_AVX3 -template -HWY_API void StoreN(VFromD v, D d, TFromD* HWY_RESTRICT p, - size_t max_lanes_to_store) { - if (max_lanes_to_store > 0) { - StoreU(v, d, p); - } -} - -template -HWY_API void StoreN(VFromD v, D /*d*/, TFromD* HWY_RESTRICT p, - size_t max_lanes_to_store) { - if (max_lanes_to_store >= 1) { - p[static_cast(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v); - p[0] = GetLane(v); - } -} - -namespace detail { - -template -HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD v_trailing, D /*d*/, - TFromD* HWY_RESTRICT p, - size_t num_of_lanes_to_store) { - // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if - // (num_of_lanes_to_store & 3) != 0 is true - const auto v_full128 = ResizeBitCast(Full128>(), v_trailing); - if ((num_of_lanes_to_store & 2) != 0) { - const uint16_t u16_bits = GetLane(BitCast(Full128(), v_full128)); - p[num_of_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128); - CopyBytes(&u16_bits, - p + (num_of_lanes_to_store & ~size_t{3})); - } else { - p[num_of_lanes_to_store - 1] = GetLane(v_full128); - } -} - -template -HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD v_trailing, D /*d*/, - TFromD* HWY_RESTRICT p, - size_t num_of_lanes_to_store) { - // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16 - // vector if (num_of_lanes_to_store & 1) == 1 is true - p[num_of_lanes_to_store - 1] = GetLane(v_trailing); -} - -} // namespace detail - -template -HWY_API void StoreN(VFromD v, D d, TFromD* HWY_RESTRICT p, - size_t max_lanes_to_store) { - const size_t num_of_lanes_to_store = - HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); - - const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> - d_full; - const RebindToUnsigned du_full; - const Repartition di32_full; - - const auto i32_store_mask = BitCast( - di32_full, VecFromMask(du_full, FirstN(du_full, num_of_lanes_to_store))); - const auto vi32 = ResizeBitCast(di32_full, v); - -#if HWY_COMPILER_MSVC - // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore - HWY_FENCE; -#endif - - BlendedStore(vi32, MaskFromVec(i32_store_mask), di32_full, - reinterpret_cast(p)); - - constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD); - constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1; - const size_t trailing_n = (num_of_lanes_to_store & kTrailingLenMask); - - if (trailing_n != 0) { - const auto v_trailing = ResizeBitCast( - d, SlideDownLanes(di32_full, vi32, - num_of_lanes_to_store / kNumOfLanesPerI32)); - detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_of_lanes_to_store); - } - -#if HWY_COMPILER_MSVC - // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore - HWY_FENCE; -#endif -} -#endif // HWY_TARGET > HWY_AVX3 -#endif // HWY_TARGET <= HWY_AVX2 - -// ================================================== COMBINE - -// ------------------------------ Combine (InterleaveLower) - -// N = N/2 + N/2 (upper 
half undefined) -template >> -HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { - const Half dh; - const RebindToUnsigned duh; - // Treat half-width input as one lane, and expand to two lanes. - using VU = Vec128, 2>; - const VU lo{BitCast(duh, lo_half).raw}; - const VU hi{BitCast(duh, hi_half).raw}; - return BitCast(d, InterleaveLower(lo, hi)); -} - -// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) - -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { - const RebindToUnsigned du; - const Half duh; - return BitCast(d, VFromD{_mm_move_epi64(BitCast(duh, lo).raw)}); -} - -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { - const Half dh; - return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD{lo.raw}); -} - -// ------------------------------ Concat full (InterleaveLower) - -// hiH,hiL loH,loL |-> hiL,loL (= lower halves) -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - const Repartition d64; - return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); -} - -// hiH,hiL loH,loL |-> hiH,loH (= upper halves) -template -HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { - const Repartition d64; - return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); -} - -// hiH,hiL loH,loL |-> hiL,loH (= inner halves) -template -HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { - return CombineShiftRightBytes<8>(d, hi, lo); -} - -// hiH,hiL loH,loL |-> hiH,loL (= outer halves) -template -HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { - const Repartition dd; -#if HWY_TARGET >= HWY_SSSE3 - return BitCast( - d, Vec128{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw, - _MM_SHUFFLE2(1, 0))}); -#else - // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle. - return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, hi).raw, - BitCast(dd, lo).raw, 1)}); -#endif -} -template -HWY_API Vec128 ConcatUpperLower(D d, Vec128 hi, - Vec128 lo) { -#if HWY_TARGET >= HWY_SSSE3 - (void)d; - return Vec128{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))}; -#else - // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle. - const RepartitionToWide dd; - return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, hi).raw, - BitCast(dd, lo).raw, 1)}); -#endif -} -template -HWY_API Vec128 ConcatUpperLower(D /* tag */, Vec128 hi, - Vec128 lo) { -#if HWY_TARGET >= HWY_SSSE3 - return Vec128{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))}; -#else - // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 
- return Vec128{_mm_blend_pd(hi.raw, lo.raw, 1)}; -#endif -} - -// ------------------------------ Concat partial (Combine, LowerHalf) - -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatLowerUpper(D d, const VFromD hi, - const VFromD lo) { - const Half d2; - return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); -} - -template -HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { - const Half d2; - return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); -} - -// ------------------------------ ConcatOdd - -// 8-bit full -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const Repartition dw; - // Right-shift 8 bits per u16 so we can pack. - const Vec128 uH = ShiftRight<8>(BitCast(dw, hi)); - const Vec128 uL = ShiftRight<8>(BitCast(dw, lo)); - return VFromD{_mm_packus_epi16(uL.raw, uH.raw)}; -} - -// 8-bit x8 -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { -#if HWY_TARGET == HWY_SSE2 - const Repartition dw; - // Right-shift 8 bits per u16 so we can pack. - const Vec64 uH = ShiftRight<8>(BitCast(dw, hi)); - const Vec64 uL = ShiftRight<8>(BitCast(dw, lo)); - return VFromD{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), - _MM_SHUFFLE(2, 0, 2, 0))}; -#else - const Repartition du32; - // Don't care about upper half, no need to zero. - alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7}; - const VFromD shuf = BitCast(d, Load(Full64(), kCompactOddU8)); - const VFromD L = TableLookupBytes(lo, shuf); - const VFromD H = TableLookupBytes(hi, shuf); - return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); -#endif -} - -// 8-bit x4 -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { -#if HWY_TARGET == HWY_SSE2 - const Repartition dw; - const Twice dw_2; - // Right-shift 8 bits per u16 so we can pack. - const Vec32 uH = ShiftRight<8>(BitCast(dw, hi)); - const Vec32 uL = ShiftRight<8>(BitCast(dw, lo)); - const Vec64 uHL = Combine(dw_2, uH, uL); - return VFromD{_mm_packus_epi16(uHL.raw, uHL.raw)}; -#else - const Repartition du16; - // Don't care about upper half, no need to zero. - alignas(16) const uint8_t kCompactOddU8[4] = {1, 3}; - const VFromD shuf = BitCast(d, Load(Full32(), kCompactOddU8)); - const VFromD L = TableLookupBytes(lo, shuf); - const VFromD H = TableLookupBytes(hi, shuf); - return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); -#endif -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns - // 0xFFFF8000, which correctly saturates to 0x8000. - const Repartition dw; - const Vec128 uH = ShiftRight<16>(BitCast(dw, hi)); - const Vec128 uL = ShiftRight<16>(BitCast(dw, lo)); - return VFromD{_mm_packs_epi32(uL.raw, uH.raw)}; -} - -// 16-bit x4 -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { -#if HWY_TARGET == HWY_SSE2 - // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns - // 0xFFFF8000, which correctly saturates to 0x8000. 
- const Repartition dw; - const Vec64 uH = ShiftRight<16>(BitCast(dw, hi)); - const Vec64 uL = ShiftRight<16>(BitCast(dw, lo)); - return VFromD{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw), - _MM_SHUFFLE(2, 0, 2, 0))}; -#else - const Repartition du32; - // Don't care about upper half, no need to zero. - alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7}; - const VFromD shuf = BitCast(d, Load(Full64(), kCompactOddU16)); - const VFromD L = TableLookupBytes(lo, shuf); - const VFromD H = TableLookupBytes(hi, shuf); - return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); -#endif -} - -// 32-bit full -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToFloat df; - return BitCast( - d, Vec128{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, - _MM_SHUFFLE(3, 1, 3, 1))}); -} - -// Any type x2 -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - return InterleaveUpper(d, lo, hi); -} - -// ------------------------------ ConcatEven (InterleaveLower) - -// 8-bit full -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const Repartition dw; - // Isolate lower 8 bits per u16 so we can pack. - const Vec128 mask = Set(dw, 0x00FF); - const Vec128 uH = And(BitCast(dw, hi), mask); - const Vec128 uL = And(BitCast(dw, lo), mask); - return VFromD{_mm_packus_epi16(uL.raw, uH.raw)}; -} - -// 8-bit x8 -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { -#if HWY_TARGET == HWY_SSE2 - const Repartition dw; - // Isolate lower 8 bits per u16 so we can pack. - const Vec64 mask = Set(dw, 0x00FF); - const Vec64 uH = And(BitCast(dw, hi), mask); - const Vec64 uL = And(BitCast(dw, lo), mask); - return VFromD{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), - _MM_SHUFFLE(2, 0, 2, 0))}; -#else - const Repartition du32; - // Don't care about upper half, no need to zero. - alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6}; - const VFromD shuf = BitCast(d, Load(Full64(), kCompactEvenU8)); - const VFromD L = TableLookupBytes(lo, shuf); - const VFromD H = TableLookupBytes(hi, shuf); - return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); -#endif -} - -// 8-bit x4 -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { -#if HWY_TARGET == HWY_SSE2 - const Repartition dw; - const Twice dw_2; - // Isolate lower 8 bits per u16 so we can pack. - const Vec32 mask = Set(dw, 0x00FF); - const Vec32 uH = And(BitCast(dw, hi), mask); - const Vec32 uL = And(BitCast(dw, lo), mask); - const Vec64 uHL = Combine(dw_2, uH, uL); - return VFromD{_mm_packus_epi16(uHL.raw, uHL.raw)}; -#else - const Repartition du16; - // Don't care about upper half, no need to zero. - alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2}; - const VFromD shuf = BitCast(d, Load(Full32(), kCompactEvenU8)); - const VFromD L = TableLookupBytes(lo, shuf); - const VFromD H = TableLookupBytes(hi, shuf); - return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); -#endif -} - -// 16-bit full -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { -#if HWY_TARGET <= HWY_SSE4 - // Isolate lower 16 bits per u32 so we can pack. 
- const Repartition dw; - const Vec128 mask = Set(dw, 0x0000FFFF); - const Vec128 uH = And(BitCast(dw, hi), mask); - const Vec128 uL = And(BitCast(dw, lo), mask); - return VFromD{_mm_packus_epi32(uL.raw, uH.raw)}; -#elif HWY_TARGET == HWY_SSE2 - const Repartition dw; - return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), - BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); -#else - const RebindToUnsigned du; - // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two - // inputs, then concatenate them. - alignas(16) - const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; - const VFromD shuf = BitCast(d, Load(du, kCompactEvenU16)); - const VFromD L = TableLookupBytes(lo, shuf); - const VFromD H = TableLookupBytes(hi, shuf); - return ConcatLowerLower(d, H, L); -#endif -} - -// 16-bit x4 -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { -#if HWY_TARGET == HWY_SSE2 - const Repartition dw; - return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), - BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); -#else - const Repartition du32; - // Don't care about upper half, no need to zero. - alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5}; - const VFromD shuf = BitCast(d, Load(Full64(), kCompactEvenU16)); - const VFromD L = TableLookupBytes(lo, shuf); - const VFromD H = TableLookupBytes(hi, shuf); - return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); -#endif -} - -// 32-bit full -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToFloat df; - return BitCast( - d, Vec128{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, - _MM_SHUFFLE(2, 0, 2, 0))}); -} -template -HWY_API VFromD ConcatEven(D /* d */, VFromD hi, VFromD lo) { - return VFromD{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; -} - -// Any T x2 -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - return InterleaveLower(d, lo, hi); -} - -// ------------------------------ DupEven (InterleaveLower) - -template -HWY_API Vec128 DupEven(const Vec128 v) { - return v; -} - -template -HWY_API Vec128 DupEven(const Vec128 v) { - return InterleaveLower(DFromV(), v, v); -} - -template -HWY_API V DupEven(V v) { - const DFromV d; - -#if HWY_TARGET <= HWY_SSSE3 - const RebindToUnsigned du; - alignas(16) static constexpr uint8_t kShuffle[16] = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle))); -#else - const Repartition du16; - return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})), - BitCast(d, ShiftLeft<8>(BitCast(du16, v))), v); -#endif -} - -template -HWY_API Vec64 DupEven(const Vec64 v) { - const DFromV d; - const RebindToUnsigned du; // for float16_t - return BitCast(d, VFromD{ - _mm_shufflelo_epi16(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}); -} - -// Generic for all vector lengths. 
-template -HWY_API V DupEven(const V v) { - const DFromV d; - const RebindToUnsigned du; // for float16_t -#if HWY_TARGET <= HWY_SSSE3 - alignas(16) static constexpr uint16_t kShuffle[8] = { - 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c}; - return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle))); -#else - return BitCast( - d, VFromD{_mm_shufflehi_epi16( - _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)), - _MM_SHUFFLE(2, 2, 0, 0))}); -#endif -} - -template -HWY_API Vec128 DupEven(Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; -} - -HWY_API Vec128 DupEven(Vec128 v) { - return Vec128{_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; -} - -// ------------------------------ DupOdd (InterleaveUpper) - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return v; -} - -template -HWY_API V DupOdd(V v) { - const DFromV d; - -#if HWY_TARGET <= HWY_SSSE3 - const RebindToUnsigned du; - alignas(16) static constexpr uint8_t kShuffle[16] = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle))); -#else - const Repartition du16; - return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})), - BitCast(d, ShiftRight<8>(BitCast(du16, v))), v); -#endif -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - const DFromV d; - const RebindToUnsigned du; // for float16_t - return BitCast(d, VFromD{_mm_shufflelo_epi16( - BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))}); -} - -// Generic for all vector lengths. -template -HWY_API V DupOdd(V v) { - const DFromV d; - const RebindToUnsigned du; // for float16_t -#if HWY_TARGET <= HWY_SSSE3 - alignas(16) static constexpr uint16_t kShuffle[8] = { - 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e}; - return TableLookupBytes(v, BitCast(d, LoadDup128(du, kShuffle))); -#else - return BitCast( - d, VFromD{_mm_shufflehi_epi16( - _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)), - _MM_SHUFFLE(3, 3, 1, 1))}); -#endif -} - -template -HWY_API Vec128 DupOdd(Vec128 v) { - return Vec128{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; -} -template -HWY_API Vec128 DupOdd(Vec128 v) { - return Vec128{ - _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; -} - -template -HWY_API Vec128 DupOdd(const Vec128 v) { - return InterleaveUpper(DFromV(), v, v); -} - -// ------------------------------ TwoTablesLookupLanes (DupEven) - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { - const DFromV d; - const Twice dt; -// TableLookupLanes currently requires table and index vectors to be the same -// size, though a half-length index vector would be sufficient here. -#if HWY_IS_MSAN - const Vec128 idx_vec{idx.raw}; - const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; -#else - // We only keep LowerHalf of the result, which is valid in idx. 
- const Indices128 idx2{idx.raw}; -#endif - return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec128{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)}; -#else // AVX3 or below - const DFromV d; - const Vec128 idx_vec{idx.raw}; - -#if HWY_TARGET <= HWY_SSE4 - const Repartition du16; - const auto sel_hi_mask = - MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); -#else - const RebindToSigned di; - const auto sel_hi_mask = - RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15})); -#endif - - const auto lo_lookup_result = TableLookupBytes(a, idx_vec); -#if HWY_TARGET <= HWY_AVX3 - const Vec128 lookup_result{_mm_mask_shuffle_epi8( - lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; - return lookup_result; -#else - const auto hi_lookup_result = TableLookupBytes(b, idx_vec); - return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); -#endif // HWY_TARGET <= HWY_AVX3 -#endif // HWY_TARGET <= HWY_AVX3_DL -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)}; -#elif HWY_TARGET == HWY_SSE2 - const DFromV d; - const RebindToSigned di; - const Vec128 idx_vec{idx.raw}; - const auto sel_hi_mask = - RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7})); - const auto lo_lookup_result = TableLookupLanes(a, idx); - const auto hi_lookup_result = TableLookupLanes(b, idx); - return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); -#else - const DFromV d; - const Repartition du8; - return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), - Indices128{idx.raw})); -#endif -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)}; -#else // AVX2 or below - const DFromV d; - -#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 - const Vec128 idx_vec{idx.raw}; - -#if HWY_TARGET <= HWY_AVX2 - const RebindToFloat d_sel; - const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec))); -#else - const RebindToSigned d_sel; - const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3}); -#endif - - const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx)); - const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx)); - return BitCast(d, - IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); -#else // SSSE3 or SSE4 - const Repartition du8; - return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), - Indices128{idx.raw})); -#endif // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 -#endif // HWY_TARGET <= HWY_AVX3 -} - -#if HWY_HAVE_FLOAT16 -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, - Vec128 b, - Indices128 idx) { - return Vec128{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)}; -#elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 - const DFromV d; - -#if HWY_TARGET <= HWY_AVX2 - const auto sel_hi_mask = - MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128{idx.raw}))); -#else - const RebindToSigned di; - const auto sel_hi_mask = - RebindMask(d, Vec128{idx.raw} > Set(di, int32_t{3})); -#endif - - const 
auto lo_lookup_result = TableLookupLanes(a, idx); - const auto hi_lookup_result = TableLookupLanes(b, idx); - return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); -#else // SSSE3 or SSE4 - const DFromV d; - const Repartition du8; - return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), - Indices128{idx.raw})); -#endif -} - -template -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const Vec128 idx_vec{idx.raw}; - const Indices128 idx_mod{And(idx_vec, Set(d, T{1})).raw}; - -#if HWY_TARGET <= HWY_SSE4 - const RebindToFloat d_sel; - const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec))); -#else // SSE2 or SSSE3 - const Repartition di32; - const RebindToSigned d_sel; - const auto sel_hi_mask = MaskFromVec( - BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > - Set(di32, int32_t{1})))); -#endif // HWY_TARGET <= HWY_SSE4 - - const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod)); - const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod)); - return BitCast(d, - IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); -#endif // HWY_TARGET <= HWY_AVX3 -} - -HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, - Indices128 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const RebindToSigned di; - const Vec128 idx_vec{idx.raw}; - const Indices128 idx_mod{And(idx_vec, Set(di, int64_t{1})).raw}; - -#if HWY_TARGET <= HWY_SSE4 - const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec))); -#else // SSE2 or SSSE3 - const Repartition di32; - const auto sel_hi_mask = - MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > - Set(di32, int32_t{1})))); -#endif // HWY_TARGET <= HWY_SSE4 - - const auto lo_lookup_result = TableLookupLanes(a, idx_mod); - const auto hi_lookup_result = TableLookupLanes(b, idx_mod); - return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); -#endif // HWY_TARGET <= HWY_AVX3 -} - -// ------------------------------ OddEven (IfThenElse) - -template -HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { - const DFromV d; - const Repartition d8; - alignas(16) static constexpr uint8_t mask[16] = { - 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; - return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); -} - -template -HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - const DFromV d; - const Repartition d8; - alignas(16) static constexpr uint8_t mask[16] = { - 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; - return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); -#else - return Vec128{_mm_blend_epi16(a.raw, b.raw, 0x55)}; -#endif -} - -template -HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1)); - const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0)); - return Vec128{_mm_unpacklo_epi32(even, odd)}; -#else - // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle. 
- const DFromV d; - const RebindToFloat df; - return BitCast(d, Vec128{_mm_blend_ps(BitCast(df, a).raw, - BitCast(df, b).raw, 5)}); -#endif -} - -template -HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { - // Same as ConcatUpperLower for full vectors; do not call that because this - // is more efficient for 64x1 vectors. - const DFromV d; - const RebindToFloat dd; -#if HWY_TARGET >= HWY_SSSE3 - return BitCast( - d, Vec128{_mm_shuffle_pd( - BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))}); -#else - // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. - return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, a).raw, - BitCast(dd, b).raw, 1)}); -#endif -} - -template -HWY_API Vec128 OddEven(Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - // SHUFPS must fill the lower half of the output from one input, so we - // need another shuffle. Unpack avoids another immediate byte. - const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1)); - const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0)); - return Vec128{_mm_unpacklo_ps(even, odd)}; -#else - return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; -#endif -} - -// ------------------------------ OddEvenBlocks -template -HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { - return even; -} - -// ------------------------------ SwapAdjacentBlocks - -template -HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { - return v; -} - -// ------------------------------ Shl (ZipLower, Mul) - -// Use AVX2/3 variable shifts where available, otherwise multiply by powers of -// two from loading float exponents, which is considerably faster (according -// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v. - -namespace detail { -#if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly -template -HWY_API V AVX2ShlU16Vec128(V v, V bits) { - const DFromV d; - const Rebind du32; - return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); -} -#elif HWY_TARGET > HWY_AVX2 -// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. -template -HWY_INLINE Vec128> Pow2(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - const RepartitionToWide dw; - const Rebind df; - const auto zero = Zero(d); - // Move into exponent (this u16 will become the upper half of an f32) - const auto exp = ShiftLeft<23 - 16>(v); - const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f - // Insert 0 into lower halves for reinterpreting as binary32. - const auto f0 = ZipLower(dw, zero, upper); - const auto f1 = ZipUpper(dw, zero, upper); - // See cvtps comment below. - const VFromD bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)}; - const VFromD bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)}; -#if HWY_TARGET <= HWY_SSE4 - return VFromD{_mm_packus_epi32(bits0.raw, bits1.raw)}; -#else - return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0)); -#endif -} - -template -HWY_INLINE Vec128, N> Pow2(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - const Twice dt_u; - const RepartitionToWide dt_w; - const RebindToFloat dt_f; - // Move into exponent (this u16 will become the upper half of an f32) - const auto exp = ShiftLeft<23 - 16>(v); - const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f - // Insert 0 into lower halves for reinterpreting as binary32. - const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper)); - // See cvtps comment below. 
- const VFromD bits0{_mm_cvtps_epi32(BitCast(dt_f, f0).raw)}; -#if HWY_TARGET <= HWY_SSE4 - return VFromD{_mm_packus_epi32(bits0.raw, bits0.raw)}; -#elif HWY_TARGET == HWY_SSSE3 - alignas(16) - const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; - return TableLookupBytes(bits0, Load(du, kCompactEvenU16)); -#else - const RebindToSigned dt_i32; - const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0))); - return VFromD{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)}; -#endif -} - -// Same, for 32-bit shifts. -template -HWY_INLINE Vec128, N> Pow2(const Vec128 v) { - const DFromV d; - const auto exp = ShiftLeft<23>(v); - const auto f = exp + Set(d, 0x3F800000); // 1.0f - // Do not use ConvertTo because we rely on the native 0x80..00 overflow - // behavior. - return Vec128, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))}; -} - -#endif // HWY_TARGET > HWY_AVX2 - -template -HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, - Vec128 bits) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_sllv_epi16(v.raw, bits.raw)}; -#elif HWY_TARGET == HWY_AVX2 - return AVX2ShlU16Vec128(v, bits); -#else - return v * Pow2(bits); -#endif -} - -#if HWY_TARGET > HWY_AVX3 -HWY_API Vec16 Shl(hwy::UnsignedTag /*tag*/, Vec16 v, - Vec16 bits) { -#if HWY_TARGET <= HWY_SSE4 - const Vec16 bits16{_mm_cvtepu16_epi64(bits.raw)}; -#else - const auto bits16 = And(bits, Vec16{_mm_set_epi64x(0, 0xFFFF)}); -#endif - return Vec16{_mm_sll_epi16(v.raw, bits16.raw)}; -} -#endif - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { - const DFromV d; - const Rebind du16; - return TruncateTo(d, PromoteTo(du16, v) << PromoteTo(du16, bits)); -} -#elif HWY_TARGET <= HWY_AVX2 -template -HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { - const DFromV d; - const Rebind du32; - return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); -} -template -HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { - const DFromV d; - const Half dh; - const Rebind du16; - const Rebind dh_u32; - - const VFromD lo_shl_result = - PromoteTo(dh_u32, LowerHalf(dh, v)) - << PromoteTo(dh_u32, LowerHalf(dh, bits)); - const VFromD hi_shl_result = - PromoteTo(dh_u32, UpperHalf(dh, v)) - << PromoteTo(dh_u32, UpperHalf(dh, bits)); - const VFromD u16_shl_result = ConcatEven( - du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result)); - return TruncateTo(d, u16_shl_result); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// 8-bit: may use the Shl overload for uint16_t. 
-template -HWY_API Vec128 Shl(hwy::UnsignedTag tag, Vec128 v, - Vec128 bits) { - const DFromV d; -#if HWY_TARGET <= HWY_AVX3_DL - (void)tag; - // kMask[i] = 0xFF >> i - alignas(16) static constexpr uint8_t kMasks[16] = { - 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; - // kShl[i] = 1 << i - alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10, - 0x20, 0x40, 0x80, 0x00}; - v = And(v, TableLookupBytes(Load(Full64(), kMasks), bits)); - const VFromD mul = - TableLookupBytes(Load(Full64(), kShl), bits); - return VFromD{_mm_gf2p8mul_epi8(v.raw, mul.raw)}; -#elif HWY_TARGET <= HWY_AVX2 - (void)tag; - (void)d; - return AVX2ShlU8Vec128(v, bits); -#else - const Repartition dw; - using VW = VFromD; - const VW even_mask = Set(dw, 0x00FF); - const VW odd_mask = Set(dw, 0xFF00); - const VW vw = BitCast(dw, v); - const VW bits16 = BitCast(dw, bits); - // Shift even lanes in-place - const VW evens = Shl(tag, vw, And(bits16, even_mask)); - const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16)); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -#endif -} -HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, - Vec128 bits) { -#if HWY_TARGET <= HWY_SSE4 - const Vec16 bits8{_mm_cvtepu8_epi64(bits.raw)}; -#else - const Vec16 bits8 = - And(Vec16{bits.raw}, Vec16{_mm_set_epi64x(0, 0xFF)}); -#endif - return Vec128{_mm_sll_epi16(v.raw, bits8.raw)}; -} - -template -HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, - Vec128 bits) { -#if HWY_TARGET >= HWY_SSE4 - return v * Pow2(bits); -#else - return Vec128{_mm_sllv_epi32(v.raw, bits.raw)}; -#endif -} - -#if HWY_TARGET >= HWY_SSE4 -HWY_API Vec32 Shl(hwy::UnsignedTag /*tag*/, Vec32 v, - const Vec32 bits) { -#if HWY_TARGET == HWY_SSE4 - const Vec32 bits32{_mm_cvtepu32_epi64(bits.raw)}; -#else - const auto bits32 = - Combine(Full64(), Zero(Full32()), bits); -#endif - return Vec32{_mm_sll_epi32(v.raw, bits32.raw)}; -} -#endif - -HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, - Vec128 bits) { -#if HWY_TARGET >= HWY_SSE4 - const DFromV d; - // Individual shifts and combine - const Vec128 out0{_mm_sll_epi64(v.raw, bits.raw)}; - const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); - const Vec128 out1{_mm_sll_epi64(v.raw, bits1)}; - return ConcatUpperLower(d, out1, out0); -#else - return Vec128{_mm_sllv_epi64(v.raw, bits.raw)}; -#endif -} -HWY_API Vec64 Shl(hwy::UnsignedTag /*tag*/, Vec64 v, - Vec64 bits) { - return Vec64{_mm_sll_epi64(v.raw, bits.raw)}; -} - -// Signed left shift is the same as unsigned. -template -HWY_API Vec128 Shl(hwy::SignedTag /*tag*/, Vec128 v, - Vec128 bits) { - const DFromV di; - const RebindToUnsigned du; - return BitCast(di, - Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); -} - -} // namespace detail - -template -HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { - return detail::Shl(hwy::TypeTag(), v, bits); -} - -// ------------------------------ Shr (mul, mask, BroadcastSignBit) - -// Use AVX2+ variable shifts except for SSSE3/SSE4. There, we use -// widening multiplication by powers of two obtained by loading float exponents, -// followed by a constant right-shift. This is still faster than a scalar or -// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v. 
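// [Editor's sketch, not part of Highway] The section comment above relies on
// building 2^v directly from float bit patterns: writing (v + 127) into the
// binary32 exponent field yields a float equal to 1 << v, so one float->int
// round trip produces the per-lane multiplier on targets without variable
// shift instructions (sllv). A scalar illustration of the same trick; the
// function name is illustrative only:
#include <cstdint>
#include <cstring>
static inline uint32_t Pow2ViaFloatExponent(uint32_t v) {  // assumes v <= 31
  const uint32_t f32_bits = (v << 23) + 0x3F800000u;  // exponent of 1.0f plus v
  float f;
  std::memcpy(&f, &f32_bits, sizeof(f));               // f == 2^v exactly
  return static_cast<uint32_t>(f);                      // == 1u << v
}
// A left shift is then a lane-wise multiply by this value; the 16-bit Shr
// below instead uses MulHigh with 2^(16 - bits).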
- -#if HWY_TARGET <= HWY_AVX2 -namespace detail { - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { - const DFromV d; - const Rebind du16; - const RebindToSigned di16; - return DemoteTo(d, - BitCast(di16, PromoteTo(du16, v) >> PromoteTo(du16, bits))); -} -#else // AVX2 -template -HWY_INLINE V AVX2ShrU16Vec128(V v, V bits) { - const DFromV d; - const Rebind du32; - const RebindToSigned di32; - return DemoteTo(d, - BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); -} -template -HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { - const DFromV d; - const Rebind du32; - const RebindToSigned di32; - return DemoteTo(d, - BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); -} -template -HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { - const DFromV d; - const Half dh; - const Rebind di16; - const Rebind du16; - const Rebind dh_i32; - const Rebind dh_u32; - - const auto lo_shr_result = - BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >> - PromoteTo(dh_u32, LowerHalf(dh, bits))); - const auto hi_shr_result = - BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >> - PromoteTo(dh_u32, UpperHalf(dh, bits))); - const auto i16_shr_result = - BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result)); - return DemoteTo(d, i16_shr_result); -} -#endif // HWY_TARGET <= HWY_AVX3 - -} // namespace detail -#endif // HWY_TARGET <= HWY_AVX2 - -template -HWY_API Vec128 operator>>(Vec128 in, - const Vec128 bits) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_srlv_epi16(in.raw, bits.raw)}; -#elif HWY_TARGET <= HWY_AVX2 - return detail::AVX2ShrU16Vec128(in, bits); -#else - const DFromV d; - // For bits=0, we cannot mul by 2^16, so fix the result later. - const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits)); - // Replace output with input where bits == 0. - return IfThenElse(bits == Zero(d), in, out); -#endif -} - -#if HWY_TARGET > HWY_AVX3 -HWY_API Vec16 operator>>(const Vec16 in, - const Vec16 bits) { -#if HWY_TARGET <= HWY_SSE4 - const Vec16 bits16{_mm_cvtepu16_epi64(bits.raw)}; -#else - const auto bits16 = And(bits, Vec16{_mm_set_epi64x(0, 0xFFFF)}); -#endif - return Vec16{_mm_srl_epi16(in.raw, bits16.raw)}; -} -#endif - -// 8-bit uses 16-bit shifts. -template -HWY_API Vec128 operator>>(Vec128 in, - const Vec128 bits) { -#if HWY_TARGET <= HWY_AVX2 - return detail::AVX2ShrU8Vec128(in, bits); -#else - const DFromV d; - const Repartition dw; - using VW = VFromD; - const VW mask = Set(dw, 0x00FF); - const VW vw = BitCast(dw, in); - const VW bits16 = BitCast(dw, bits); - const VW evens = And(vw, mask) >> And(bits16, mask); - // Shift odd lanes in-place - const VW odds = vw >> ShiftRight<8>(bits16); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -#endif -} -HWY_API Vec128 operator>>(const Vec128 in, - const Vec128 bits) { -#if HWY_TARGET <= HWY_SSE4 - const Vec16 in8{_mm_cvtepu8_epi16(in.raw)}; - const Vec16 bits8{_mm_cvtepu8_epi64(bits.raw)}; -#else - const Vec16 mask{_mm_set_epi64x(0, 0xFF)}; - const Vec16 in8 = And(Vec16{in.raw}, mask); - const Vec16 bits8 = And(Vec16{bits.raw}, mask); -#endif - return Vec128{_mm_srl_epi16(in8.raw, bits8.raw)}; -} - -template -HWY_API Vec128 operator>>(const Vec128 in, - const Vec128 bits) { -#if HWY_TARGET >= HWY_SSE4 - // 32x32 -> 64 bit mul, then shift right by 32. - const DFromV d32; - // Move odd lanes into position for the second mul. Shuffle more gracefully - // handles N=1 than repartitioning to u64 and shifting 32 bits right. 
- const Vec128 in31{_mm_shuffle_epi32(in.raw, 0x31)}; - // For bits=0, we cannot mul by 2^32, so fix the result later. - const auto mul = detail::Pow2(Set(d32, 32) - bits); - const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0 - const Vec128 mul31{_mm_shuffle_epi32(mul.raw, 0x31)}; - // No need to shift right, already in the correct position. - const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ? - const Vec128 out = OddEven(out31, BitCast(d32, out20)); - // Replace output with input where bits == 0. - return IfThenElse(bits == Zero(d32), in, out); -#else - return Vec128{_mm_srlv_epi32(in.raw, bits.raw)}; -#endif -} - -#if HWY_TARGET >= HWY_SSE4 -HWY_API Vec128 operator>>(const Vec128 in, - const Vec128 bits) { -#if HWY_TARGET == HWY_SSE4 - const Vec32 bits32{_mm_cvtepu32_epi64(bits.raw)}; -#else - const auto bits32 = - Combine(Full64(), Zero(Full32()), bits); -#endif - return Vec128{_mm_srl_epi32(in.raw, bits32.raw)}; -} -#endif - -HWY_API Vec128 operator>>(const Vec128 v, - const Vec128 bits) { -#if HWY_TARGET >= HWY_SSE4 - const DFromV d; - // Individual shifts and combine - const Vec128 out0{_mm_srl_epi64(v.raw, bits.raw)}; - const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); - const Vec128 out1{_mm_srl_epi64(v.raw, bits1)}; - return ConcatUpperLower(d, out1, out0); -#else - return Vec128{_mm_srlv_epi64(v.raw, bits.raw)}; -#endif -} -HWY_API Vec64 operator>>(const Vec64 v, - const Vec64 bits) { - return Vec64{_mm_srl_epi64(v.raw, bits.raw)}; -} - -namespace detail { - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { - const DFromV d; - const Rebind di16; - return DemoteTo(d, PromoteTo(di16, v) >> PromoteTo(di16, bits)); -} -#elif HWY_TARGET <= HWY_AVX2 // AVX2 -template -HWY_INLINE V AVX2ShrI16Vec128(V v, V bits) { - const DFromV d; - const Rebind di32; - return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); -} -template -HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { - const DFromV d; - const Rebind di32; - return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); -} -template -HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { - const DFromV d; - const Half dh; - const Rebind di16; - const Rebind dh_i32; - - const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >> - PromoteTo(dh_i32, LowerHalf(dh, bits)); - const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >> - PromoteTo(dh_i32, UpperHalf(dh, bits)); - const auto i16_shr_result = - OrderedDemote2To(di16, lo_shr_result, hi_shr_result); - return DemoteTo(d, i16_shr_result); -} -#endif - -#if HWY_TARGET > HWY_AVX3 -// Also used in x86_256-inl.h. -template -HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) { - const RebindToUnsigned du; - const auto count = BitCast(du, count_i); // same type as value to shift - // Clear sign and restore afterwards. This is preferable to shifting the MSB - // downwards because Shr is somewhat more expensive than Shl. 
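// [Editor's note] The "clear sign and restore" step below uses the identity
//   ArithmeticShiftRight(v, n) == (((v ^ s) >> n) logical) ^ s,
// where s = BroadcastSignBit(v) (all-ones for negative v, zero otherwise).
// Worked example for int8: v = -6 = 0xFA, n = 1, s = 0xFF:
//   (0xFA ^ 0xFF) = 0x05; 0x05 >> 1 = 0x02; 0x02 ^ 0xFF = 0xFD = -3,
// which matches -6 >> 1. For negative v, v ^ s is ~v = |v| - 1, hence the
// "off by one" remark below; the final Xor undoes it exactly.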
- const auto sign = BroadcastSignBit(v); - const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below - return BitCast(di, abs >> count) ^ sign; -} -#endif - -} // namespace detail - -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_srav_epi16(v.raw, bits.raw)}; -#elif HWY_TARGET <= HWY_AVX2 - return detail::AVX2ShrI16Vec128(v, bits); -#else - const DFromV d; - return detail::SignedShr(d, v, bits); -#endif -} - -#if HWY_TARGET > HWY_AVX3 -HWY_API Vec16 operator>>(Vec16 v, Vec16 bits) { -#if HWY_TARGET <= HWY_SSE4 - const Vec16 bits16{_mm_cvtepu16_epi64(bits.raw)}; -#else - const auto bits16 = And(bits, Vec16{_mm_set_epi64x(0, 0xFFFF)}); -#endif - return Vec16{_mm_sra_epi16(v.raw, bits16.raw)}; -} -#endif - -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { -#if HWY_TARGET <= HWY_AVX2 - return detail::AVX2ShrI8Vec128(v, bits); -#else - const DFromV d; - return detail::SignedShr(d, v, bits); -#endif -} -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { -#if HWY_TARGET <= HWY_SSE4 - const Vec16 vi16{_mm_cvtepi8_epi16(v.raw)}; - const Vec16 bits8{_mm_cvtepu8_epi64(bits.raw)}; -#else - const DFromV d; - const Rebind di16; - const Twice dt; - - const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v))); - const Vec16 bits8 = - And(Vec16{bits.raw}, Vec16{_mm_set_epi64x(0, 0xFF)}); -#endif - return Vec128{_mm_sra_epi16(vi16.raw, bits8.raw)}; -} - -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { -#if HWY_TARGET <= HWY_AVX2 - return Vec128{_mm_srav_epi32(v.raw, bits.raw)}; -#else - const DFromV d; - return detail::SignedShr(d, v, bits); -#endif -} - -#if HWY_TARGET > HWY_AVX2 -HWY_API Vec32 operator>>(Vec32 v, Vec32 bits) { -#if HWY_TARGET == HWY_SSE4 - const Vec32 bits32{_mm_cvtepu32_epi64(bits.raw)}; -#else - const auto bits32 = Combine(Full64(), Zero(Full32()), bits); -#endif - return Vec32{_mm_sra_epi32(v.raw, bits32.raw)}; -} -#endif - -template -HWY_API Vec128 operator>>(Vec128 v, - Vec128 bits) { -#if HWY_TARGET <= HWY_AVX3 - return Vec128{_mm_srav_epi64(v.raw, bits.raw)}; -#else - const DFromV d; - return detail::SignedShr(d, v, bits); -#endif -} - -// ------------------------------ MulEven/Odd 64x64 (UpperHalf) - -HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) { - const DFromV d; - alignas(16) uint64_t mul[2]; - mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); - return Load(d, mul); -} - -HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) { - const DFromV d; - const Half d2; - alignas(16) uint64_t mul[2]; - const uint64_t a1 = GetLane(UpperHalf(d2, a)); - const uint64_t b1 = GetLane(UpperHalf(d2, b)); - mul[0] = Mul128(a1, b1, &mul[1]); - return Load(d, mul); -} - -// ------------------------------ WidenMulPairwiseAdd - -// Generic for all vector lengths. -template >> -HWY_API VFromD WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { - // TODO(janwas): _mm_dpbf16_ps when available - const RebindToUnsigned du32; - // Lane order within sum0/1 is undefined, hence we can avoid the - // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip - // leads to the odd/even order that RearrangeToOddPlusEven prefers. 
- using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), - Mul(BitCast(df32, ao), BitCast(df32, bo))); -} - -// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. -template >> -HWY_API VFromD WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { - return VFromD{_mm_madd_epi16(a.raw, b.raw)}; -} - -// Generic for all vector lengths. -template >> -HWY_API VFromD WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { - const auto p_lo = a * b; - const auto p_hi = MulHigh(a, b); - - const auto p_hi1_lo0 = BitCast(du32, OddEven(p_hi, p_lo)); - const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)), - ShiftRight<16>(BitCast(du32, p_lo))); - return Add(BitCast(du32, p_hi1_lo0), BitCast(du32, p_hi0_lo1)); -} - -// ------------------------------ SatWidenMulPairwiseAdd - -#if HWY_TARGET <= HWY_SSSE3 - -#ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#else -#define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD -#endif - -// Even if N=1, the input is always at least 2 lanes, hence _mm_maddubs_epi16 -// is safe. -template -HWY_API VFromD SatWidenMulPairwiseAdd( - DI16 /* tag */, VFromD> a, - VFromD> b) { - return VFromD{_mm_maddubs_epi16(a.raw, b.raw)}; -} - -#endif - -// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ShiftLeft) - -// Generic for all vector lengths. -template >> -HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, - const VFromD sum0, - VFromD& sum1) { - // TODO(janwas): _mm_dpbf16_ps when available - const RebindToUnsigned du32; - // Lane order within sum0/1 is undefined, hence we can avoid the - // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip - // leads to the odd/even order that RearrangeToOddPlusEven prefers. - using VU32 = VFromD; - const VU32 odd = Set(du32, 0xFFFF0000u); - const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); - const VU32 ao = And(BitCast(du32, a), odd); - const VU32 be = ShiftLeft<16>(BitCast(du32, b)); - const VU32 bo = And(BitCast(du32, b), odd); - sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); - return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); -} - -// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. 
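// [Editor's sketch, not part of Highway] The bf16 paths above work because a
// bfloat16 is exactly the upper 16 bits of the corresponding binary32, so
// placing its bits in the top half of a zeroed 32-bit lane reproduces the
// float value (with the low mantissa bits zero). Scalar illustration; the
// function name is illustrative only:
#include <cstdint>
#include <cstring>
static inline float BF16BitsToF32(uint16_t bf16_bits) {
  const uint32_t f32_bits = static_cast<uint32_t>(bf16_bits) << 16;
  float f;
  std::memcpy(&f, &f32_bits, sizeof(f));  // e.g. 0x3FC0 -> 0x3FC00000 == 1.5f
  return f;
}
// This is why the bf16 WidenMulPairwiseAdd/ReorderWidenMulAccumulate above can
// use ShiftLeft<16> and an And with 0xFFFF0000 instead of a lane-crossing
// PromoteTo.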
-template >> -HWY_API VFromD ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, - const VFromD sum0, - VFromD& /*sum1*/) { - (void)d; -#if HWY_TARGET <= HWY_AVX3_DL - return VFromD{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; -#else - return sum0 + WidenMulPairwiseAdd(d, a, b); -#endif -} - -template >> -HWY_API VFromD ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b, - const VFromD sum0, - VFromD& /*sum1*/) { - (void)d; - return sum0 + WidenMulPairwiseAdd(d, a, b); -} - -// ------------------------------ RearrangeToOddPlusEven -template -HWY_API Vec128 RearrangeToOddPlusEven(const Vec128 sum0, - Vec128 /*sum1*/) { - return sum0; // invariant already holds -} - -template -HWY_API Vec128 RearrangeToOddPlusEven( - const Vec128 sum0, Vec128 /*sum1*/) { - return sum0; // invariant already holds -} - -template -HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { - return Add(sum0, sum1); -} - -// ------------------------------ SumOfMulQuadAccumulate -#if HWY_TARGET <= HWY_AVX3_DL - -#ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE -#endif - -template -HWY_API VFromD SumOfMulQuadAccumulate( - DI32 /*di32*/, VFromD> a_u, - VFromD> b_i, VFromD sum) { - return VFromD{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; -} - -#ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE -#endif -template -HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, - VFromD> a, - VFromD> b, - VFromD sum) { - const Repartition du8; - - const auto a_u = BitCast(du8, a); - const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum); - const auto result_sum_1 = ShiftLeft<8>( - SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32))); - return result_sum_0 - result_sum_1; -} - -#ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#else -#define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE -#endif -template -HWY_API VFromD SumOfMulQuadAccumulate( - DU32 du32, VFromD> a, - VFromD> b, VFromD sum) { - const Repartition du8; - const RebindToSigned di8; - const RebindToSigned di32; - - const auto b_i = BitCast(di8, b); - const auto result_sum_0 = - SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum)); - const auto result_sum_1 = ShiftLeft<8>( - SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32))); - - return BitCast(du32, result_sum_0 - result_sum_1); -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -// ================================================== CONVERT - -// ------------------------------ Promotions (part w/ narrow lanes -> full) - -// Unsigned: zero-extend. 
-template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - const __m128i zero = _mm_setzero_si128(); - return VFromD{_mm_unpacklo_epi8(v.raw, zero)}; -#else - return VFromD{_mm_cvtepu8_epi16(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - return VFromD{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())}; -#else - return VFromD{_mm_cvtepu16_epi32(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - return VFromD{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())}; -#else - return VFromD{_mm_cvtepu32_epi64(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - const __m128i zero = _mm_setzero_si128(); - const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero); - return VFromD{_mm_unpacklo_epi16(u16, zero)}; -#else - return VFromD{_mm_cvtepu8_epi32(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { -#if HWY_TARGET > HWY_SSSE3 - const Rebind du32; - return PromoteTo(d, PromoteTo(du32, v)); -#elif HWY_TARGET == HWY_SSSE3 - alignas(16) static constexpr int8_t kShuffle[16] = { - 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}; - const Repartition di8; - return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); -#else - (void)d; - return VFromD{_mm_cvtepu8_epi64(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { -#if HWY_TARGET > HWY_SSSE3 - const Rebind du32; - return PromoteTo(d, PromoteTo(du32, v)); -#elif HWY_TARGET == HWY_SSSE3 - alignas(16) static constexpr int8_t kShuffle[16] = { - 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1}; - const Repartition di8; - return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); -#else - (void)d; - return VFromD{_mm_cvtepu16_epi64(v.raw)}; -#endif -} - -// Unsigned to signed: same plus cast. -template ), sizeof(TFromV)), - HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))> -HWY_API VFromD PromoteTo(D di, V v) { - const RebindToUnsigned du; - return BitCast(di, PromoteTo(du, v)); -} - -// Signed: replicate sign bit. 
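// [Editor's note] On SSE2/SSSE3 the signed promotions below sign-extend by
// interleaving a lane with itself and then applying an arithmetic right shift.
// Example for i8 -> i16: v = 0xFE (-2); _mm_unpacklo_epi8(v, v) forms the
// 16-bit lane 0xFEFE; an arithmetic ShiftRight<8> then yields 0xFFFE, i.e. -2.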
-template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - return ShiftRight<8>(VFromD{_mm_unpacklo_epi8(v.raw, v.raw)}); -#else - return VFromD{_mm_cvtepi8_epi16(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - return ShiftRight<16>(VFromD{_mm_unpacklo_epi16(v.raw, v.raw)}); -#else - return VFromD{_mm_cvtepi16_epi32(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - return ShiftRight<32>(VFromD{_mm_unpacklo_epi32(v.raw, v.raw)}); -#else - return VFromD{_mm_cvtepi32_epi64(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw); - const __m128i x4 = _mm_unpacklo_epi16(x2, x2); - return ShiftRight<24>(VFromD{x4}); -#else - return VFromD{_mm_cvtepi8_epi32(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - const Repartition di32; - const Half dh_i32; - const VFromD x4{PromoteTo(dh_i32, v).raw}; - const VFromD s4{ - _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))}; - return ZipLower(d, x4, s4); -#else - (void)d; - return VFromD{_mm_cvtepi8_epi64(v.raw)}; -#endif -} -template -HWY_API VFromD PromoteTo(D d, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - const Repartition di32; - const Half dh_i32; - const VFromD x2{PromoteTo(dh_i32, v).raw}; - const VFromD s2{ - _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))}; - return ZipLower(d, x2, s2); -#else - (void)d; - return VFromD{_mm_cvtepi16_epi64(v.raw)}; -#endif -} - -#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) - -// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. -#ifdef HWY_NATIVE_F16C -#undef HWY_NATIVE_F16C -#else -#define HWY_NATIVE_F16C -#endif - -// Workaround for origin tracking bug in Clang msan prior to 11.0 -// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid") -#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100) -#define HWY_INLINE_F16 HWY_NOINLINE -#else -#define HWY_INLINE_F16 HWY_INLINE -#endif -template -HWY_INLINE_F16 VFromD PromoteTo(D /*tag*/, VFromD> v) { - return VFromD{_mm_cvtph_ps(v.raw)}; -} - -#endif // HWY_NATIVE_F16C - -template -HWY_API VFromD PromoteTo(D df32, VFromD> v) { - const Rebind du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtps_pd(v.raw)}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtepi32_pd(v.raw)}; -} - -// ------------------------------ Demotions (full -> part w/ narrow lanes) - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_packs_epi32(v.raw, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { -#if HWY_TARGET >= HWY_SSSE3 - const Rebind di32; - const auto zero_if_neg = AndNot(ShiftRight<31>(v), v); - const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF))); - const auto clamped = Or(zero_if_neg, too_big); -#if HWY_TARGET == HWY_SSE2 - const Rebind du16; - const RebindToSigned di16; - return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); -#else - const Repartition du16; - // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. 
- alignas(16) static constexpr uint16_t kLower2Bytes[16] = { - 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; - const auto lo2 = Load(du16, kLower2Bytes); - return VFromD{TableLookupBytes(BitCast(du16, clamped), lo2).raw}; -#endif -#else - return VFromD{_mm_packus_epi32(v.raw, v.raw)}; -#endif -} - -template -HWY_API VFromD DemoteTo(D du16, VFromD> v) { - const DFromV du32; - const RebindToSigned di32; -#if HWY_TARGET >= HWY_SSSE3 - const auto too_big = - VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32))); - const auto clamped = Or(BitCast(di32, v), too_big); -#if HWY_TARGET == HWY_SSE2 - const RebindToSigned di16; - return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); -#else - (void)du16; - const Repartition du16_full; - // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. - alignas(16) static constexpr uint16_t kLower2Bytes[16] = { - 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; - const auto lo2 = Load(du16_full, kLower2Bytes); - return VFromD{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw}; -#endif -#else - return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); -#endif -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); - return VFromD{_mm_packus_epi16(i16, i16)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_packus_epi16(v.raw, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); - return VFromD{_mm_packs_epi16(i16, i16)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_packs_epi16(v.raw, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D du8, VFromD> v) { -#if HWY_TARGET <= HWY_AVX3 - // NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned - // integers to 8-bit unsigned integers - (void)du8; - return VFromD{_mm_cvtusepi32_epi8(v.raw)}; -#else - const DFromV du32; - const RebindToSigned di32; - const auto max_i32 = Set(du32, 0x7FFFFFFFu); - -#if HWY_TARGET >= HWY_SSSE3 - // On SSE2/SSSE3, clamp u32 values to an i32 using the u8 Min operation - // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. - - // The u8 Min operation below leaves the lower 24 bits of each 32-bit - // lane unchanged. - - // The u8 Min operation below will leave any values that are less than or - // equal to 0x7FFFFFFF unchanged. - - // For values that are greater than or equal to 0x80000000, the u8 Min - // operation below will force the upper 8 bits to 0x7F and leave the lower - // 24 bits unchanged. - - // An u8 Min operation is okay here as any clamped value that is greater than - // or equal to 0x80000000 will be clamped to a value between 0x7F000000 and - // 0x7FFFFFFF through the u8 Min operation below, which will then be converted - // to 0xFF through the i32->u8 demotion. 
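// [Editor's note] Worked example of the u8 Min clamp described above, for the
// u32 lane 0x80000001: its bytes (little-endian) are {01, 00, 00, 80}; a u8
// Min with the bytes of 0x7FFFFFFF = {FF, FF, FF, 7F} gives {01, 00, 00, 7F},
// i.e. 0x7F000001. That is still far above 0xFF, so the following i32->u8
// saturating demote produces 0xFF, exactly as a direct u32->u8 saturation
// would. Lanes already <= 0x7FFFFFFF have a top byte <= 0x7F and pass through
// the Min unchanged.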
- const Repartition du32_as_du8; - const auto clamped = BitCast( - di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32))); -#else - const auto clamped = BitCast(di32, Min(v, max_i32)); -#endif - - return DemoteTo(du8, clamped); -#endif -} - -template -HWY_API VFromD DemoteTo(D du8, VFromD> v) { - const DFromV du16; - const RebindToSigned di16; - const auto max_i16 = Set(du16, 0x7FFF); - -#if HWY_TARGET >= HWY_SSSE3 - // On SSE2/SSSE3, clamp u16 values to an i16 using the u8 Min operation - // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. - - // The u8 Min operation below leaves the lower 8 bits of each 16-bit - // lane unchanged. - - // The u8 Min operation below will leave any values that are less than or - // equal to 0x7FFF unchanged. - - // For values that are greater than or equal to 0x8000, the u8 Min - // operation below will force the upper 8 bits to 0x7F and leave the lower - // 8 bits unchanged. - - // An u8 Min operation is okay here as any clamped value that is greater than - // or equal to 0x8000 will be clamped to a value between 0x7F00 and - // 0x7FFF through the u8 Min operation below, which will then be converted - // to 0xFF through the i16->u8 demotion. - const Repartition du16_as_du8; - const auto clamped = BitCast( - di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16))); -#else - const auto clamped = BitCast(di16, Min(v, max_i16)); -#endif - - return DemoteTo(du8, clamped); -} - -#if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) - -// HWY_NATIVE_F16C was already toggled above. - -// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate). -// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain. -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain") - -template -HWY_API VFromD DemoteTo(D /*tag*/, VFromD> v) { - return VFromD{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; -} - -HWY_DIAGNOSTICS(pop) - -#endif // F16C - -template -HWY_API VFromD DemoteTo(D dbf16, VFromD> v) { - // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16. - const Rebind di32; - const Rebind du32; // for logical shift right - const Rebind du16; - const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); - return BitCast(dbf16, DemoteTo(du16, bits_in_32)); -} - -template >> -HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) { - // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16. - const RebindToUnsigned du16; - const Repartition du32; - const VFromD b_in_even = ShiftRight<16>(BitCast(du32, b)); - return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); -} - -// Specializations for partial vectors because packs_epi32 sets lanes above 2*N. 
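// [Editor's note] On the partial-vector specializations that follow: for
// 64-bit inputs, _mm_packs_epi32(a, b) produces i16 lanes in the order
// a0 a1 a2 a3 b0 b1 b2 b3, where a2..a3 and b2..b3 come from the undefined
// upper input halves; the _mm_shuffle_epi32(..., _MM_SHUFFLE(2, 0, 2, 0))
// gathers the two wanted 32-bit halves (a0 a1 and b0 b1) into the low 64 bits.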
-template -HWY_API VFromD ReorderDemote2To(D dn, Vec32 a, Vec32 b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec64 a, - Vec64 b) { - return VFromD{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw), - _MM_SHUFFLE(2, 0, 2, 0))}; -} -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, - Vec128 b) { - return VFromD{_mm_packs_epi32(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec32 a, Vec32 b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API VFromD ReorderDemote2To(D dn, Vec64 a, Vec64 b) { -#if HWY_TARGET >= HWY_SSSE3 - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -#else - (void)dn; - return VFromD{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw), - _MM_SHUFFLE(2, 0, 2, 0))}; -#endif -} -template -HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, Vec128 b) { -#if HWY_TARGET >= HWY_SSSE3 - const Half dnh; - const auto u16_a = DemoteTo(dnh, a); - const auto u16_b = DemoteTo(dnh, b); - return Combine(dn, u16_b, u16_a); -#else - (void)dn; - return VFromD{_mm_packus_epi32(a.raw, b.raw)}; -#endif -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const DFromV du32; - const RebindToSigned di32; - const auto max_i32 = Set(du32, 0x7FFFFFFFu); - -#if HWY_TARGET >= HWY_SSSE3 - const Repartition du32_as_du8; - // On SSE2/SSSE3, clamp a and b using u8 Min operation - const auto clamped_a = BitCast( - di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32))); - const auto clamped_b = BitCast( - di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32))); -#else - const auto clamped_a = BitCast(di32, Min(a, max_i32)); - const auto clamped_b = BitCast(di32, Min(b, max_i32)); -#endif - - return ReorderDemote2To(dn, clamped_a, clamped_b); -} - -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -// Specializations for partial vectors because packs_epi32 sets lanes above 2*N. 
-template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec64 a, - Vec64 b) { - return VFromD{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw), - _MM_SHUFFLE(2, 0, 2, 0))}; -} -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, - Vec128 b) { - return VFromD{_mm_packs_epi16(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec64 a, - Vec64 b) { - return VFromD{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw), - _MM_SHUFFLE(2, 0, 2, 0))}; -} -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, - Vec128 b) { - return VFromD{_mm_packus_epi16(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const DFromV du16; - const RebindToSigned di16; - const auto max_i16 = Set(du16, 0x7FFFu); - -#if HWY_TARGET >= HWY_SSSE3 - const Repartition du16_as_du8; - // On SSE2/SSSE3, clamp a and b using u8 Min operation - const auto clamped_a = BitCast( - di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16))); - const auto clamped_b = BitCast( - di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16))); -#else - const auto clamped_a = BitCast(di16, Min(a, max_i16)); - const auto clamped_b = BitCast(di16, Min(b, max_i16)); -#endif - - return ReorderDemote2To(dn, clamped_a, clamped_b); -} - -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -template ), - HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - return ReorderDemote2To(d, a, b); -} - -template >> -HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) { - const RebindToUnsigned du16; - return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtpd_ps(v.raw)}; -} - -namespace detail { - -// Generic for all vector lengths. -template -HWY_INLINE VFromD ClampF64ToI32Max(D d, VFromD v) { - // The max can be exactly represented in binary64, so clamping beforehand - // prevents x86 conversion from raising an exception and returning 80..00. - return Min(v, Set(d, 2147483647.0)); -} - -// For ConvertTo float->int of same size, clamping before conversion would -// change the result because the max integer value is not exactly representable. -// Instead detect the overflow result after conversion and fix it. -// Generic for all vector lengths. -template -HWY_INLINE VFromD FixConversionOverflow(DI di, - VFromD> original, - VFromD converted) { - // Combinations of original and output sign: - // --: normal <0 or -huge_val to 80..00: OK - // -+: -0 to 0 : OK - // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF - // ++: normal >0 : OK - const VFromD sign_wrong = AndNot(BitCast(di, original), converted); -#if HWY_COMPILER_GCC_ACTUAL - // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also - // Add() if using that instead. Work around with one more instruction. 
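// [Editor's note] Worked example of the sign_wrong fix above: converting a
// float such as +3e9 (out of int32 range) yields the x86 "integer indefinite"
// value 0x80000000. The original sign bit is 0 and the converted sign bit is
// 1, so AndNot(original, converted) has its MSB set; broadcasting that sign
// and Xor-ing flips 0x80000000 to 0x7FFFFFFF (LimitsMax). In the other three
// sign combinations listed above, the MSB of sign_wrong is 0 and the converted
// value is returned unchanged.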
- const RebindToUnsigned du; - const VFromD mask = BroadcastSignBit(sign_wrong); - const VFromD max = BitCast(di, ShiftRight<1>(BitCast(du, mask))); - return IfVecThenElse(mask, max, converted); -#else - return Xor(converted, BroadcastSignBit(sign_wrong)); -#endif -} - -} // namespace detail - -template > -HWY_API VFromD DemoteTo(D /* tag */, VFromD v) { - const VFromD clamped = detail::ClampF64ToI32Max(DF(), v); - return VFromD{_mm_cvttpd_epi32(clamped.raw)}; -} - -// For already range-limited input [0, 255]. -template -HWY_API Vec128 U8FromU32(const Vec128 v) { -#if HWY_TARGET == HWY_SSE2 - const RebindToSigned> di32; - const Rebind du8; - return DemoteTo(du8, BitCast(di32, v)); -#else - const DFromV d32; - const Repartition d8; - alignas(16) static constexpr uint32_t k8From32[4] = { - 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u}; - // Also replicate bytes into all 32 bit lanes for safety. - const auto quad = TableLookupBytes(v, Load(d32, k8From32)); - return LowerHalf(LowerHalf(BitCast(d8, quad))); -#endif -} - -// ------------------------------ MulFixedPoint15 - -#if HWY_TARGET == HWY_SSE2 -HWY_API Vec128 MulFixedPoint15(const Vec128 a, - const Vec128 b) { - const DFromV d; - const Repartition di32; - - auto lo_product = a * b; - auto hi_product = MulHigh(a, b); - - const VFromD i32_product_lo{ - _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; - const VFromD i32_product_hi{ - _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)}; - - const auto round_up_incr = Set(di32, 0x4000); - return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr), - ShiftRight<15>(i32_product_hi + round_up_incr)); -} - -template -HWY_API Vec128 MulFixedPoint15(const Vec128 a, - const Vec128 b) { - const DFromV d; - const Rebind di32; - - const auto lo_product = a * b; - const auto hi_product = MulHigh(a, b); - const VFromD i32_product{ - _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; - - return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000))); -} -#else -template -HWY_API Vec128 MulFixedPoint15(const Vec128 a, - const Vec128 b) { - return Vec128{_mm_mulhrs_epi16(a.raw, b.raw)}; -} -#endif - -// ------------------------------ Truncations - -template -HWY_API VFromD TruncateTo(DTo /* tag */, Vec128 v) { - // BitCast requires the same size; DTo might be u8x1 and v u16x1. 
- const Repartition, DFromV> dto; - return VFromD{BitCast(dto, v).raw}; -} - -template -HWY_API VFromD TruncateTo(D d, Vec128 v) { -#if HWY_TARGET == HWY_SSE2 - const Vec128 lo{v.raw}; - const Vec128 hi{_mm_unpackhi_epi64(v.raw, v.raw)}; - return Combine(d, hi, lo); -#else - const Repartition> d8; - (void)d; - alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8, - 0, 8, 0, 8, 0, 8, 0, 8}; - const Vec128 v8 = TableLookupBytes(v, Load(d8, kIdx)); - return LowerHalf(LowerHalf(LowerHalf(v8))); -#endif -} - -template -HWY_API VFromD TruncateTo(D d, Vec128 v) { -#if HWY_TARGET == HWY_SSE2 - const Vec128 lo{v.raw}; - const Vec128 hi{_mm_unpackhi_epi64(v.raw, v.raw)}; - return Combine(d, hi, lo); -#else - (void)d; - const Repartition> d16; - alignas(16) static constexpr uint16_t kIdx[8] = { - 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u}; - const Vec128 v16 = TableLookupBytes(v, Load(d16, kIdx)); - return LowerHalf(LowerHalf(v16)); -#endif -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { - return VFromD{_mm_shuffle_epi32(v.raw, 0x88)}; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const DFromV du32; -#if HWY_TARGET == HWY_SSE2 - const RebindToSigned di32; - const Rebind du8; - return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v)))); -#else - const Repartition d; - alignas(16) static constexpr uint8_t kIdx[16] = { - 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu, - 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu}; - return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx)))); -#endif -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const DFromV du32; -#if HWY_TARGET == HWY_SSE2 - const RebindToSigned di32; - const Rebind du16; - const RebindToSigned di16; - return BitCast( - du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v))))); -#else - const Repartition d; - return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); -#endif -} - -template -HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { - const DFromV du16; -#if HWY_TARGET == HWY_SSE2 - const RebindToSigned di16; - const Rebind du8; - const RebindToSigned di8; - return BitCast(du8, - DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v))))); -#else - const Repartition d; - return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); -#endif -} - -// ------------------------------ Demotions to/from i64 - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtsepi64_epi32(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtsepi64_epi16(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtsepi64_epi8(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) 
{ - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtusepi64_epi32(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtusepi64_epi16(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtusepi64_epi8(v.raw)}; -} -#else // AVX2 or below -namespace detail { -template -HWY_INLINE VFromD> DemoteFromU64MaskOutResult( - D /*dn*/, VFromD> v) { - return v; -} - -template -HWY_INLINE VFromD> DemoteFromU64MaskOutResult( - D /*dn*/, VFromD> v) { - const DFromV du64; - return And(v, - Set(du64, static_cast(hwy::HighestValue>()))); -} - -template -HWY_INLINE VFromD> DemoteFromU64Saturate( - D dn, VFromD> v) { - const Rebind du64; - const RebindToSigned di64; - constexpr int kShiftAmt = static_cast(sizeof(TFromD) * 8) - - static_cast(hwy::IsSigned>()); - - const auto too_big = BitCast( - du64, VecFromMask( - di64, Gt(BitCast(di64, ShiftRight(v)), Zero(di64)))); - return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); -} - -template -HWY_INLINE VFromD ReorderDemote2From64To32Combine(D dn, V a, V b) { - return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); -} - -} // namespace detail - -template -HWY_API VFromD DemoteTo(D dn, VFromD> v) { - const DFromV di64; - const RebindToUnsigned du64; - const RebindToUnsigned dn_u; - - // Negative values are saturated by first saturating their bitwise inverse - // and then inverting the saturation result - const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); - const auto saturated_vals = Xor( - invert_mask, - detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); - return BitCast(dn, TruncateTo(dn_u, saturated_vals)); -} - -template -HWY_API VFromD DemoteTo(D dn, VFromD> v) { - const DFromV di64; - const RebindToUnsigned du64; - - const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); - return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); -} - -template -HWY_API VFromD DemoteTo(D dn, VFromD> v) { - return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); -} -#endif // HWY_TARGET <= HWY_AVX3 - -template )> -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -template -HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, - VFromD> b) { - const DFromV d; - const Twice dt; - return DemoteTo(dn, Combine(dt, b, a)); -} - -#if HWY_TARGET > HWY_AVX2 -template -HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const DFromV di64; - const RebindToUnsigned du64; - const Half dnh; - - // Negative values are saturated by first saturating their bitwise inverse - // and then inverting the saturation result - const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); - const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); - const auto saturated_a = Xor( - invert_mask_a, - detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); - const auto saturated_b = Xor( - invert_mask_b, - detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); - - return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D 
dn, Vec128 a, - Vec128 b) { - const DFromV di64; - const RebindToUnsigned du64; - const Half dnh; - - const auto saturated_a = detail::DemoteFromU64Saturate( - dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); - const auto saturated_b = detail::DemoteFromU64Saturate( - dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); - - return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); -} - -template -HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, - Vec128 b) { - const Half dnh; - - const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); - const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); - - return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); -} -#endif // HWY_TARGET > HWY_AVX2 - -// ------------------------------ Integer <=> fp (ShiftRight, OddEven) - -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtepu16_ph(v.raw)}; -} -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtepi16_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { - return VFromD{_mm_cvtepi32_ps(v.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API VFromD ConvertTo(D /*df*/, VFromD> v) { - return VFromD{_mm_cvtepu32_ps(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D /*dd*/, VFromD> v) { - return VFromD{_mm_cvtepi64_pd(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D /*dd*/, VFromD> v) { - return VFromD{_mm_cvtepu64_pd(v.raw)}; -} -#else // AVX2 or below -// Generic for all vector lengths. -template -HWY_API VFromD ConvertTo(D df, VFromD> v) { - // Based on wim's approach (https://stackoverflow.com/questions/34066228/) - const RebindToUnsigned du32; - const RebindToSigned d32; - - const auto msk_lo = Set(du32, 0xFFFF); - const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16 - - // Extract the 16 lowest/highest significant bits of v and cast to signed int - const auto v_lo = BitCast(d32, And(v, msk_lo)); - const auto v_hi = BitCast(d32, ShiftRight<16>(v)); - return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo)); -} - -// Generic for all vector lengths. -template -HWY_API VFromD ConvertTo(D dd, VFromD> v) { - // Based on wim's approach (https://stackoverflow.com/questions/41144668/) - const Repartition d32; - const Repartition d64; - - // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 - const auto k84_63 = Set(d64, 0x4530000080000000ULL); - const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); - - // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) - const auto k52 = Set(d32, 0x43300000); - const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); - - const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); - return (v_upper - k84_63_52) + v_lower; // order matters! -} - -namespace detail { -template -HWY_INLINE VFromD>> U64ToF64VecFast(VW w) { - const DFromV d64; - const RebindToFloat dd; - const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 - return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; -} -} // namespace detail - -// Generic for all vector lengths. 
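// [Editor's sketch, not part of Highway] The U64ToF64VecFast helper above uses
// the classic 2^52 trick: for w < 2^52, OR-ing w into the mantissa of the
// double 2^52 yields exactly 2^52 + w, so a single subtraction recovers w as a
// double without any integer->float conversion instruction. Scalar form; the
// function name is illustrative only:
#include <cstdint>
#include <cstring>
static inline double U64LowBitsToF64(uint64_t w) {  // assumes w < (1ULL << 52)
  const double k2p52 = 4503599627370496.0;  // 2^52, bits 0x4330000000000000
  uint64_t bits;
  std::memcpy(&bits, &k2p52, sizeof(bits));
  bits |= w;                                // mantissa now holds w
  double d;
  std::memcpy(&d, &bits, sizeof(d));        // d == 2^52 + w exactly
  return d - k2p52;                         // == static_cast<double>(w)
}
// The u64->f64 conversion that follows applies this separately to the low and
// high 32-bit halves and recombines them via MulAdd with 2^32.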
-template -HWY_API VFromD ConvertTo(D dd, VFromD> v) { - // Based on wim's approach (https://stackoverflow.com/questions/41144668/) - const RebindToUnsigned d64; - using VU = VFromD; - - const VU msk_lo = Set(d64, 0xFFFFFFFF); - const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 - - // Extract the 32 lowest/highest significant bits of v - const VU v_lo = And(v, msk_lo); - const VU v_hi = ShiftRight<32>(v); - - const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); - return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// Truncates (rounds toward zero). - -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD ConvertTo(D di, VFromD> v) { - return detail::FixConversionOverflow( - di, v, VFromD>{_mm_cvttph_epi16(v.raw)}); -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API VFromD ConvertTo(D di, VFromD> v) { - return detail::FixConversionOverflow( - di, v, VFromD>{_mm_cvttps_epi32(v.raw)}); -} - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API VFromD ConvertTo(DI di, VFromD> v) { - return detail::FixConversionOverflow(di, v, - VFromD{_mm_cvttpd_epi64(v.raw)}); -} - -#else // AVX2 or below - -#if HWY_ARCH_X86_64 -template -HWY_API VFromD ConvertTo(DI di, Vec64 v) { - const Vec64 i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))}; - return detail::FixConversionOverflow(di, v, i0); -} -template -HWY_API VFromD ConvertTo(DI di, Vec128 v) { - const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw)); - const Full64 dd2; - const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw)); - return detail::FixConversionOverflow( - di, v, Vec128{_mm_unpacklo_epi64(i0, i1)}); -} -#endif // HWY_ARCH_X86_64 - -#if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 -template -HWY_API VFromD ConvertTo(DI di, VFromD> v) { - using VI = VFromD; - const RebindToUnsigned du; - using VU = VFromD; - const Repartition du16; - const VI k1075 = Set(di, 1075); /* biased exponent of 2^52 */ - - // Exponent indicates whether the number can be represented as int64_t. - const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF); -#if HWY_TARGET <= HWY_SSE4 - const auto in_range = BitCast(di, biased_exp) < Set(di, 1086); -#else - const Repartition di32; - const auto in_range = MaskFromVec(BitCast( - di, - VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086)))); -#endif - - // If we were to cap the exponent at 51 and add 2^52, the number would be in - // [2^52, 2^53) and mantissa bits could be read out directly. We need to - // round-to-0 (truncate), but changing rounding mode in MXCSR hits a - // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead - // manually shift the mantissa into place (we already have many of the - // inputs anyway). - - // Use 16-bit saturated unsigned subtraction to compute shift_mnt and - // shift_int since biased_exp[i] is a non-negative integer that is less than - // or equal to 2047. - - // 16-bit saturated unsigned subtraction is also more efficient than a - // 64-bit subtraction followed by a 64-bit signed Max operation on - // SSE2/SSSE3/SSE4/AVX2. - - // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be - // zero as the upper 48 bits of both k1075 and biased_exp are zero. 
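// The same idea in scalar form, assuming IEEE-754 binary64; function and
// variable names are illustrative only. Read the biased exponent, shift the
// implicit-1 mantissa into integer position, then saturate and apply the
// sign, mirroring the shift_mnt/shift_int/limit handling that follows.
#include <cstdint>
#include <cstring>
#include <limits>
static int64_t TruncToI64(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  const uint64_t biased_exp = (bits >> 52) & 0x7FF;
  const bool negative = (bits >> 63) != 0;
  if (biased_exp < 1023) return 0;  // |x| < 1 truncates to 0
  if (biased_exp >= 1086) {         // |x| >= 2^63, Inf or NaN: saturate
    return negative ? std::numeric_limits<int64_t>::min()
                    : std::numeric_limits<int64_t>::max();
  }
  const uint64_t mantissa = (bits & ((1ULL << 52) - 1)) | (1ULL << 52);
  const int shift = static_cast<int>(biased_exp) - 1075;  // 1075 = bias of 2^52
  const uint64_t magnitude =
      (shift >= 0) ? (mantissa << shift) : (mantissa >> -shift);
  return negative ? -static_cast<int64_t>(magnitude)
                  : static_cast<int64_t>(magnitude);
}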
- - const VU shift_mnt = BitCast( - du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); - const VU shift_int = BitCast( - du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); - const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1); - // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86 - // returning zero in that case. - const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; - - // For inputs larger than 2^53 - 1, insert zeros at the bottom. - - // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be - // shifted out of the left shift result below as shift_int[i] <= 10 is true - // for any inputs that are less than 2^63. - - const VU shifted = int53 << shift_int; - - // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. - const VI sign_mask = BroadcastSignBit(BitCast(di, v)); - const VI limit = Set(di, LimitsMax()) - sign_mask; - const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit); - - // If the input was negative, negate the integer (two's complement). - return (magnitude ^ sign_mask) - sign_mask; -} -#endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 -#endif // HWY_TARGET <= HWY_AVX3 - -template -HWY_API Vec128 NearestInt(const Vec128 v) { - const RebindToSigned> di; - return detail::FixConversionOverflow( - di, v, VFromD{_mm_cvtps_epi32(v.raw)}); -} - -// ------------------------------ Floating-point rounding (ConvertTo) - -#if HWY_TARGET >= HWY_SSSE3 - -// Toward nearest integer, ties to even -template -HWY_API Vec128 Round(const Vec128 v) { - static_assert(IsFloat(), "Only for float"); - // Rely on rounding after addition with a large value such that no mantissa - // bits remain (assuming the current mode is nearest-even). We may need a - // compiler flag for precise floating-point to prevent "optimizing" this out. - const DFromV df; - const auto max = Set(df, MantissaEnd()); - const auto large = CopySignToAbs(max, v); - const auto added = large + v; - const auto rounded = added - large; - // Keep original if NaN or the magnitude is large (already an int). - return IfThenElse(Abs(v) < max, rounded, v); -} - -namespace detail { - -// Truncating to integer and converting back to float is correct except when the -// input magnitude is large, in which case the input was already an integer -// (because mantissa >> exponent is zero). -template -HWY_INLINE Mask128 UseInt(const Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV d; - return Abs(v) < Set(d, MantissaEnd()); -} - -} // namespace detail - -// Toward zero, aka truncate -template -HWY_API Vec128 Trunc(const Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV df; - const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - - return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); -} - -// Toward +infinity, aka ceiling -template -HWY_API Vec128 Ceil(const Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV df; - const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - - // Truncating a positive non-integer ends up smaller; if so, add 1. 
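// In scalar form (illustrative; assumes v is not NaN and |v| fits in int32_t,
// i.e. the UseInt guard holds): truncation moves a positive non-integer down,
// so add 1 back whenever the truncated value is below the input. This is
// exactly what subtracting neg1 (-1 where int_f < v, else 0) does below.
static float CeilViaTrunc(float v) {
  const float int_f = static_cast<float>(static_cast<int32_t>(v));  // toward 0
  return (int_f < v) ? int_f + 1.0f : int_f;
}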
- const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); - - return IfThenElse(detail::UseInt(v), int_f - neg1, v); -} - -// Toward -infinity, aka floor -template -HWY_API Vec128 Floor(const Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV df; - const RebindToSigned di; - - const auto integer = ConvertTo(di, v); // round toward 0 - const auto int_f = ConvertTo(df, integer); - - // Truncating a negative non-integer ends up larger; if so, subtract 1. - const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); - - return IfThenElse(detail::UseInt(v), int_f + neg1, v); -} - -#else - -// Toward nearest integer, ties to even -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Round(const Vec128 v) { - return Vec128{ - _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Round(const Vec128 v) { - return Vec128{ - _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} -template -HWY_API Vec128 Round(const Vec128 v) { - return Vec128{ - _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} - -// Toward zero, aka truncate -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Trunc(const Vec128 v) { - return Vec128{ - _mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Trunc(const Vec128 v) { - return Vec128{ - _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} -template -HWY_API Vec128 Trunc(const Vec128 v) { - return Vec128{ - _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} - -// Toward +infinity, aka ceiling -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Ceil(const Vec128 v) { - return Vec128{ - _mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Ceil(const Vec128 v) { - return Vec128{ - _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} -template -HWY_API Vec128 Ceil(const Vec128 v) { - return Vec128{ - _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} - -// Toward -infinity, aka floor -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Floor(const Vec128 v) { - return Vec128{ - _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec128 Floor(const Vec128 v) { - return Vec128{ - _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} -template -HWY_API Vec128 Floor(const Vec128 v) { - return Vec128{ - _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} - -#endif // !HWY_SSSE3 - -// ------------------------------ Floating-point classification - -#define HWY_X86_FPCLASS_QNAN 0x01 -#define HWY_X86_FPCLASS_POS0 0x02 -#define HWY_X86_FPCLASS_NEG0 0x04 -#define HWY_X86_FPCLASS_POS_INF 0x08 -#define HWY_X86_FPCLASS_NEG_INF 0x10 -#define HWY_X86_FPCLASS_SUBNORMAL 0x20 -#define HWY_X86_FPCLASS_NEG 0x40 -#define HWY_X86_FPCLASS_SNAN 0x80 - -#if HWY_HAVE_FLOAT16 || HWY_IDE - -template -HWY_API Mask128 IsNaN(const Vec128 v) { - return Mask128{ - _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -} - -template -HWY_API Mask128 IsInf(const Vec128 v) { - return Mask128{_mm_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} - -template -HWY_API Mask128 IsFinite(const Vec128 v) { - // fpclass doesn't have a flag for positive, so we have to check for inf/NaN - // and negate the mask. 
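// For reference, the non-AVX3 fallback further below replaces fpclass with a
// plain exponent test. A scalar sketch, assuming IEEE-754 binary32 (names
// illustrative): clear the sign bit, then check whether the exponent field is
// all-ones (which encodes Inf or NaN).
#include <cstdint>
#include <cstring>
static bool IsFiniteF32(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const uint32_t exp = (bits << 1) >> 24;  // drop sign, keep 8 exponent bits
  return exp != 0xFFu;                     // 0xFF => Inf or NaN
}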
- return Not(Mask128{_mm_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} - -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API Mask128 IsNaN(const Vec128 v) { -#if HWY_TARGET <= HWY_AVX3 - return Mask128{ - _mm_fpclass_ps_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -#else - return Mask128{_mm_cmpunord_ps(v.raw, v.raw)}; -#endif -} -template -HWY_API Mask128 IsNaN(const Vec128 v) { -#if HWY_TARGET <= HWY_AVX3 - return Mask128{ - _mm_fpclass_pd_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -#else - return Mask128{_mm_cmpunord_pd(v.raw, v.raw)}; -#endif -} - -#if HWY_TARGET <= HWY_AVX3 - -template -HWY_API Mask128 IsInf(const Vec128 v) { - return Mask128{_mm_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} -template -HWY_API Mask128 IsInf(const Vec128 v) { - return Mask128{_mm_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} - -// Returns whether normal/subnormal/zero. -template -HWY_API Mask128 IsFinite(const Vec128 v) { - // fpclass doesn't have a flag for positive, so we have to check for inf/NaN - // and negate the mask. - return Not(Mask128{_mm_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} -template -HWY_API Mask128 IsFinite(const Vec128 v) { - return Not(Mask128{_mm_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} - -#else - -template -HWY_API Mask128 IsInf(const Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV d; - const RebindToSigned di; - const VFromD vi = BitCast(di, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); -} - -// Returns whether normal/subnormal/zero. -template -HWY_API Mask128 IsFinite(const Vec128 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; // cheaper than unsigned comparison - const VFromD vu = BitCast(du, v); - // Shift left to clear the sign bit, then right so we can compare with the - // max exponent (cannot compare with MaxExponentTimes2 directly because it is - // negative and non-negative floats would be greater). MSVC seems to generate - // incorrect code if we instead add vu + vu. - const VFromD exp = - BitCast(di, ShiftRight() + 1>(ShiftLeft<1>(vu))); - return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ================================================== CRYPTO - -#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 - -// Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
-#ifdef HWY_NATIVE_AES -#undef HWY_NATIVE_AES -#else -#define HWY_NATIVE_AES -#endif - -HWY_API Vec128 AESRound(Vec128 state, - Vec128 round_key) { - return Vec128{_mm_aesenc_si128(state.raw, round_key.raw)}; -} - -HWY_API Vec128 AESLastRound(Vec128 state, - Vec128 round_key) { - return Vec128{_mm_aesenclast_si128(state.raw, round_key.raw)}; -} - -HWY_API Vec128 AESInvMixColumns(Vec128 state) { - return Vec128{_mm_aesimc_si128(state.raw)}; -} - -HWY_API Vec128 AESRoundInv(Vec128 state, - Vec128 round_key) { - return Vec128{_mm_aesdec_si128(state.raw, round_key.raw)}; -} - -HWY_API Vec128 AESLastRoundInv(Vec128 state, - Vec128 round_key) { - return Vec128{_mm_aesdeclast_si128(state.raw, round_key.raw)}; -} - -template -HWY_API Vec128 AESKeyGenAssist(Vec128 v) { - return Vec128{_mm_aeskeygenassist_si128(v.raw, kRcon)}; -} - -template -HWY_API Vec128 CLMulLower(Vec128 a, - Vec128 b) { - return Vec128{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)}; -} - -template -HWY_API Vec128 CLMulUpper(Vec128 a, - Vec128 b) { - return Vec128{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)}; -} - -#endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 - -// ================================================== MISC - -// ------------------------------ LoadMaskBits (TestBit) - -#if HWY_TARGET > HWY_AVX3 -namespace detail { - -template -HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - // Easier than Set(), which would require an >8-bit type, which would not - // compile for T=uint8_t, kN=1. - const VFromD vbits{_mm_cvtsi32_si128(static_cast(mask_bits))}; - -#if HWY_TARGET == HWY_SSE2 - // {b0, b1, ...} ===> {b0, b0, b1, b1, ...} - __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw); - // {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...} - unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits); - // {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==> - // {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1} - const VFromD rep8{ - _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)}; -#else - // Replicate bytes 8x such that each byte contains the bit that governs it. - alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 1, 1, 1, 1}; - const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); -#endif - - alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, - 1, 2, 4, 8, 16, 32, 64, 128}; - return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - const auto vmask_bits = Set(du, static_cast(mask_bits)); - return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; - const auto vmask_bits = Set(du, static_cast(mask_bits)); - return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); -} - -template -HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { - const RebindToUnsigned du; - alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; - return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); -} - -} // namespace detail -#endif // HWY_TARGET > HWY_AVX3 - -// `p` points to at least 8 readable bytes, not all of which need be valid. 
-template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - constexpr size_t kN = MaxLanes(d); -#if HWY_TARGET <= HWY_AVX3 - (void)d; - uint64_t mask_bits = 0; - constexpr size_t kNumBytes = (kN + 7) / 8; - CopyBytes(bits, &mask_bits); - if (kN < 8) { - mask_bits &= (1ull << kN) - 1; - } - - return MFromD::FromBits(mask_bits); -#else - uint64_t mask_bits = 0; - constexpr size_t kNumBytes = (kN + 7) / 8; - CopyBytes(bits, &mask_bits); - if (kN < 8) { - mask_bits &= (1ull << kN) - 1; - } - - return detail::LoadMaskBits128(d, mask_bits); -#endif -} - -template -struct CompressIsPartition { -#if HWY_TARGET <= HWY_AVX3 - // AVX3 supports native compress, but a table-based approach allows - // 'partitioning' (also moving mask=false lanes to the top), which helps - // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8 - // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3 - // u32x8 etc.). - enum { value = (sizeof(T) == 8) }; -#else - // generic_ops-inl does not guarantee IsPartition for 8-bit. - enum { value = (sizeof(T) != 1) }; -#endif -}; - -#if HWY_TARGET <= HWY_AVX3 - -// ------------------------------ StoreMaskBits - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - constexpr size_t kN = MaxLanes(d); - constexpr size_t kNumBytes = (kN + 7) / 8; - CopyBytes(&mask.raw, bits); - - // Non-full byte, need to clear the undefined upper bits. - if (kN < 8) { - const int mask_bits = (1 << kN) - 1; - bits[0] = static_cast(bits[0] & mask_bits); - } - - return kNumBytes; -} - -// ------------------------------ Mask testing - -// Beware: the suffix indicates the number of mask bits, not lane size! - -template -HWY_API size_t CountTrue(D d, MFromD mask) { - constexpr size_t kN = MaxLanes(d); - const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); - return PopCount(mask_bits); -} - -template -HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { - constexpr size_t kN = MaxLanes(d); - const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); - return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); -} - -template -HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { - constexpr size_t kN = MaxLanes(d); - const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); - return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; -} - -template -HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { - constexpr size_t kN = MaxLanes(d); - const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); - return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); -} - -template -HWY_API intptr_t FindLastTrue(D d, MFromD mask) { - constexpr size_t kN = MaxLanes(d); - const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); - return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) - : -1; -} - -template -HWY_API bool AllFalse(D d, MFromD mask) { - constexpr size_t kN = MaxLanes(d); - const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); - return mask_bits == 0; -} - -template -HWY_API bool AllTrue(D d, MFromD mask) { - constexpr size_t kN = MaxLanes(d); - const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); - // Cannot use _kortestc because we may have less than 8 mask bits. - return mask_bits == (1ull << kN) - 1; -} - -// ------------------------------ Compress - -// 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512. 
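// The bit-packing contract used by LoadMaskBits/StoreMaskBits above, in
// scalar form (illustrative; assumes 1 <= kN <= 64 and little-endian order,
// as on x86): kN lanes occupy (kN + 7) / 8 bytes, and any bits beyond kN are
// cleared because they are undefined in the stored representation.
#include <cstddef>
#include <cstdint>
#include <cstring>
static uint64_t LoadMaskBitsScalar(const uint8_t* bits, size_t kN) {
  uint64_t mask_bits = 0;
  std::memcpy(&mask_bits, bits, (kN + 7) / 8);           // only existing bytes
  if (kN < 64) mask_bits &= (uint64_t{1} << kN) - 1;     // clear unused bits
  return mask_bits;
}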
- -// Single lane: no-op -template -HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { - return v; -} - -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return Vec128{_mm_maskz_compress_ps(mask.raw, v.raw)}; -} - -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - HWY_DASSERT(mask.raw < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[64] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const DFromV d; - const Repartition d8; - const auto index = Load(d8, u8_indices + 16 * mask.raw); - return BitCast(d, TableLookupBytes(BitCast(d8, v), index)); -} - -// ------------------------------ CompressNot (Compress) - -// Single lane: no-op -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { - return v; -} - -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // See CompressIsPartition, PrintCompressNot64x2NibbleTables - alignas(16) static constexpr uint64_t packed_array[16] = { - 0x00000010, 0x00000001, 0x00000010, 0x00000010}; - - // For lane i, shift the i-th 4-bit index down to bits [0, 2) - - // _mm_permutexvar_epi64 will ignore the upper bits. - const DFromV d; - const RebindToUnsigned du64; - const auto packed = Set(du64, packed_array[mask.raw]); - alignas(16) static constexpr uint64_t shifts[2] = {0, 4}; - const auto indices = Indices128{(packed >> Load(du64, shifts)).raw}; - return TableLookupLanes(v, indices); -} - -// ------------------------------ CompressBlocksNot -HWY_API Vec128 CompressBlocksNot(Vec128 v, - Mask128 /* m */) { - return v; -} - -// ------------------------------ CompressStore (defined in x86_512) - -// ------------------------------ CompressBlendedStore (CompressStore) -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - // AVX-512 already does the blending at no extra cost (latency 11, - // rthroughput 2 - same as compress plus store). - if (HWY_TARGET == HWY_AVX3_DL || - (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD) > 2)) { - // We're relying on the mask to blend. Clear the undefined upper bits. - constexpr size_t kN = MaxLanes(d); - if (kN != 16 / sizeof(TFromD)) { - m = And(m, FirstN(d, kN)); - } - return CompressStore(v, m, d, unaligned); - } else { - const size_t count = CountTrue(d, m); - const VFromD compressed = Compress(v, m); -#if HWY_MEM_OPS_MIGHT_FAULT - // BlendedStore tests mask for each lane, but we know that the mask is - // FirstN, so we can just copy. 
- alignas(16) TFromD buf[MaxLanes(d)]; - Store(compressed, d, buf); - CopyBytes(buf, unaligned, count * sizeof(TFromD)); -#else - BlendedStore(compressed, FirstN(d, count), d, unaligned); -#endif - detail::MaybeUnpoison(unaligned, count); - return count; - } -} - -// ------------------------------ CompressBitsStore (defined in x86_512) - -#else // AVX2 or below - -// ------------------------------ StoreMaskBits - -namespace detail { - -constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { - return static_cast(static_cast(mask_bits)); -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, - const Mask128 mask) { - const Simd d; - const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; - return U64FromInt(_mm_movemask_epi8(sign_bits)); -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, - const Mask128 mask) { - // Remove useless lower half of each u16 while preserving the sign bit. - const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); - return U64FromInt(_mm_movemask_epi8(sign_bits)); -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { - const Simd d; - const Simd df; - const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return U64FromInt(_mm_movemask_ps(sign_bits.raw)); -} - -template -HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { - const Simd d; - const Simd df; - const auto sign_bits = BitCast(df, VecFromMask(d, mask)); - return U64FromInt(_mm_movemask_pd(sign_bits.raw)); -} - -template -HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { - return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); -} - -} // namespace detail - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; - const uint64_t mask_bits = detail::BitsFromMask(mask); - CopyBytes(&mask_bits, bits); - return kNumBytes; -} - -// ------------------------------ Mask testing - -template -HWY_API bool AllFalse(D /* tag */, MFromD mask) { - // Cheaper than PTEST, which is 2 uop / 3L. - return detail::BitsFromMask(mask) == 0; -} - -template -HWY_API bool AllTrue(D d, MFromD mask) { - constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; - return detail::BitsFromMask(mask) == kAllBits; -} - -template -HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); -} - -template -HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - return Num0BitsBelowLS1Bit_Nonzero32( - static_cast(detail::BitsFromMask(mask))); -} - -template -HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); - return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; -} - -template -HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - return 31 - Num0BitsAboveMS1Bit_Nonzero32( - static_cast(detail::BitsFromMask(mask))); -} - -template -HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); - return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) - : -1; -} - -// ------------------------------ Compress, CompressBits - -namespace detail { - -// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. 
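// A scalar sketch of the index expansion described below (names illustrative):
// each table byte is an already-doubled lane index i, and ZipLower followed by
// adding 0x0100 turns it into the byte pair {i, i + 1}, so that PSHUFB moves
// whole 16-bit lanes rather than single bytes.
#include <cstdint>
static void LaneIdxToByteIdx(const uint8_t table_row[8], uint8_t byte_idx[16]) {
  for (int i = 0; i < 8; ++i) {
    byte_idx[2 * i + 0] = table_row[i];                  // low byte of lane
    byte_idx[2 * i + 1] = static_cast<uint8_t>(table_row[i] + 1);  // high byte
  }
}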
-template -HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Rebind d8; - const Twice d8t; - const RebindToUnsigned du; - - // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need - // byte indices for PSHUFB (one vector's worth for each of 256 combinations of - // 8 mask bits). Loading them directly would require 4 KiB. We can instead - // store lane indices and convert to byte indices (2*lane + 0..1), with the - // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane - // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. - // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles - // is likely more costly than the higher cache footprint from storing bytes. - alignas(16) static constexpr uint8_t table[2048] = { - // PrintCompress16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // - 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // - 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // - 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // - 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // - 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // - 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // - 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // - 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // - 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // - 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // - 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // - 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // - 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // - 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // - 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // - 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // - 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // - 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // - 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // - 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // - 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // - 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // - 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // - 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // - 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // - 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // - 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // - 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // - 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // - 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // - 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // - 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // - 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // - 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // - 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 
12, 4, 6, 10, 14, // - 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // - 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // - 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // - 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // - 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // - 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // - 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // - 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // - 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // - 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // - 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // - 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // - 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // - 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // - 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // - 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // - 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // - 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // - 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // - 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // - 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // - 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // - 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // - 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // - 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // - 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // - 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // - 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // - 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // - 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // - 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // - 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // - 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // - 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // - 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // - 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // - 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // - 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // - 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // - 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // - 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // - 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // - 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // - 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // - 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // - 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // - 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // - 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // - 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // - 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // - 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // - 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // - 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // - 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // - 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // - 2, 12, 14, 
0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // - 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // - 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // - 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // - 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // - 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // - 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // - 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // - 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // - 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // - 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // - 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // - 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // - 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // - 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // - 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // - 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // - 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // - 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // - 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // - 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // - 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // - 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // - 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // - 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // - 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // - 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // - 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // - 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // - 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // - 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; - const VFromD pairs = ZipLower(byte_idx, byte_idx); - return BitCast(d, pairs + Set(du, 0x0100)); -} - -template -HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 256); - const Rebind d8; - const Twice d8t; - const RebindToUnsigned du; - - // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need - // byte indices for PSHUFB (one vector's worth for each of 256 combinations of - // 8 mask bits). Loading them directly would require 4 KiB. We can instead - // store lane indices and convert to byte indices (2*lane + 0..1), with the - // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane - // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. - // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles - // is likely more costly than the higher cache footprint from storing bytes. 
- alignas(16) static constexpr uint8_t table[2048] = { - // PrintCompressNot16x8Tables - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // - 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // - 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // - 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // - 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // - 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // - 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // - 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // - 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // - 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // - 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // - 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // - 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // - 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // - 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // - 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // - 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // - 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // - 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // - 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // - 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // - 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // - 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // - 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // - 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // - 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // - 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // - 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // - 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // - 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // - 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // - 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // - 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // - 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // - 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // - 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // - 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // - 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // - 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // - 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // - 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // - 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // - 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // - 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // - 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // - 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // - 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // - 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // - 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // - 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // - 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // - 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // - 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // - 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // - 0, 2, 8, 
14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // - 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // - 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // - 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // - 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // - 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // - 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // - 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // - 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // - 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // - 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // - 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // - 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // - 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // - 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // - 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // - 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // - 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // - 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // - 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // - 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // - 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // - 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // - 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // - 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // - 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // - 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // - 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // - 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // - 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // - 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // - 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // - 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // - 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // - 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // - 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // - 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // - 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // - 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // - 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // - 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // - 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // - 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // - 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // - 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // - 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // - 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // - 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // - 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // - 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // - 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // - 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // - 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // - 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 
2, 6, 8, 12, 14, // - 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // - 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // - 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // - 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // - 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // - 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // - 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // - 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // - 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // - 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // - 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // - 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // - 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // - 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; - - const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; - const VFromD pairs = ZipLower(byte_idx, byte_idx); - return BitCast(d, pairs + Set(du, 0x0100)); -} - -template -HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[256] = { - // PrintCompress32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // - 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // - 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // - 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // - 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // - 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 16); - - // There are only 4 lanes, so we can afford to load the index vector directly. 
- alignas(16) static constexpr uint8_t u8_indices[256] = { - // PrintCompressNot32x4Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, - 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, - 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, - 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[64] = { - // PrintCompress64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { - HWY_DASSERT(mask_bits < 4); - - // There are only 2 lanes, so we can afford to load the index vector directly. - alignas(16) static constexpr uint8_t u8_indices[64] = { - // PrintCompressNot64x2Tables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - - const Repartition d8; - return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); -} - -template -HWY_API Vec128 CompressBits(Vec128 v, uint64_t mask_bits) { - const DFromV d; - const RebindToUnsigned du; - - HWY_DASSERT(mask_bits < (1ull << N)); - const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); -} - -template -HWY_API Vec128 CompressNotBits(Vec128 v, uint64_t mask_bits) { - const DFromV d; - const RebindToUnsigned du; - - HWY_DASSERT(mask_bits < (1ull << N)); - const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); - return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); -} - -} // namespace detail - -// Single lane: no-op -template -HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
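// In other words, with per-lane all-ones/all-zeros masks m = {m0, m1}:
// maskL = DupEven(m) = {m0, m0}, maskH = DupOdd(m) = {m1, m1}, and
// swap = AndNot(maskL, maskH) = ~m0 & m1, which is all-ones exactly when
// mask = {0, 1}; Shuffle01 then exchanges the two 64-bit lanes.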
- const DFromV d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskL, maskH); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -// General case, 2 or 4 bytes -template -HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { - return detail::CompressBits(v, detail::BitsFromMask(mask)); -} - -// ------------------------------ CompressNot - -// Single lane: no-op -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { - return v; -} - -// Two lanes: conditional swap -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. - const DFromV d; - const Vec128 m = VecFromMask(d, mask); - const Vec128 maskL = DupEven(m); - const Vec128 maskH = DupOdd(m); - const Vec128 swap = AndNot(maskH, maskL); - return IfVecThenElse(swap, Shuffle01(v), v); -} - -template -HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { - // For partial vectors, we cannot pull the Not() into the table because - // BitsFromMask clears the upper bits. - if (N < 16 / sizeof(T)) { - return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); - } - return detail::CompressNotBits(v, detail::BitsFromMask(mask)); -} - -// ------------------------------ CompressBlocksNot -HWY_API Vec128 CompressBlocksNot(Vec128 v, - Mask128 /* m */) { - return v; -} - -template -HWY_API Vec128 CompressBits(Vec128 v, - const uint8_t* HWY_RESTRICT bits) { - uint64_t mask_bits = 0; - constexpr size_t kNumBytes = (N + 7) / 8; - CopyBytes(bits, &mask_bits); - if (N < 8) { - mask_bits &= (1ull << N) - 1; - } - - return detail::CompressBits(v, mask_bits); -} - -// ------------------------------ CompressStore, CompressBitsStore - -template -HWY_API size_t CompressStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - - const uint64_t mask_bits = detail::BitsFromMask(m); - HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); - const size_t count = PopCount(mask_bits); - - // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). - const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); - StoreU(compressed, d, unaligned); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - - const uint64_t mask_bits = detail::BitsFromMask(m); - HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); - const size_t count = PopCount(mask_bits); - - // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). - const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); - BlendedStore(compressed, FirstN(d, count), d, unaligned); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - - uint64_t mask_bits = 0; - constexpr size_t kN = MaxLanes(d); - constexpr size_t kNumBytes = (kN + 7) / 8; - CopyBytes(bits, &mask_bits); - if (kN < 8) { - mask_bits &= (1ull << kN) - 1; - } - const size_t count = PopCount(mask_bits); - - // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 
- const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); - const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); - StoreU(compressed, d, unaligned); - - detail::MaybeUnpoison(unaligned, count); - return count; -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Expand - -// Otherwise, use the generic_ops-inl.h fallback. -#if HWY_TARGET <= HWY_AVX3 || HWY_IDE - -// The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL), -// but we still want to override generic_ops-inl's table-based implementation -// whenever we have the 32-bit expand provided by AVX3. -#ifdef HWY_NATIVE_EXPAND -#undef HWY_NATIVE_EXPAND -#else -#define HWY_NATIVE_EXPAND -#endif - -namespace detail { - -#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 - -template -HWY_INLINE Vec128 NativeExpand(Vec128 v, - Mask128 mask) { - return Vec128{_mm_maskz_expand_epi8(mask.raw, v.raw)}; -} - -template -HWY_INLINE Vec128 NativeExpand(Vec128 v, - Mask128 mask) { - return Vec128{_mm_maskz_expand_epi16(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint8_t* HWY_RESTRICT unaligned) { - return VFromD{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint16_t* HWY_RESTRICT unaligned) { - return VFromD{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)}; -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -template -HWY_INLINE Vec128 NativeExpand(Vec128 v, - Mask128 mask) { - return Vec128{_mm_maskz_expand_epi32(mask.raw, v.raw)}; -} - -template -HWY_INLINE Vec128 NativeExpand(Vec128 v, - Mask128 mask) { - return Vec128{_mm_maskz_expand_epi64(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint32_t* HWY_RESTRICT unaligned) { - return VFromD{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint64_t* HWY_RESTRICT unaligned) { - return VFromD{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)}; -} - -} // namespace detail - -// Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo. 
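// Expand is the inverse of Compress. In scalar terms (names illustrative),
// source lanes are consumed in order and written to the positions where the
// mask is set, with the remaining lanes zeroed, which is the behavior of the
// maskz_expand intrinsics used below.
#include <cstddef>
#include <cstdint>
static void ExpandScalar(const int32_t* compressed, const bool* mask, size_t n,
                         int32_t* out) {
  size_t src = 0;
  for (size_t i = 0; i < n; ++i) {
    out[i] = mask[i] ? compressed[src++] : 0;
  }
}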
-#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 - -template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - const DFromV d; - const RebindToUnsigned du; - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -template -HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { - const DFromV d; - const RebindToUnsigned du; - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); -} - -// ------------------------------ LoadExpand - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - const RebindToUnsigned du; - using TU = TFromD; - const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); -#else - return Expand(LoadU(d, unaligned), mask); -#endif -} - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET <= HWY_AVX3 - const RebindToUnsigned du; - using TU = TFromD; - const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); -#else - return Expand(LoadU(d, unaligned), mask); -#endif -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ StoreInterleaved2/3/4 - -// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in -// generic_ops-inl.h. - -// ------------------------------ Additional mask logical operations - -#if HWY_TARGET <= HWY_AVX3 -namespace detail { - -template -static HWY_INLINE uint32_t AVX3Blsi(T x) { - using TU = MakeUnsigned; - const auto u32_val = static_cast(static_cast(x)); -#if HWY_COMPILER_CLANGCL - return static_cast(u32_val & (0u - u32_val)); -#else - return static_cast(_blsi_u32(u32_val)); -#endif -} -template -static HWY_INLINE uint64_t AVX3Blsi(T x) { - const auto u64_val = static_cast(x); -#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 - return static_cast(u64_val & (0ULL - u64_val)); -#else - return static_cast(_blsi_u64(u64_val)); -#endif -} - -template -static HWY_INLINE uint32_t AVX3Blsmsk(T x) { - using TU = MakeUnsigned; - const auto u32_val = static_cast(static_cast(x)); -#if HWY_COMPILER_CLANGCL - return static_cast(u32_val ^ (u32_val - 1u)); -#else - return static_cast(_blsmsk_u32(u32_val)); -#endif -} -template -static HWY_INLINE uint64_t AVX3Blsmsk(T x) { - const auto u64_val = static_cast(x); -#if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 - return static_cast(u64_val ^ (u64_val - 1ULL)); -#else - return static_cast(_blsmsk_u64(u64_val)); -#endif -} - -} // namespace detail - -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; - return Mask128{static_cast::Raw>( - (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)}; -} -template -HWY_API Mask128 SetBeforeFirst(Mask128 mask) { - constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; - return Mask128{static_cast::Raw>( - (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)}; -} -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { - constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; - return Mask128{static_cast::Raw>( - detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)}; -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - return Mask128{ - static_cast::Raw>(detail::AVX3Blsi(mask.raw))}; -} -#else // AVX2 or 
below -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const FixedTag d; - const auto vmask = VecFromMask(d, mask); - return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Simd d; - const auto vmask = VecFromMask(d, mask); - const auto neg_vmask = - ResizeBitCast(d, Neg(ResizeBitCast(Full64(), vmask))); - return MaskFromVec(Or(vmask, neg_vmask)); -} -template -HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { - const Full128 d; - const Repartition di64; - const Repartition df32; - const Repartition di32; - using VF = VFromD; - - auto vmask = BitCast(di64, VecFromMask(d, mask)); - vmask = Or(vmask, Neg(vmask)); - - // Copy the sign bit of the first int64_t lane to the second int64_t lane - const auto vmask2 = BroadcastSignBit( - BitCast(di32, VF{_mm_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw, - _MM_SHUFFLE(1, 1, 0, 0))})); - return MaskFromVec(BitCast(d, Or(vmask, BitCast(di64, vmask2)))); -} - -template -HWY_API Mask128 SetBeforeFirst(Mask128 mask) { - return Not(SetAtOrAfterFirst(mask)); -} - -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - return mask; -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const FixedTag d; - const RebindToSigned di; - - const auto vmask = BitCast(di, VecFromMask(d, mask)); - const auto zero = Zero(di); - const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); - return MaskFromVec(BitCast(d, And(vmask, vmask2))); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Simd d; - const RebindToSigned di; - - const auto vmask = ResizeBitCast(Full64(), VecFromMask(d, mask)); - const auto only_first_vmask = - BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); - return MaskFromVec(only_first_vmask); -} -template -HWY_API Mask128 SetOnlyFirst(Mask128 mask) { - const Full128 d; - const RebindToSigned di; - const Repartition di64; - - const auto zero = Zero(di64); - const auto vmask = BitCast(di64, VecFromMask(d, mask)); - const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); - const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); - return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); -} - -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { - const FixedTag d; - const RebindToSigned di; - using TI = MakeSigned; - - return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); -} -template -HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { - const Simd d; - return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Reductions - -namespace detail { - -// N=1: no-op -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v) { - return v; -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v) { - return v; -} - -// N=2 -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v10) { - const DFromV d; - return Add(v10, Reverse2(d, v10)); -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v10) { - const DFromV d; - return Min(v10, Reverse2(d, v10)); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v10) { - const DFromV d; - return Max(v10, Reverse2(d, v10)); -} - -// N=4 (only 16/32-bit, else >128-bit) -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, 
v3210); - const V v03_12_12_03 = Add(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Add(v03_12_12_03, v12_03_03_12); -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Min(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Min(v03_12_12_03, v12_03_03_12); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Max(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Max(v03_12_12_03, v12_03_03_12); -} - -#undef HWY_X86_IF_NOT_MINPOS -#if HWY_TARGET <= HWY_SSE4 -// Skip the T_SIZE = 2 overload in favor of the following two. -#define HWY_X86_IF_NOT_MINPOS(T) \ - hwy::EnableIf()>* = nullptr - -HWY_INLINE Vec128 MinOfLanes(Vec128 v) { - return Broadcast<0>(Vec128{_mm_minpos_epu16(v.raw)}); -} - -HWY_INLINE Vec128 MaxOfLanes(Vec128 v) { - const DFromV d; - const Vec128 max = Set(d, LimitsMax()); - return max - MinOfLanes(max - v); -} -#else -#define HWY_X86_IF_NOT_MINPOS(T) hwy::EnableIf* = nullptr -#endif // HWY_TARGET <= HWY_SSE4 - -// N=8 (only 16-bit, else >128-bit) -template -HWY_INLINE Vec128 SumOfLanes(Vec128 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. - const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. - const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. - const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} - -template -HWY_INLINE T ReduceSum(Vec128 v) { - return GetLane(SumOfLanes(v)); -} - -// u8, N=8, N=16: -HWY_INLINE uint8_t ReduceSum(Vec64 v) { - return static_cast(GetLane(SumsOf8(v)) & 0xFF); -} -HWY_INLINE Vec64 SumOfLanes(Vec64 v) { - const Full64 d; - return Set(d, ReduceSum(v)); -} -HWY_INLINE uint8_t ReduceSum(Vec128 v) { - uint64_t sums = ReduceSum(SumsOf8(v)); - return static_cast(sums & 0xFF); -} -HWY_INLINE Vec128 SumOfLanes(Vec128 v) { - const DFromV d; - return Set(d, ReduceSum(v)); -} -template -HWY_INLINE int8_t ReduceSum(const Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - const auto is_neg = v < Zero(d); - - // Sum positive and negative lanes separately, then combine to get the result. 
- const auto positive = SumsOf8(BitCast(du, IfThenZeroElse(is_neg, v))); - const auto negative = SumsOf8(BitCast(du, IfThenElseZero(is_neg, Abs(v)))); - return static_cast(ReduceSum(positive - negative) & 0xFF); -} -template -HWY_INLINE Vec128 SumOfLanes(const Vec128 v) { - const DFromV d; - return Set(d, ReduceSum(v)); -} - -#if HWY_TARGET <= HWY_SSE4 -HWY_INLINE Vec64 MinOfLanes(Vec64 v) { - const DFromV d; - const Rebind d16; - return TruncateTo(d, MinOfLanes(PromoteTo(d16, v))); -} -HWY_INLINE Vec128 MinOfLanes(Vec128 v) { - const Half> d; - Vec64 result = - Min(MinOfLanes(UpperHalf(d, v)), MinOfLanes(LowerHalf(d, v))); - return Combine(DFromV(), result, result); -} - -HWY_INLINE Vec64 MaxOfLanes(Vec64 v) { - const Vec64 m(Set(DFromV(), LimitsMax())); - return m - MinOfLanes(m - v); -} -HWY_INLINE Vec128 MaxOfLanes(Vec128 v) { - const Vec128 m(Set(DFromV(), LimitsMax())); - return m - MinOfLanes(m - v); -} -#elif HWY_TARGET >= HWY_SSSE3 -template -HWY_API Vec128 MaxOfLanes(Vec128 v) { - const DFromV d; - const RepartitionToWide d16; - const RepartitionToWide d32; - Vec128 vm = Max(v, Reverse2(d, v)); - vm = Max(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); - vm = Max(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); - if (N > 8) { - const RepartitionToWide d64; - vm = Max(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); - } - return vm; -} - -template -HWY_API Vec128 MinOfLanes(Vec128 v) { - const DFromV d; - const RepartitionToWide d16; - const RepartitionToWide d32; - Vec128 vm = Min(v, Reverse2(d, v)); - vm = Min(vm, BitCast(d, Reverse2(d16, BitCast(d16, vm)))); - vm = Min(vm, BitCast(d, Reverse2(d32, BitCast(d32, vm)))); - if (N > 8) { - const RepartitionToWide d64; - vm = Min(vm, BitCast(d, Reverse2(d64, BitCast(d64, vm)))); - } - return vm; -} -#endif - -// Implement min/max of i8 in terms of u8 by toggling the sign bit. -template -HWY_INLINE Vec128 MinOfLanes(Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - const auto mask = SignBit(du); - const auto vu = Xor(BitCast(du, v), mask); - return BitCast(d, Xor(MinOfLanes(vu), mask)); -} -template -HWY_INLINE Vec128 MaxOfLanes(Vec128 v) { - const DFromV d; - const RebindToUnsigned du; - const auto mask = SignBit(du); - const auto vu = Xor(BitCast(du, v), mask); - return BitCast(d, Xor(MaxOfLanes(vu), mask)); -} - -} // namespace detail - -template -HWY_API VFromD SumOfLanes(D /* tag */, VFromD v) { - return detail::SumOfLanes(v); -} -template -HWY_API TFromD ReduceSum(D /* tag */, VFromD v) { - return detail::ReduceSum(v); -} -template -HWY_API VFromD MinOfLanes(D /* tag */, VFromD v) { - return detail::MinOfLanes(v); -} -template -HWY_API VFromD MaxOfLanes(D /* tag */, VFromD v) { - return detail::MaxOfLanes(v); -} - -// ------------------------------ Lt128 - -namespace detail { - -// Returns vector-mask for Lt128. Generic for all vector lengths. -template -HWY_INLINE VFromD Lt128Vec(const D d, VFromD a, VFromD b) { - // Truth table of Eq and Lt for Hi and Lo u64. - // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) - // =H =L cH cL | out = cH | (=H & cL) - // 0 0 0 0 | 0 - // 0 0 0 1 | 0 - // 0 0 1 0 | 1 - // 0 0 1 1 | 1 - // 0 1 0 0 | 0 - // 0 1 0 1 | 0 - // 0 1 1 0 | 1 - // 1 0 0 0 | 0 - // 1 0 0 1 | 1 - // 1 1 0 0 | 0 - const auto eqHL = Eq(a, b); - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - const VFromD ltLX = ShiftLeftLanes<1>(ltHL); - const VFromD vecHx = IfThenElse(eqHL, ltLX, ltHL); - return InterleaveUpper(d, vecHx, vecHx); -} - -// Returns vector-mask for Eq128. 
Generic for all vector lengths. -template -HWY_INLINE VFromD Eq128Vec(D d, VFromD a, VFromD b) { - const auto eqHL = VecFromMask(d, Eq(a, b)); - const auto eqLH = Reverse2(d, eqHL); - return And(eqHL, eqLH); -} - -template -HWY_INLINE VFromD Ne128Vec(D d, VFromD a, VFromD b) { - const auto neHL = VecFromMask(d, Ne(a, b)); - const auto neLH = Reverse2(d, neHL); - return Or(neHL, neLH); -} - -template -HWY_INLINE VFromD Lt128UpperVec(D d, VFromD a, VFromD b) { - // No specialization required for AVX-512: Mask <-> Vec is fast, and - // copying mask bits to their neighbor seems infeasible. - const VFromD ltHL = VecFromMask(d, Lt(a, b)); - return InterleaveUpper(d, ltHL, ltHL); -} - -template -HWY_INLINE VFromD Eq128UpperVec(D d, VFromD a, VFromD b) { - // No specialization required for AVX-512: Mask <-> Vec is fast, and - // copying mask bits to their neighbor seems infeasible. - const VFromD eqHL = VecFromMask(d, Eq(a, b)); - return InterleaveUpper(d, eqHL, eqHL); -} - -template -HWY_INLINE VFromD Ne128UpperVec(D d, VFromD a, VFromD b) { - // No specialization required for AVX-512: Mask <-> Vec is fast, and - // copying mask bits to their neighbor seems infeasible. - const VFromD neHL = VecFromMask(d, Ne(a, b)); - return InterleaveUpper(d, neHL, neHL); -} - -} // namespace detail - -template -HWY_API MFromD Lt128(D d, VFromD a, VFromD b) { - return MaskFromVec(detail::Lt128Vec(d, a, b)); -} - -template -HWY_API MFromD Eq128(D d, VFromD a, VFromD b) { - return MaskFromVec(detail::Eq128Vec(d, a, b)); -} - -template -HWY_API MFromD Ne128(D d, VFromD a, VFromD b) { - return MaskFromVec(detail::Ne128Vec(d, a, b)); -} - -template -HWY_API MFromD Lt128Upper(D d, VFromD a, VFromD b) { - return MaskFromVec(detail::Lt128UpperVec(d, a, b)); -} - -template -HWY_API MFromD Eq128Upper(D d, VFromD a, VFromD b) { - return MaskFromVec(detail::Eq128UpperVec(d, a, b)); -} - -template -HWY_API MFromD Ne128Upper(D d, VFromD a, VFromD b) { - return MaskFromVec(detail::Ne128UpperVec(d, a, b)); -} - -// ------------------------------ Min128, Max128 (Lt128) - -// Avoids the extra MaskFromVec in Lt128. -template -HWY_API VFromD Min128(D d, VFromD a, VFromD b) { - return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b); -} - -template -HWY_API VFromD Max128(D d, VFromD a, VFromD b) { - return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b); -} - -template -HWY_API VFromD Min128Upper(D d, VFromD a, VFromD b) { - return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b); -} - -template -HWY_API VFromD Max128Upper(D d, VFromD a, VFromD b) { - return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b); -} - -// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex - -#if HWY_TARGET <= HWY_AVX3 - -#ifdef HWY_NATIVE_LEADING_ZERO_COUNT -#undef HWY_NATIVE_LEADING_ZERO_COUNT -#else -#define HWY_NATIVE_LEADING_ZERO_COUNT -#endif - -template ), HWY_IF_V_SIZE_LE_D(DFromV, 16)> -HWY_API V LeadingZeroCount(V v) { - return V{_mm_lzcnt_epi32(v.raw)}; -} - -template ), HWY_IF_V_SIZE_LE_D(DFromV, 16)> -HWY_API V LeadingZeroCount(V v) { - return V{_mm_lzcnt_epi64(v.raw)}; -} - -// HighestSetBitIndex and TrailingZeroCount is implemented in x86_512-inl.h -// for AVX3 targets - -#endif // HWY_TARGET <= HWY_AVX3 - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); - -// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - -// the warning seems to be issued at the call site of intrinsics, i.e. our code. 
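The Lt128/Eq128 helpers above treat each adjacent pair of u64 lanes as one 128-bit value; the truth table in Lt128Vec reduces to "less-than = ltH | (eqH & ltL)", and Min128/Max128 then select whole pairs based on that result. A scalar sketch of the same reasoning, assuming the two halves are available as plain uint64_t values (the helper names are illustrative only):

#include <cstdint>

// Scalar model of Lt128: compare (hi, lo) pairs as unsigned 128-bit numbers.
// Matches the truth table in Lt128Vec: result = ltH | (eqH & ltL).
bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi, uint64_t b_lo) {
  const bool ltH = (a_hi < b_hi);
  const bool eqH = (a_hi == b_hi);
  const bool ltL = (a_lo < b_lo);
  return ltH || (eqH && ltL);
}

// Min128 follows directly: keep a if a < b, else keep b (Max128 swaps a and b).
void Min128Scalar(uint64_t a_hi, uint64_t a_lo, uint64_t b_hi, uint64_t b_lo,
                  uint64_t* out_hi, uint64_t* out_lo) {
  const bool a_lt_b = Lt128Scalar(a_hi, a_lo, b_hi, b_lo);
  *out_hi = a_lt_b ? a_hi : b_hi;
  *out_lo = a_lt_b ? a_lo : b_lo;
}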
-HWY_DIAGNOSTICS(pop) diff --git a/deps/highway/include/hwy/ops/x86_256-inl.h b/deps/highway/include/hwy/ops/x86_256-inl.h deleted file mode 100644 index 2f188e72..00000000 --- a/deps/highway/include/hwy/ops/x86_256-inl.h +++ /dev/null @@ -1,7428 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when -// compiling for that target. -// External include guard in highway.h - see comment there. - -// WARNING: most operations do not cross 128-bit block boundaries. In -// particular, "Broadcast", pack and zip behavior may be surprising. - -// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL -#include "hwy/base.h" - -// Avoid uninitialized warnings in GCC's avx512fintrin.h - see -// https://github.com/google/highway/issues/710) -HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_GCC_ACTUAL -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") -HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, - ignored "-Wmaybe-uninitialized") -#endif - -// Must come before HWY_COMPILER_CLANGCL -#include // AVX2+ - -#if HWY_COMPILER_CLANGCL -// Including should be enough, but Clang's headers helpfully skip -// including these headers when _MSC_VER is defined, like when using clang-cl. -// Include these directly here. -#include -// avxintrin defines __m256i and must come before avx2intrin. -#include -#include // _pext_u64 -#include -#include -#include -#endif // HWY_COMPILER_CLANGCL - -// For half-width vectors. Already includes base.h. -#include "hwy/ops/shared-inl.h" -// Already included by shared-inl, but do it again to avoid IDE warnings. -#include "hwy/ops/x86_128-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { -namespace detail { - -template -struct Raw256 { - using type = __m256i; -}; -#if HWY_HAVE_FLOAT16 -template <> -struct Raw256 { - using type = __m256h; -}; -#endif // HWY_HAVE_FLOAT16 -template <> -struct Raw256 { - using type = __m256; -}; -template <> -struct Raw256 { - using type = __m256d; -}; - -} // namespace detail - -template -class Vec256 { - using Raw = typename detail::Raw256::type; - - public: - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV - - // Compound assignment. Only usable if there is a corresponding non-member - // binary operator overload. For example, only f32 and f64 support division. 
- HWY_INLINE Vec256& operator*=(const Vec256 other) { - return *this = (*this * other); - } - HWY_INLINE Vec256& operator/=(const Vec256 other) { - return *this = (*this / other); - } - HWY_INLINE Vec256& operator+=(const Vec256 other) { - return *this = (*this + other); - } - HWY_INLINE Vec256& operator-=(const Vec256 other) { - return *this = (*this - other); - } - HWY_INLINE Vec256& operator&=(const Vec256 other) { - return *this = (*this & other); - } - HWY_INLINE Vec256& operator|=(const Vec256 other) { - return *this = (*this | other); - } - HWY_INLINE Vec256& operator^=(const Vec256 other) { - return *this = (*this ^ other); - } - - Raw raw; -}; - -#if HWY_TARGET <= HWY_AVX3 - -namespace detail { - -// Template arg: sizeof(lane type) -template -struct RawMask256 {}; -template <> -struct RawMask256<1> { - using type = __mmask32; -}; -template <> -struct RawMask256<2> { - using type = __mmask16; -}; -template <> -struct RawMask256<4> { - using type = __mmask8; -}; -template <> -struct RawMask256<8> { - using type = __mmask8; -}; - -} // namespace detail - -template -struct Mask256 { - using Raw = typename detail::RawMask256::type; - - static Mask256 FromBits(uint64_t mask_bits) { - return Mask256{static_cast(mask_bits)}; - } - - Raw raw; -}; - -#else // AVX2 - -// FF..FF or 0. -template -struct Mask256 { - typename detail::Raw256::type raw; -}; - -#endif // AVX2 - -#if HWY_TARGET <= HWY_AVX3 -namespace detail { - -// Used by Expand() emulation, which is required for both AVX3 and AVX2. -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - return mask.raw; -} - -} // namespace detail -#endif // HWY_TARGET <= HWY_AVX3 - -template -using Full256 = Simd; - -// ------------------------------ BitCast - -namespace detail { - -HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; } -#if HWY_HAVE_FLOAT16 -HWY_INLINE __m256i BitCastToInteger(__m256h v) { - return _mm256_castph_si256(v); -} -#endif // HWY_HAVE_FLOAT16 -HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); } -HWY_INLINE __m256i BitCastToInteger(__m256d v) { - return _mm256_castpd_si256(v); -} - -template -HWY_INLINE Vec256 BitCastToByte(Vec256 v) { - return Vec256{BitCastToInteger(v.raw)}; -} - -// Cannot rely on function overloading because return types differ. -template -struct BitCastFromInteger256 { - HWY_INLINE __m256i operator()(__m256i v) { return v; } -}; -#if HWY_HAVE_FLOAT16 -template <> -struct BitCastFromInteger256 { - HWY_INLINE __m256h operator()(__m256i v) { return _mm256_castsi256_ph(v); } -}; -#endif // HWY_HAVE_FLOAT16 -template <> -struct BitCastFromInteger256 { - HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); } -}; -template <> -struct BitCastFromInteger256 { - HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); } -}; - -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, Vec256 v) { - return VFromD{BitCastFromInteger256>()(v.raw)}; -} - -} // namespace detail - -template -HWY_API VFromD BitCast(D d, Vec256 v) { - return detail::BitCastFromByte(d, detail::BitCastToByte(v)); -} - -// ------------------------------ Zero - -// Cannot use VFromD here because it is defined in terms of Zero. 
-template -HWY_API Vec256> Zero(D /* tag */) { - return Vec256>{_mm256_setzero_si256()}; -} -template -HWY_API Vec256 Zero(D /* tag */) { - return Vec256{_mm256_setzero_si256()}; -} -template -HWY_API Vec256 Zero(D /* tag */) { -#if HWY_HAVE_FLOAT16 - return Vec256{_mm256_setzero_ph()}; -#else - return Vec256{_mm256_setzero_si256()}; -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec256 Zero(D /* tag */) { - return Vec256{_mm256_setzero_ps()}; -} -template -HWY_API Vec256 Zero(D /* tag */) { - return Vec256{_mm256_setzero_pd()}; -} - -// ------------------------------ Set - -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm256_set1_epi8(static_cast(t))}; // NOLINT -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm256_set1_epi16(static_cast(t))}; // NOLINT -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm256_set1_epi32(static_cast(t))}; -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm256_set1_epi64x(static_cast(t))}; // NOLINT -} -// bfloat16_t is handled by x86_128-inl.h. -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec256 Set(D /* tag */, float16_t t) { - return Vec256{_mm256_set1_ph(t)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec256 Set(D /* tag */, float t) { - return Vec256{_mm256_set1_ps(t)}; -} -template -HWY_API Vec256 Set(D /* tag */, double t) { - return Vec256{_mm256_set1_pd(t)}; -} - -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") - -// Returns a vector with uninitialized elements. -template -HWY_API VFromD Undefined(D /* tag */) { - // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC - // generate an XOR instruction. - return VFromD{_mm256_undefined_si256()}; -} -template -HWY_API Vec256 Undefined(D /* tag */) { - return Vec256{_mm256_undefined_si256()}; -} -template -HWY_API Vec256 Undefined(D /* tag */) { -#if HWY_HAVE_FLOAT16 - return Vec256{_mm256_undefined_ph()}; -#else - return Vec256{_mm256_undefined_si256()}; -#endif -} -template -HWY_API Vec256 Undefined(D /* tag */) { - return Vec256{_mm256_undefined_ps()}; -} -template -HWY_API Vec256 Undefined(D /* tag */) { - return Vec256{_mm256_undefined_pd()}; -} - -HWY_DIAGNOSTICS(pop) - -// ------------------------------ ResizeBitCast - -// 32-byte vector to 32-byte vector (or 64-byte vector to 64-byte vector on -// AVX3) -template ))> -HWY_API VFromD ResizeBitCast(D d, FromV v) { - return BitCast(d, v); -} - -// 32-byte vector to 16-byte vector (or 64-byte vector to 32-byte vector on -// AVX3) -template )) / 2)> -HWY_API VFromD ResizeBitCast(D d, FromV v) { - const DFromV d_from; - const Half dh_from; - return BitCast(d, LowerHalf(dh_from, v)); -} - -// 32-byte vector (or 64-byte vector on AVX3) to <= 8-byte vector -template -HWY_API VFromD ResizeBitCast(D /*d*/, FromV v) { - return VFromD{ResizeBitCast(Full128>(), v).raw}; -} - -// <= 16-byte vector to 32-byte vector -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - return BitCast(d, Vec256{_mm256_castsi128_si256( - ResizeBitCast(Full128(), v).raw)}); -} - -// ================================================== LOGICAL - -// ------------------------------ And - -template -HWY_API Vec256 And(Vec256 a, Vec256 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm256_and_si256(a.raw, b.raw)}); -} - -HWY_API Vec256 And(Vec256 a, Vec256 b) { - return Vec256{_mm256_and_ps(a.raw, b.raw)}; -} -HWY_API Vec256 And(Vec256 a, Vec256 b) { - return 
Vec256{_mm256_and_pd(a.raw, b.raw)}; -} - -// ------------------------------ AndNot - -// Returns ~not_mask & mask. -template -HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast( - d, VFromD{_mm256_andnot_si256(not_mask.raw, mask.raw)}); -} -HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { - return Vec256{_mm256_andnot_ps(not_mask.raw, mask.raw)}; -} -HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { - return Vec256{_mm256_andnot_pd(not_mask.raw, mask.raw)}; -} - -// ------------------------------ Or - -template -HWY_API Vec256 Or(Vec256 a, Vec256 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm256_or_si256(a.raw, b.raw)}); -} - -HWY_API Vec256 Or(Vec256 a, Vec256 b) { - return Vec256{_mm256_or_ps(a.raw, b.raw)}; -} -HWY_API Vec256 Or(Vec256 a, Vec256 b) { - return Vec256{_mm256_or_pd(a.raw, b.raw)}; -} - -// ------------------------------ Xor - -template -HWY_API Vec256 Xor(Vec256 a, Vec256 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm256_xor_si256(a.raw, b.raw)}); -} - -HWY_API Vec256 Xor(Vec256 a, Vec256 b) { - return Vec256{_mm256_xor_ps(a.raw, b.raw)}; -} -HWY_API Vec256 Xor(Vec256 a, Vec256 b) { - return Vec256{_mm256_xor_pd(a.raw, b.raw)}; -} - -// ------------------------------ Not -template -HWY_API Vec256 Not(const Vec256 v) { - const DFromV d; - using TU = MakeUnsigned; -#if HWY_TARGET <= HWY_AVX3 - const __m256i vu = BitCast(RebindToUnsigned(), v).raw; - return BitCast(d, Vec256{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)}); -#else - return Xor(v, BitCast(d, Vec256{_mm256_set1_epi32(-1)})); -#endif -} - -// ------------------------------ Xor3 -template -HWY_API Vec256 Xor3(Vec256 x1, Vec256 x2, Vec256 x3) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m256i ret = _mm256_ternarylogic_epi64( - BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); - return BitCast(d, VU{ret}); -#else - return Xor(x1, Xor(x2, x3)); -#endif -} - -// ------------------------------ Or3 -template -HWY_API Vec256 Or3(Vec256 o1, Vec256 o2, Vec256 o3) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m256i ret = _mm256_ternarylogic_epi64( - BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); - return BitCast(d, VU{ret}); -#else - return Or(o1, Or(o2, o3)); -#endif -} - -// ------------------------------ OrAnd -template -HWY_API Vec256 OrAnd(Vec256 o, Vec256 a1, Vec256 a2) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m256i ret = _mm256_ternarylogic_epi64( - BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); - return BitCast(d, VU{ret}); -#else - return Or(o, And(a1, a2)); -#endif -} - -// ------------------------------ IfVecThenElse -template -HWY_API Vec256 IfVecThenElse(Vec256 mask, Vec256 yes, Vec256 no) { -#if HWY_TARGET <= HWY_AVX3 - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw, - BitCast(du, yes).raw, - BitCast(du, no).raw, 0xCA)}); -#else - return IfThenElse(MaskFromVec(mask), yes, no); -#endif -} - -// ------------------------------ Operator overloads (internal-only if float) - -template -HWY_API Vec256 operator&(const Vec256 a, const Vec256 b) { - return And(a, b); -} - -template 
-HWY_API Vec256 operator|(const Vec256 a, const Vec256 b) { - return Or(a, b); -} - -template -HWY_API Vec256 operator^(const Vec256 a, const Vec256 b) { - return Xor(a, b); -} - -// ------------------------------ PopulationCount - -// 8/16 require BITALG, 32/64 require VPOPCNTDQ. -#if HWY_TARGET <= HWY_AVX3_DL - -#ifdef HWY_NATIVE_POPCNT -#undef HWY_NATIVE_POPCNT -#else -#define HWY_NATIVE_POPCNT -#endif - -namespace detail { - -template -HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<1> /* tag */, Vec256 v) { - return Vec256{_mm256_popcnt_epi8(v.raw)}; -} -template -HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<2> /* tag */, Vec256 v) { - return Vec256{_mm256_popcnt_epi16(v.raw)}; -} -template -HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<4> /* tag */, Vec256 v) { - return Vec256{_mm256_popcnt_epi32(v.raw)}; -} -template -HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<8> /* tag */, Vec256 v) { - return Vec256{_mm256_popcnt_epi64(v.raw)}; -} - -} // namespace detail - -template -HWY_API Vec256 PopulationCount(Vec256 v) { - return detail::PopulationCount(hwy::SizeTag(), v); -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -// ================================================== MASK - -#if HWY_TARGET <= HWY_AVX3 - -// ------------------------------ IfThenElse - -// Returns mask ? b : a. - -namespace detail { - -// Templates for signed/unsigned integer of a particular size. -template -HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<1> /* tag */, Mask256 mask, - Vec256 yes, Vec256 no) { - return Vec256{_mm256_mask_blend_epi8(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<2> /* tag */, Mask256 mask, - Vec256 yes, Vec256 no) { - return Vec256{_mm256_mask_blend_epi16(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<4> /* tag */, Mask256 mask, - Vec256 yes, Vec256 no) { - return Vec256{_mm256_mask_blend_epi32(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<8> /* tag */, Mask256 mask, - Vec256 yes, Vec256 no) { - return Vec256{_mm256_mask_blend_epi64(mask.raw, no.raw, yes.raw)}; -} - -} // namespace detail - -template -HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { - return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); -} -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 IfThenElse(Mask256 mask, - Vec256 yes, - Vec256 no) { - return Vec256{_mm256_mask_blend_ph(mask.raw, no.raw, yes.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, - Vec256 no) { - return Vec256{_mm256_mask_blend_ps(mask.raw, no.raw, yes.raw)}; -} -HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, - Vec256 no) { - return Vec256{_mm256_mask_blend_pd(mask.raw, no.raw, yes.raw)}; -} - -namespace detail { - -template -HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256 mask, - Vec256 yes) { - return Vec256{_mm256_maskz_mov_epi8(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256 mask, - Vec256 yes) { - return Vec256{_mm256_maskz_mov_epi16(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256 mask, - Vec256 yes) { - return Vec256{_mm256_maskz_mov_epi32(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256 mask, - Vec256 yes) { - return Vec256{_mm256_maskz_mov_epi64(mask.raw, yes.raw)}; -} - -} // namespace detail - -template -HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { - return 
detail::IfThenElseZero(hwy::SizeTag(), mask, yes); -} -HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { - return Vec256{_mm256_maskz_mov_ps(mask.raw, yes.raw)}; -} -HWY_API Vec256 IfThenElseZero(Mask256 mask, - Vec256 yes) { - return Vec256{_mm256_maskz_mov_pd(mask.raw, yes.raw)}; -} - -namespace detail { - -template -HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256 mask, - Vec256 no) { - // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. - return Vec256{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256 mask, - Vec256 no) { - return Vec256{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256 mask, - Vec256 no) { - return Vec256{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256 mask, - Vec256 no) { - return Vec256{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; -} - -} // namespace detail - -template -HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { - return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); -} -HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { - return Vec256{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; -} -HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { - return Vec256{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; -} - -template -HWY_API Vec256 ZeroIfNegative(const Vec256 v) { - static_assert(IsSigned(), "Only for float"); - // AVX3 MaskFromVec only looks at the MSB - return IfThenZeroElse(MaskFromVec(v), v); -} - -// ------------------------------ Mask logical - -namespace detail { - -template -HWY_INLINE Mask256 And(hwy::SizeTag<1> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kand_mask32(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask32>(a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask256 And(hwy::SizeTag<2> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kand_mask16(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask16>(a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask256 And(hwy::SizeTag<4> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kand_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask256 And(hwy::SizeTag<8> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kand_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(a.raw & b.raw)}; -#endif -} - -template -HWY_INLINE Mask256 AndNot(hwy::SizeTag<1> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kandn_mask32(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask32>(~a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask256 AndNot(hwy::SizeTag<2> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kandn_mask16(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask16>(~a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask256 AndNot(hwy::SizeTag<4> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kandn_mask8(a.raw, b.raw)}; -#else - return 
Mask256{static_cast<__mmask8>(~a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask256 AndNot(hwy::SizeTag<8> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kandn_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(~a.raw & b.raw)}; -#endif -} - -template -HWY_INLINE Mask256 Or(hwy::SizeTag<1> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kor_mask32(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask32>(a.raw | b.raw)}; -#endif -} -template -HWY_INLINE Mask256 Or(hwy::SizeTag<2> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kor_mask16(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask16>(a.raw | b.raw)}; -#endif -} -template -HWY_INLINE Mask256 Or(hwy::SizeTag<4> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kor_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(a.raw | b.raw)}; -#endif -} -template -HWY_INLINE Mask256 Or(hwy::SizeTag<8> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kor_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(a.raw | b.raw)}; -#endif -} - -template -HWY_INLINE Mask256 Xor(hwy::SizeTag<1> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kxor_mask32(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask32>(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask256 Xor(hwy::SizeTag<2> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kxor_mask16(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask16>(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask256 Xor(hwy::SizeTag<4> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kxor_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask256 Xor(hwy::SizeTag<8> /*tag*/, const Mask256 a, - const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kxor_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(a.raw ^ b.raw)}; -#endif -} - -template -HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, - const Mask256 a, const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kxnor_mask32(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)}; -#endif -} -template -HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<2> /*tag*/, - const Mask256 a, const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kxnor_mask16(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; -#endif -} -template -HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, - const Mask256 a, const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{_kxnor_mask8(a.raw, b.raw)}; -#else - return Mask256{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; -#endif -} -template -HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, - const Mask256 a, const Mask256 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask256{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; -#else - return Mask256{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; -#endif -} - -} // namespace detail - -template -HWY_API 
Mask256 And(const Mask256 a, Mask256 b) { - return detail::And(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { - return detail::AndNot(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask256 Or(const Mask256 a, Mask256 b) { - return detail::Or(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { - return detail::Xor(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask256 Not(const Mask256 m) { - // Flip only the valid bits. - constexpr size_t N = 32 / sizeof(T); - return Xor(m, Mask256::FromBits((1ull << N) - 1)); -} - -template -HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { - return detail::ExclusiveNeither(hwy::SizeTag(), a, b); -} - -#else // AVX2 - -// ------------------------------ Mask - -// Mask and Vec are the same (true = FF..FF). -template -HWY_API Mask256 MaskFromVec(const Vec256 v) { - return Mask256{v.raw}; -} - -template -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{v.raw}; -} - -// ------------------------------ IfThenElse - -// mask ? yes : no -template -HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { - return Vec256{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)}; -} -HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, - Vec256 no) { - return Vec256{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)}; -} -HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, - Vec256 no) { - return Vec256{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)}; -} - -// mask ? yes : 0 -template -HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { - const DFromV d; - return yes & VecFromMask(d, mask); -} - -// mask ? 0 : no -template -HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { - const DFromV d; - return AndNot(VecFromMask(d, mask), no); -} - -template -HWY_API Vec256 ZeroIfNegative(Vec256 v) { - static_assert(IsSigned(), "Only for float"); - const DFromV d; - const auto zero = Zero(d); - // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes - return IfThenElse(MaskFromVec(v), zero, v); -} - -// ------------------------------ Mask logical - -template -HWY_API Mask256 Not(const Mask256 m) { - const Full256 d; - return MaskFromVec(Not(VecFromMask(d, m))); -} - -template -HWY_API Mask256 And(const Mask256 a, Mask256 b) { - const Full256 d; - return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { - const Full256 d; - return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask256 Or(const Mask256 a, Mask256 b) { - const Full256 d; - return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { - const Full256 d; - return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); -} - -template -HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { - const Full256 d; - return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ================================================== COMPARE - -#if HWY_TARGET <= HWY_AVX3 - -// Comparisons set a mask bit to 1 if the condition is true, else 0. 
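The two mask sections above use different representations: with AVX-512, Mask256 is a compact bit register with one bit per lane (which is why Not must flip only the valid bits), whereas on AVX2 a mask is a full-width vector whose true lanes are all-ones and whose logical ops are ordinary vector ops. A small standalone sketch, using only AVX2 intrinsics known to exist (_mm256_cmpgt_epi32, _mm256_movemask_ps), of how the vector-mask view relates to the per-lane bitmask view; compile with -mavx2:

#include <immintrin.h>
#include <cstdio>

int main() {
  // AVX2 vector mask: each 32-bit lane is all-ones (true) or all-zero (false).
  const __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  const __m256i b = _mm256_set1_epi32(4);
  const __m256i vmask = _mm256_cmpgt_epi32(a, b);  // lanes 5..7 are true

  // The AVX-512 view of the same predicate is one bit per lane; the closest
  // AVX2 equivalent extracts the lane MSBs into an integer bitmask.
  const int bits = _mm256_movemask_ps(_mm256_castsi256_ps(vmask));
  std::printf("mask bits: 0x%02x\n", bits);  // 0xe0: lanes 5, 6, 7 set
  return 0;
}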
- -template -HWY_API MFromD RebindMask(DTo /*tag*/, Mask256 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - return MFromD{m.raw}; -} - -namespace detail { - -template -HWY_INLINE Mask256 TestBit(hwy::SizeTag<1> /*tag*/, const Vec256 v, - const Vec256 bit) { - return Mask256{_mm256_test_epi8_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask256 TestBit(hwy::SizeTag<2> /*tag*/, const Vec256 v, - const Vec256 bit) { - return Mask256{_mm256_test_epi16_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask256 TestBit(hwy::SizeTag<4> /*tag*/, const Vec256 v, - const Vec256 bit) { - return Mask256{_mm256_test_epi32_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask256 TestBit(hwy::SizeTag<8> /*tag*/, const Vec256 v, - const Vec256 bit) { - return Mask256{_mm256_test_epi64_mask(v.raw, bit.raw)}; -} - -} // namespace detail - -template -HWY_API Mask256 TestBit(const Vec256 v, const Vec256 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return detail::TestBit(hwy::SizeTag(), v, bit); -} - -// ------------------------------ Equality - -template -HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpeq_epi8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpeq_epi16_mask(a.raw, b.raw)}; -} -template -HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpeq_epi32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpeq_epi64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Mask256 operator==(Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -// ------------------------------ Inequality - -template -HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpneq_epi8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpneq_epi16_mask(a.raw, b.raw)}; -} -template -HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpneq_epi32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { - return Mask256{_mm256_cmpneq_epi64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Mask256 operator!=(Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} - -HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} - -// ------------------------------ Strict inequality - -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epi8_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epi16_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epi32_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epi64_mask(a.raw, b.raw)}; -} - -HWY_API Mask256 
operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epu8_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epu16_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epu32_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpgt_epu64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} - -// ------------------------------ Weak inequality - -#if HWY_HAVE_FLOAT16 -HWY_API Mask256 operator>=(Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} - -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpge_epi8_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpge_epi16_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpge_epi32_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpge_epi64_mask(a.raw, b.raw)}; -} - -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpge_epu8_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>=(const Vec256 a, - const Vec256 b) { - return Mask256{_mm256_cmpge_epu16_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>=(const Vec256 a, - const Vec256 b) { - return Mask256{_mm256_cmpge_epu32_mask(a.raw, b.raw)}; -} -HWY_API Mask256 operator>=(const Vec256 a, - const Vec256 b) { - return Mask256{_mm256_cmpge_epu64_mask(a.raw, b.raw)}; -} - -// ------------------------------ Mask - -namespace detail { - -template -HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256 v) { - return Mask256{_mm256_movepi8_mask(v.raw)}; -} -template -HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256 v) { - return Mask256{_mm256_movepi16_mask(v.raw)}; -} -template -HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256 v) { - return Mask256{_mm256_movepi32_mask(v.raw)}; -} -template -HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256 v) { - return Mask256{_mm256_movepi64_mask(v.raw)}; -} - -} // namespace detail - -template -HWY_API Mask256 MaskFromVec(const Vec256 v) { - return detail::MaskFromVec(hwy::SizeTag(), v); -} -// There do not seem to be native floating-point versions of these instructions. 
-template -HWY_API Mask256 MaskFromVec(const Vec256 v) { - const RebindToSigned> di; - return Mask256{MaskFromVec(BitCast(di, v)).raw}; -} - -template -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{_mm256_movm_epi8(v.raw)}; -} - -template -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{_mm256_movm_epi16(v.raw)}; -} - -template -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{_mm256_movm_epi32(v.raw)}; -} - -template -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{_mm256_movm_epi64(v.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{_mm256_castsi256_ph(_mm256_movm_epi16(v.raw))}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))}; -} - -HWY_API Vec256 VecFromMask(const Mask256 v) { - return Vec256{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))}; -} - -#else // AVX2 - -// Comparisons fill a lane with 1-bits if the condition is true, else 0. - -template -HWY_API MFromD RebindMask(DTo d_to, Mask256 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - const Full256 dfrom; - return MaskFromVec(BitCast(d_to, VecFromMask(dfrom, m))); -} - -template -HWY_API Mask256 TestBit(const Vec256 v, const Vec256 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return (v & bit) == bit; -} - -// ------------------------------ Equality - -template -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpeq_epi8(a.raw, b.raw)}; -} - -template -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpeq_epi16(a.raw, b.raw)}; -} - -template -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpeq_epi32(a.raw, b.raw)}; -} - -template -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmpeq_epi64(a.raw, b.raw)}; -} - -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -HWY_API Mask256 operator==(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -// ------------------------------ Inequality - -template -HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { - return Not(a == b); -} -HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)}; -} -HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { - return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)}; -} - -// ------------------------------ Strict inequality - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8 -// to perform an unsigned comparison instead of the intended signed. Workaround -// is to cast to an explicitly signed type. 
See https://godbolt.org/z/PL7Ujy -#if HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 903 -#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1 -#else -#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0 -#endif - -HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, - Vec256 b) { -#if HWY_AVX2_GCC_CMPGT8_WORKAROUND - using i8x32 = signed char __attribute__((__vector_size__(32))); - return Mask256{static_cast<__m256i>(reinterpret_cast(a.raw) > - reinterpret_cast(b.raw))}; -#else - return Mask256{_mm256_cmpgt_epi8(a.raw, b.raw)}; -#endif -} -HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmpgt_epi16(a.raw, b.raw)}; -} -HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmpgt_epi32(a.raw, b.raw)}; -} -HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmpgt_epi64(a.raw, b.raw)}; -} - -template -HWY_INLINE Mask256 Gt(hwy::UnsignedTag /*tag*/, Vec256 a, Vec256 b) { - const Full256 du; - const RebindToSigned di; - const Vec256 msb = Set(du, (LimitsMax() >> 1) + 1); - return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb))); -} - -HWY_API Mask256 Gt(hwy::FloatTag /*tag*/, Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)}; -} -HWY_API Mask256 Gt(hwy::FloatTag /*tag*/, Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)}; -} - -} // namespace detail - -template -HWY_API Mask256 operator>(Vec256 a, Vec256 b) { - return detail::Gt(hwy::TypeTag(), a, b); -} - -// ------------------------------ Weak inequality - -namespace detail { - -template -HWY_INLINE Mask256 Ge(hwy::SignedTag tag, Vec256 a, Vec256 b) { - return Not(Gt(tag, b, a)); -} - -template -HWY_INLINE Mask256 Ge(hwy::UnsignedTag tag, Vec256 a, Vec256 b) { - return Not(Gt(tag, b, a)); -} - -HWY_INLINE Mask256 Ge(hwy::FloatTag /*tag*/, Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)}; -} -HWY_INLINE Mask256 Ge(hwy::FloatTag /*tag*/, Vec256 a, - Vec256 b) { - return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)}; -} - -} // namespace detail - -template -HWY_API Mask256 operator>=(Vec256 a, Vec256 b) { - return detail::Ge(hwy::TypeTag(), a, b); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Reversed comparisons - -template -HWY_API Mask256 operator<(const Vec256 a, const Vec256 b) { - return b > a; -} - -template -HWY_API Mask256 operator<=(const Vec256 a, const Vec256 b) { - return b >= a; -} - -// ------------------------------ Min (Gt, IfThenElse) - -// Unsigned -HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_min_epu8(a.raw, b.raw)}; -} -HWY_API Vec256 Min(const Vec256 a, - const Vec256 b) { - return Vec256{_mm256_min_epu16(a.raw, b.raw)}; -} -HWY_API Vec256 Min(const Vec256 a, - const Vec256 b) { - return Vec256{_mm256_min_epu32(a.raw, b.raw)}; -} -HWY_API Vec256 Min(const Vec256 a, - const Vec256 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_min_epu64(a.raw, b.raw)}; -#else - const Full256 du; - const Full256 di; - const auto msb = Set(du, 1ull << 63); - const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); - return IfThenElse(gt, b, a); -#endif -} - -// Signed -HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_min_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_min_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { - return 
Vec256{_mm256_min_epi32(a.raw, b.raw)}; -} -HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_min_epi64(a.raw, b.raw)}; -#else - return IfThenElse(a < b, a, b); -#endif -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 Min(Vec256 a, Vec256 b) { - return Vec256{_mm256_min_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_min_ps(a.raw, b.raw)}; -} -HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_min_pd(a.raw, b.raw)}; -} - -// ------------------------------ Max (Gt, IfThenElse) - -// Unsigned -HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_max_epu8(a.raw, b.raw)}; -} -HWY_API Vec256 Max(const Vec256 a, - const Vec256 b) { - return Vec256{_mm256_max_epu16(a.raw, b.raw)}; -} -HWY_API Vec256 Max(const Vec256 a, - const Vec256 b) { - return Vec256{_mm256_max_epu32(a.raw, b.raw)}; -} -HWY_API Vec256 Max(const Vec256 a, - const Vec256 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_max_epu64(a.raw, b.raw)}; -#else - const Full256 du; - const Full256 di; - const auto msb = Set(du, 1ull << 63); - const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); - return IfThenElse(gt, a, b); -#endif -} - -// Signed -HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_max_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_max_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_max_epi32(a.raw, b.raw)}; -} -HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_max_epi64(a.raw, b.raw)}; -#else - return IfThenElse(a < b, b, a); -#endif -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 Max(Vec256 a, Vec256 b) { - return Vec256{_mm256_max_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_max_ps(a.raw, b.raw)}; -} -HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { - return Vec256{_mm256_max_pd(a.raw, b.raw)}; -} - -// ------------------------------ Iota - -namespace detail { - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm256_set_epi8( - static_cast(31), static_cast(30), static_cast(29), - static_cast(28), static_cast(27), static_cast(26), - static_cast(25), static_cast(24), static_cast(23), - static_cast(22), static_cast(21), static_cast(20), - static_cast(19), static_cast(18), static_cast(17), - static_cast(16), static_cast(15), static_cast(14), - static_cast(13), static_cast(12), static_cast(11), - static_cast(10), static_cast(9), static_cast(8), - static_cast(7), static_cast(6), static_cast(5), - static_cast(4), static_cast(3), static_cast(2), - static_cast(1), static_cast(0))}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm256_set_epi16( - int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, int16_t{11}, - int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, int16_t{5}, - int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})}; -} - -#if HWY_HAVE_FLOAT16 -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{ - _mm256_set_ph(float16_t{15}, float16_t{14}, float16_t{13}, float16_t{12}, - float16_t{11}, float16_t{10}, float16_t{9}, float16_t{8}, - float16_t{7}, float16_t{6}, float16_t{5}, float16_t{4}, - float16_t{3}, float16_t{2}, float16_t{1}, float16_t{0})}; -} -#endif // 
HWY_HAVE_FLOAT16 - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm256_set_epi32(int32_t{7}, int32_t{6}, int32_t{5}, - int32_t{4}, int32_t{3}, int32_t{2}, - int32_t{1}, int32_t{0})}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{ - _mm256_set_epi64x(int64_t{3}, int64_t{2}, int64_t{1}, int64_t{0})}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{ - _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm256_set_pd(3.0, 2.0, 1.0, 0.0)}; -} - -} // namespace detail - -template -HWY_API VFromD Iota(D d, const T2 first) { - return detail::Iota0(d) + Set(d, static_cast>(first)); -} - -// ------------------------------ FirstN (Iota, Lt) - -template > -HWY_API M FirstN(const D d, size_t n) { -#if HWY_TARGET <= HWY_AVX3 - (void)d; - constexpr size_t kN = MaxLanes(d); -#if HWY_ARCH_X86_64 - const uint64_t all = (1ull << kN) - 1; - // BZHI only looks at the lower 8 bits of n! - return M::FromBits((n > 255) ? all : _bzhi_u64(all, n)); -#else - const uint32_t all = static_cast((1ull << kN) - 1); - // BZHI only looks at the lower 8 bits of n! - return M::FromBits((n > 255) ? all - : _bzhi_u32(all, static_cast(n))); -#endif // HWY_ARCH_X86_64 -#else - const RebindToSigned di; // Signed comparisons are cheaper. - using TI = TFromD; - return RebindMask(d, detail::Iota0(di) < Set(di, static_cast(n))); -#endif -} - -// ================================================== ARITHMETIC - -// ------------------------------ Addition - -// Unsigned -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi32(a.raw, b.raw)}; -} -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi64(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi32(a.raw, b.raw)}; -} -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_ps(a.raw, b.raw)}; -} -HWY_API Vec256 operator+(Vec256 a, Vec256 b) { - return Vec256{_mm256_add_pd(a.raw, b.raw)}; -} - -// ------------------------------ Subtraction - -// Unsigned -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_epi32(a.raw, b.raw)}; -} -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_epi64(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_epi32(a.raw, b.raw)}; -} -HWY_API Vec256 operator-(Vec256 
a, Vec256 b) { - return Vec256{_mm256_sub_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_ps(a.raw, b.raw)}; -} -HWY_API Vec256 operator-(Vec256 a, Vec256 b) { - return Vec256{_mm256_sub_pd(a.raw, b.raw)}; -} - -// ------------------------------ SumsOf8 -HWY_API Vec256 SumsOf8(Vec256 v) { - return Vec256{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())}; -} - -HWY_API Vec256 SumsOf8AbsDiff(Vec256 a, Vec256 b) { - return Vec256{_mm256_sad_epu8(a.raw, b.raw)}; -} - -// ------------------------------ SaturatedAdd - -// Returns a + b clamped to the destination range. - -// Unsigned -HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { - return Vec256{_mm256_adds_epu8(a.raw, b.raw)}; -} -HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { - return Vec256{_mm256_adds_epu16(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { - return Vec256{_mm256_adds_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { - return Vec256{_mm256_adds_epi16(a.raw, b.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { - const DFromV d; - const auto sum = a + b; - const auto overflow_mask = MaskFromVec( - Vec256{_mm256_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); - const auto i32_max = Set(d, LimitsMax()); - const Vec256 overflow_result{_mm256_mask_ternarylogic_epi32( - i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, sum); -} - -HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { - const DFromV d; - const auto sum = a + b; - const auto overflow_mask = MaskFromVec( - Vec256{_mm256_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); - const auto i64_max = Set(d, LimitsMax()); - const Vec256 overflow_result{_mm256_mask_ternarylogic_epi64( - i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, sum); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ SaturatedSub - -// Returns a - b clamped to the destination range. 
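// Editor's sketch (added for clarity; not part of the original header). The
// AVX3 SaturatedAdd above, and SaturatedSub below, detect signed overflow with
// a ternary-logic expression and then select LimitsMax()/LimitsMin() from the
// sign of `a`. A minimal scalar equivalent of the i32 add case, assuming
// two's-complement wrap-around; the function name is the editor's own.
#include <cstdint>
#include <limits>

static inline int32_t SaturatedAddI32(int32_t a, int32_t b) {
  // Wrap-around sum via unsigned arithmetic (avoids signed-overflow UB).
  const int32_t sum =
      static_cast<int32_t>(static_cast<uint32_t>(a) + static_cast<uint32_t>(b));
  // Overflow iff a and b share a sign and the wrapped sum's sign differs;
  // for subtraction the analogous check is ((a ^ b) & (a ^ diff)) < 0.
  const bool overflow = ((~(a ^ b) & (a ^ sum)) < 0);
  if (!overflow) return sum;
  // As in the vector code, the saturated value depends only on the sign of a.
  return (a < 0) ? std::numeric_limits<int32_t>::min()
                 : std::numeric_limits<int32_t>::max();
}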
- -// Unsigned -HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) { - return Vec256{_mm256_subs_epu8(a.raw, b.raw)}; -} -HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) { - return Vec256{_mm256_subs_epu16(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) { - return Vec256{_mm256_subs_epi8(a.raw, b.raw)}; -} -HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) { - return Vec256{_mm256_subs_epi16(a.raw, b.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) { - const DFromV d; - const auto diff = a - b; - const auto overflow_mask = MaskFromVec( - Vec256{_mm256_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); - const auto i32_max = Set(d, LimitsMax()); - const Vec256 overflow_result{_mm256_mask_ternarylogic_epi32( - i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, diff); -} - -HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) { - const DFromV d; - const auto diff = a - b; - const auto overflow_mask = MaskFromVec( - Vec256{_mm256_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); - const auto i64_max = Set(d, LimitsMax()); - const Vec256 overflow_result{_mm256_mask_ternarylogic_epi64( - i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, diff); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Average - -// Returns (a + b + 1) / 2 - -// Unsigned -HWY_API Vec256 AverageRound(Vec256 a, Vec256 b) { - return Vec256{_mm256_avg_epu8(a.raw, b.raw)}; -} -HWY_API Vec256 AverageRound(Vec256 a, Vec256 b) { - return Vec256{_mm256_avg_epu16(a.raw, b.raw)}; -} - -// ------------------------------ Abs (Sub) - -// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. -HWY_API Vec256 Abs(Vec256 v) { -#if HWY_COMPILER_MSVC - // Workaround for incorrect codegen? (wrong result) - const DFromV d; - const auto zero = Zero(d); - return Vec256{_mm256_max_epi8(v.raw, (zero - v).raw)}; -#else - return Vec256{_mm256_abs_epi8(v.raw)}; -#endif -} -HWY_API Vec256 Abs(const Vec256 v) { - return Vec256{_mm256_abs_epi16(v.raw)}; -} -HWY_API Vec256 Abs(const Vec256 v) { - return Vec256{_mm256_abs_epi32(v.raw)}; -} -// i64 is implemented after BroadcastSignBit. - -template -HWY_API Vec256 Abs(const Vec256 v) { - const DFromV d; - const RebindToSigned di; - using TI = TFromD; - return v & BitCast(d, Set(di, static_cast(~SignMask()))); -} - -// ------------------------------ Integer multiplication - -// Unsigned -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi32(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi16(a.raw, b.raw)}; -} -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi32(a.raw, b.raw)}; -} - -// Returns the upper 16 bits of a * b in each lane. -HWY_API Vec256 MulHigh(Vec256 a, Vec256 b) { - return Vec256{_mm256_mulhi_epu16(a.raw, b.raw)}; -} -HWY_API Vec256 MulHigh(Vec256 a, Vec256 b) { - return Vec256{_mm256_mulhi_epi16(a.raw, b.raw)}; -} - -HWY_API Vec256 MulFixedPoint15(Vec256 a, Vec256 b) { - return Vec256{_mm256_mulhrs_epi16(a.raw, b.raw)}; -} - -// Multiplies even lanes (0, 2 ..) and places the double-wide result into -// even and the upper half into its odd neighbor lane. 
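// Editor's sketch (added for clarity; not part of the original header): a
// scalar model of MulEven for 32-bit lanes, matching _mm256_mul_epu32. Each
// 64-bit output spans an even/odd lane pair and holds the full-width product
// of the corresponding even-indexed inputs. Names are the editor's own.
#include <cstdint>
#include <cstddef>

static inline void MulEvenU32(const uint32_t a[8], const uint32_t b[8],
                              uint64_t out[4]) {
  for (size_t i = 0; i < 4; ++i) {
    // out[i] occupies 32-bit lanes 2*i (low half) and 2*i+1 (high half).
    out[i] = static_cast<uint64_t>(a[2 * i]) * b[2 * i];
  }
}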
-HWY_API Vec256 MulEven(Vec256 a, Vec256 b) { - return Vec256{_mm256_mul_epi32(a.raw, b.raw)}; -} -HWY_API Vec256 MulEven(Vec256 a, Vec256 b) { - return Vec256{_mm256_mul_epu32(a.raw, b.raw)}; -} - -// ------------------------------ ShiftLeft - -template -HWY_API Vec256 ShiftLeft(Vec256 v) { - return Vec256{_mm256_slli_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftLeft(Vec256 v) { - return Vec256{_mm256_slli_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftLeft(Vec256 v) { - return Vec256{_mm256_slli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftLeft(Vec256 v) { - return Vec256{_mm256_slli_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftLeft(Vec256 v) { - return Vec256{_mm256_slli_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftLeft(Vec256 v) { - return Vec256{_mm256_slli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftLeft(const Vec256 v) { - const Full256 d8; - const RepartitionToWide d16; - const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); - return kBits == 1 - ? (v + v) - : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); -} - -// ------------------------------ ShiftRight - -template -HWY_API Vec256 ShiftRight(Vec256 v) { - return Vec256{_mm256_srli_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftRight(Vec256 v) { - return Vec256{_mm256_srli_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftRight(Vec256 v) { - return Vec256{_mm256_srli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftRight(Vec256 v) { - const Full256 d8; - // Use raw instead of BitCast to support N=1. - const Vec256 shifted{ShiftRight(Vec256{v.raw}).raw}; - return shifted & Set(d8, 0xFF >> kBits); -} - -template -HWY_API Vec256 ShiftRight(Vec256 v) { - return Vec256{_mm256_srai_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftRight(Vec256 v) { - return Vec256{_mm256_srai_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec256 ShiftRight(Vec256 v) { - const Full256 di; - const Full256 du; - const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); - const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// i64 is implemented after BroadcastSignBit. - -// ------------------------------ RotateRight - -template -HWY_API Vec256 RotateRight(const Vec256 v) { - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - // AVX3 does not support 8/16-bit. 
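  // Editor's note: this falls back to the usual rotate identity
  // rot_r(v, k) == (v >> k) | (v << (N - k)); e.g. for uint8_t,
  // RotateRight<4>(0x12) == (0x12 >> 4) | (0x12 << 4) == 0x01 | 0x20 == 0x21.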
- return Or(ShiftRight(v), - ShiftLeft(v)); -} - -template -HWY_API Vec256 RotateRight(const Vec256 v) { - static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_ror_epi32(v.raw, kBits)}; -#else - if (kBits == 0) return v; - return Or(ShiftRight(v), ShiftLeft(v)); -#endif -} - -template -HWY_API Vec256 RotateRight(const Vec256 v) { - static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_ror_epi64(v.raw, kBits)}; -#else - if (kBits == 0) return v; - return Or(ShiftRight(v), ShiftLeft(v)); -#endif -} - -// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) - -HWY_API Vec256 BroadcastSignBit(const Vec256 v) { - const DFromV d; - return VecFromMask(v < Zero(d)); -} - -HWY_API Vec256 BroadcastSignBit(const Vec256 v) { - return ShiftRight<15>(v); -} - -HWY_API Vec256 BroadcastSignBit(const Vec256 v) { - return ShiftRight<31>(v); -} - -HWY_API Vec256 BroadcastSignBit(const Vec256 v) { -#if HWY_TARGET == HWY_AVX2 - const DFromV d; - return VecFromMask(v < Zero(d)); -#else - return Vec256{_mm256_srai_epi64(v.raw, 63)}; -#endif -} - -template -HWY_API Vec256 ShiftRight(const Vec256 v) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{ - _mm256_srai_epi64(v.raw, static_cast(kBits))}; -#else - const Full256 di; - const Full256 du; - const auto right = BitCast(di, ShiftRight(BitCast(du, v))); - const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); - return right | sign; -#endif -} - -HWY_API Vec256 Abs(const Vec256 v) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_abs_epi64(v.raw)}; -#else - const DFromV d; - const auto zero = Zero(d); - return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); -#endif -} - -// ------------------------------ IfNegativeThenElse (BroadcastSignBit) -HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, - Vec256 no) { - // int8: AVX2 IfThenElse only looks at the MSB. - return IfThenElse(MaskFromVec(v), yes, no); -} - -template -HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { - static_assert(IsSigned(), "Only works for signed/float"); - -#if HWY_TARGET <= HWY_AVX3 - const auto mask = MaskFromVec(v); -#else - // 16-bit: no native blendv on AVX2, so copy sign to lower byte's MSB. - const DFromV d; - const RebindToSigned di; - const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); -#endif - - return IfThenElse(mask, yes, no); -} - -template -HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { - static_assert(IsSigned(), "Only works for signed/float"); - -#if HWY_TARGET <= HWY_AVX3 - // No need to cast to float on AVX3 as IfThenElse only looks at the MSB on - // AVX3 - return IfThenElse(MaskFromVec(v), yes, no); -#else - const DFromV d; - const RebindToFloat df; - // 32/64-bit: use float IfThenElse, which only looks at the MSB. 
- const MFromD msb = MaskFromVec(BitCast(df, v)); - return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no))); -#endif -} - -// ------------------------------ ShiftLeftSame - -HWY_API Vec256 ShiftLeftSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_slli_epi16(v.raw, bits)}; - } -#endif - return Vec256{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec256 ShiftLeftSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_slli_epi32(v.raw, bits)}; - } -#endif - return Vec256{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec256 ShiftLeftSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_slli_epi64(v.raw, bits)}; - } -#endif - return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_slli_epi16(v.raw, bits)}; - } -#endif - return Vec256{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_slli_epi32(v.raw, bits)}; - } -#endif - return Vec256{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_slli_epi64(v.raw, bits)}; - } -#endif - return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { - const Full256 d8; - const RepartitionToWide d16; - const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); - return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); -} - -// ------------------------------ ShiftRightSame (BroadcastSignBit) - -HWY_API Vec256 ShiftRightSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_srli_epi16(v.raw, bits)}; - } -#endif - return Vec256{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec256 ShiftRightSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_srli_epi32(v.raw, bits)}; - } -#endif - return Vec256{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec256 ShiftRightSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_srli_epi64(v.raw, bits)}; - } -#endif - return Vec256{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { - const Full256 d8; - const RepartitionToWide d16; - const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); - return shifted & Set(d8, static_cast(0xFF >> bits)); -} - -HWY_API Vec256 ShiftRightSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_srai_epi16(v.raw, bits)}; - } -#endif - return Vec256{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec256 ShiftRightSame(const Vec256 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{_mm256_srai_epi32(v.raw, bits)}; - } -#endif - return 
Vec256{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec256 ShiftRightSame(const Vec256 v, - const int bits) { -#if HWY_TARGET <= HWY_AVX3 -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec256{ - _mm256_srai_epi64(v.raw, static_cast(bits))}; - } -#endif - return Vec256{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -#else - const Full256 di; - const Full256 du; - const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); - const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); - return right | sign; -#endif -} - -HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { - const Full256 di; - const Full256 du; - const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); - const auto shifted_sign = - BitCast(di, Set(du, static_cast(0x80 >> bits))); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// ------------------------------ Neg (Xor, Sub) - -// Tag dispatch instead of SFINAE for MSVC 2017 compatibility -namespace detail { - -template -HWY_INLINE Vec256 Neg(hwy::FloatTag /*tag*/, const Vec256 v) { - const DFromV d; - return Xor(v, SignBit(d)); -} - -template -HWY_INLINE Vec256 Neg(hwy::SpecialTag /*tag*/, const Vec256 v) { - const DFromV d; - return Xor(v, SignBit(d)); -} - -// Not floating-point -template -HWY_INLINE Vec256 Neg(hwy::SignedTag /*tag*/, const Vec256 v) { - const DFromV d; - return Zero(d) - v; -} - -} // namespace detail - -template -HWY_API Vec256 Neg(const Vec256 v) { - return detail::Neg(hwy::TypeTag(), v); -} - -// ------------------------------ Floating-point mul / div - -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mul_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mul_ps(a.raw, b.raw)}; -} -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mul_pd(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 operator/(Vec256 a, Vec256 b) { - return Vec256{_mm256_div_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 operator/(Vec256 a, Vec256 b) { - return Vec256{_mm256_div_ps(a.raw, b.raw)}; -} -HWY_API Vec256 operator/(Vec256 a, Vec256 b) { - return Vec256{_mm256_div_pd(a.raw, b.raw)}; -} - -// Approximate reciprocal -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 ApproximateReciprocal(Vec256 v) { - return Vec256{_mm256_rcp_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Vec256 ApproximateReciprocal(Vec256 v) { - return Vec256{_mm256_rcp_ps(v.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -HWY_API Vec256 ApproximateReciprocal(Vec256 v) { - return Vec256{_mm256_rcp14_pd(v.raw)}; -} -#endif - -// ------------------------------ Floating-point multiply-add variants - -#if HWY_HAVE_FLOAT16 - -HWY_API Vec256 MulAdd(Vec256 mul, Vec256 x, - Vec256 add) { - return Vec256{_mm256_fmadd_ph(mul.raw, x.raw, add.raw)}; -} - -HWY_API Vec256 NegMulAdd(Vec256 mul, Vec256 x, - Vec256 add) { - return Vec256{_mm256_fnmadd_ph(mul.raw, x.raw, add.raw)}; -} - -HWY_API Vec256 MulSub(Vec256 mul, Vec256 x, - Vec256 sub) { - return Vec256{_mm256_fmsub_ph(mul.raw, x.raw, sub.raw)}; -} - -HWY_API Vec256 NegMulSub(Vec256 mul, Vec256 x, - Vec256 sub) { - return Vec256{_mm256_fnmsub_ph(mul.raw, x.raw, sub.raw)}; -} - -#endif // HWY_HAVE_FLOAT16 - -HWY_API Vec256 MulAdd(Vec256 mul, Vec256 x, - Vec256 add) { -#ifdef HWY_DISABLE_BMI2_FMA - return mul * x + add; -#else - return Vec256{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)}; -#endif -} -HWY_API Vec256 MulAdd(Vec256 mul, Vec256 
x, - Vec256 add) { -#ifdef HWY_DISABLE_BMI2_FMA - return mul * x + add; -#else - return Vec256{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)}; -#endif -} - -HWY_API Vec256 NegMulAdd(Vec256 mul, Vec256 x, - Vec256 add) { -#ifdef HWY_DISABLE_BMI2_FMA - return add - mul * x; -#else - return Vec256{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)}; -#endif -} -HWY_API Vec256 NegMulAdd(Vec256 mul, Vec256 x, - Vec256 add) { -#ifdef HWY_DISABLE_BMI2_FMA - return add - mul * x; -#else - return Vec256{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)}; -#endif -} - -HWY_API Vec256 MulSub(Vec256 mul, Vec256 x, - Vec256 sub) { -#ifdef HWY_DISABLE_BMI2_FMA - return mul * x - sub; -#else - return Vec256{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)}; -#endif -} -HWY_API Vec256 MulSub(Vec256 mul, Vec256 x, - Vec256 sub) { -#ifdef HWY_DISABLE_BMI2_FMA - return mul * x - sub; -#else - return Vec256{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)}; -#endif -} - -HWY_API Vec256 NegMulSub(Vec256 mul, Vec256 x, - Vec256 sub) { -#ifdef HWY_DISABLE_BMI2_FMA - return Neg(mul * x) - sub; -#else - return Vec256{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)}; -#endif -} -HWY_API Vec256 NegMulSub(Vec256 mul, Vec256 x, - Vec256 sub) { -#ifdef HWY_DISABLE_BMI2_FMA - return Neg(mul * x) - sub; -#else - return Vec256{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)}; -#endif -} - -// ------------------------------ Floating-point square root - -// Full precision square root -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 Sqrt(Vec256 v) { - return Vec256{_mm256_sqrt_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 Sqrt(Vec256 v) { - return Vec256{_mm256_sqrt_ps(v.raw)}; -} -HWY_API Vec256 Sqrt(Vec256 v) { - return Vec256{_mm256_sqrt_pd(v.raw)}; -} - -// Approximate reciprocal square root -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 ApproximateReciprocalSqrt(Vec256 v) { - return Vec256{_mm256_rsqrt_ph(v.raw)}; -} -#endif -HWY_API Vec256 ApproximateReciprocalSqrt(Vec256 v) { - return Vec256{_mm256_rsqrt_ps(v.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -HWY_API Vec256 ApproximateReciprocalSqrt(Vec256 v) { -#if HWY_COMPILER_MSVC - const DFromV d; - return Vec256{_mm256_mask_rsqrt14_pd( - Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)}; -#else - return Vec256{_mm256_rsqrt14_pd(v.raw)}; -#endif -} -#endif - -// ------------------------------ Floating-point rounding - -// Toward nearest integer, tie to even -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 Round(Vec256 v) { - return Vec256{_mm256_roundscale_ph( - v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 Round(Vec256 v) { - return Vec256{ - _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec256 Round(Vec256 v) { - return Vec256{ - _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} - -// Toward zero, aka truncate -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 Trunc(Vec256 v) { - return Vec256{ - _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 Trunc(Vec256 v) { - return Vec256{ - _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec256 Trunc(Vec256 v) { - return Vec256{ - _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} - -// Toward +infinity, aka ceiling -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 Ceil(Vec256 v) { - return Vec256{ - _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 Ceil(Vec256 v) { - return Vec256{ - _mm256_round_ps(v.raw, 
_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec256 Ceil(Vec256 v) { - return Vec256{ - _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} - -// Toward -infinity, aka floor -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 Floor(Vec256 v) { - return Vec256{ - _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 Floor(Vec256 v) { - return Vec256{ - _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec256 Floor(Vec256 v) { - return Vec256{ - _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} - -// ------------------------------ Floating-point classification - -#if HWY_HAVE_FLOAT16 || HWY_IDE - -HWY_API Mask256 IsNaN(Vec256 v) { - return Mask256{_mm256_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -} - -HWY_API Mask256 IsInf(Vec256 v) { - return Mask256{_mm256_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} - -HWY_API Mask256 IsFinite(Vec256 v) { - // fpclass doesn't have a flag for positive, so we have to check for inf/NaN - // and negate the mask. - return Not(Mask256{_mm256_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} - -#endif // HWY_HAVE_FLOAT16 - -HWY_API Mask256 IsNaN(Vec256 v) { -#if HWY_TARGET <= HWY_AVX3 - return Mask256{_mm256_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -#else - return Mask256{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)}; -#endif -} -HWY_API Mask256 IsNaN(Vec256 v) { -#if HWY_TARGET <= HWY_AVX3 - return Mask256{_mm256_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -#else - return Mask256{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)}; -#endif -} - -#if HWY_TARGET <= HWY_AVX3 - -HWY_API Mask256 IsInf(Vec256 v) { - return Mask256{_mm256_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} -HWY_API Mask256 IsInf(Vec256 v) { - return Mask256{_mm256_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} - -HWY_API Mask256 IsFinite(Vec256 v) { - // fpclass doesn't have a flag for positive, so we have to check for inf/NaN - // and negate the mask. - return Not(Mask256{_mm256_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} -HWY_API Mask256 IsFinite(Vec256 v) { - return Not(Mask256{_mm256_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} - -#else - -template -HWY_API Mask256 IsInf(const Vec256 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV d; - const RebindToSigned di; - const VFromD vi = BitCast(di, v); - // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. - return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2()))); -} - -// Returns whether normal/subnormal/zero. -template -HWY_API Mask256 IsFinite(const Vec256 v) { - static_assert(IsFloat(), "Only for float"); - const DFromV d; - const RebindToUnsigned du; - const RebindToSigned di; // cheaper than unsigned comparison - const VFromD vu = BitCast(du, v); - // Shift left to clear the sign bit, then right so we can compare with the - // max exponent (cannot compare with MaxExponentTimes2 directly because it is - // negative and non-negative floats would be greater). 
MSVC seems to generate - // incorrect code if we instead add vu + vu. - const VFromD exp = - BitCast(di, ShiftRight() + 1>(ShiftLeft<1>(vu))); - return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ================================================== MEMORY - -// ------------------------------ Load - -template -HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) { - return VFromD{ - _mm256_load_si256(reinterpret_cast(aligned))}; -} -// bfloat16_t is handled by x86_128-inl.h. -template -HWY_API Vec256 Load(D d, const float16_t* HWY_RESTRICT aligned) { -#if HWY_HAVE_FLOAT16 - (void)d; - return Vec256{_mm256_load_ph(aligned)}; -#else - const RebindToUnsigned du; - return BitCast(d, Load(du, reinterpret_cast(aligned))); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec256 Load(D /* tag */, const float* HWY_RESTRICT aligned) { - return Vec256{_mm256_load_ps(aligned)}; -} -template -HWY_API Vec256 Load(D /* tag */, const double* HWY_RESTRICT aligned) { - return Vec256{_mm256_load_pd(aligned)}; -} - -template -HWY_API VFromD LoadU(D /* tag */, const TFromD* HWY_RESTRICT p) { - return VFromD{_mm256_loadu_si256(reinterpret_cast(p))}; -} -// bfloat16_t is handled by x86_128-inl.h. -template -HWY_API Vec256 LoadU(D d, const float16_t* HWY_RESTRICT p) { -#if HWY_HAVE_FLOAT16 - (void)d; - return Vec256{_mm256_loadu_ph(p)}; -#else - const RebindToUnsigned du; - return BitCast(d, LoadU(du, reinterpret_cast(p))); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec256 LoadU(D /* tag */, const float* HWY_RESTRICT p) { - return Vec256{_mm256_loadu_ps(p)}; -} -template -HWY_API Vec256 LoadU(D /* tag */, const double* HWY_RESTRICT p) { - return Vec256{_mm256_loadu_pd(p)}; -} - -// ------------------------------ MaskedLoad - -#if HWY_TARGET <= HWY_AVX3 - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm256_maskz_loadu_epi8(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - return BitCast(d, VFromD{_mm256_maskz_loadu_epi16(m.raw, p)}); -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm256_maskz_loadu_epi32(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm256_maskz_loadu_epi64(m.raw, p)}; -} - -template -HWY_API Vec256 MaskedLoad(Mask256 m, D /* tag */, - const float* HWY_RESTRICT p) { - return Vec256{_mm256_maskz_loadu_ps(m.raw, p)}; -} - -template -HWY_API Vec256 MaskedLoad(Mask256 m, D /* tag */, - const double* HWY_RESTRICT p) { - return Vec256{_mm256_maskz_loadu_pd(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm256_mask_loadu_epi8(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - return BitCast( - d, VFromD{_mm256_mask_loadu_epi16(v.raw, m.raw, p)}); -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm256_mask_loadu_epi32(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm256_mask_loadu_epi64(v.raw, m.raw, p)}; -} - -template -HWY_API Vec256 
MaskedLoadOr(VFromD v, Mask256 m, D /* tag */, - const float* HWY_RESTRICT p) { - return Vec256{_mm256_mask_loadu_ps(v.raw, m.raw, p)}; -} - -template -HWY_API Vec256 MaskedLoadOr(VFromD v, Mask256 m, D /* tag */, - const double* HWY_RESTRICT p) { - return Vec256{_mm256_mask_loadu_pd(v.raw, m.raw, p)}; -} - -#else // AVX2 - -// There is no maskload_epi8/16, so blend instead. -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - return IfThenElseZero(m, LoadU(d, p)); -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - return VFromD{_mm256_maskload_epi32(pi, m.raw)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - return VFromD{_mm256_maskload_epi64(pi, m.raw)}; -} - -template -HWY_API Vec256 MaskedLoad(Mask256 m, D d, - const float* HWY_RESTRICT p) { - const Vec256 mi = - BitCast(RebindToSigned(), VecFromMask(d, m)); - return Vec256{_mm256_maskload_ps(p, mi.raw)}; -} - -template -HWY_API Vec256 MaskedLoad(Mask256 m, D d, - const double* HWY_RESTRICT p) { - const Vec256 mi = - BitCast(RebindToSigned(), VecFromMask(d, m)); - return Vec256{_mm256_maskload_pd(p, mi.raw)}; -} - -#endif - -// ------------------------------ LoadDup128 - -// Loads 128 bit and duplicates into both 128-bit halves. This avoids the -// 3-cycle cost of moving data between 128-bit halves and avoids port 5. -template -HWY_API VFromD LoadDup128(D /* tag */, const TFromD* HWY_RESTRICT p) { - const Full128> d128; -#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 - // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note - // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the - // upper half undefined) is fine because we're overwriting that anyway. - // This workaround seems in turn to generate incorrect code in MSVC 2022 - // (19.31), so use broadcastsi128 there. - const __m128i v128 = LoadU(d128, p).raw; - return VFromD{ - _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)}; -#else - // The preferred path. This is perhaps surprising, because vbroadcasti128 - // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to - // pattern-match this to vbroadcastf128 with a memory operand as desired. 
- return VFromD{_mm256_broadcastsi128_si256(LoadU(d128, p).raw)}; -#endif -} -template -HWY_API Vec256 LoadDup128(D /* tag */, const float* HWY_RESTRICT p) { -#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 - const Full128 d128; - const __m128 v128 = LoadU(d128, p).raw; - return Vec256{ - _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)}; -#else - return Vec256{_mm256_broadcast_ps(reinterpret_cast(p))}; -#endif -} -template -HWY_API Vec256 LoadDup128(D /* tag */, const double* HWY_RESTRICT p) { -#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931 - const Full128 d128; - const __m128d v128 = LoadU(d128, p).raw; - return Vec256{ - _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)}; -#else - return Vec256{ - _mm256_broadcast_pd(reinterpret_cast(p))}; -#endif -} - -// ------------------------------ Store - -template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { - _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw); -} -template -HWY_API void Store(Vec256 v, D d, float16_t* HWY_RESTRICT aligned) { -#if HWY_HAVE_FLOAT16 - (void)d; - _mm256_store_ph(aligned, v.raw); -#else - const RebindToUnsigned du; - Store(BitCast(du, v), du, reinterpret_cast(aligned)); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API void Store(Vec256 v, D /* tag */, float* HWY_RESTRICT aligned) { - _mm256_store_ps(aligned, v.raw); -} -template -HWY_API void Store(Vec256 v, D /* tag */, - double* HWY_RESTRICT aligned) { - _mm256_store_pd(aligned, v.raw); -} - -template -HWY_API void StoreU(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { - _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw); -} -template -HWY_API void StoreU(Vec256 v, D d, float16_t* HWY_RESTRICT p) { -#if HWY_HAVE_FLOAT16 - (void)d; - _mm256_storeu_ph(p, v.raw); -#else - const RebindToUnsigned du; - StoreU(BitCast(du, v), du, reinterpret_cast(p)); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API void StoreU(Vec256 v, D /* tag */, float* HWY_RESTRICT p) { - _mm256_storeu_ps(p, v.raw); -} -template -HWY_API void StoreU(Vec256 v, D /* tag */, double* HWY_RESTRICT p) { - _mm256_storeu_pd(p, v.raw); -} - -// ------------------------------ BlendedStore - -#if HWY_TARGET <= HWY_AVX3 - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - _mm256_mask_storeu_epi8(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - _mm256_mask_storeu_epi16(reinterpret_cast(p), - RebindMask(du, m).raw, BitCast(du, v).raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - _mm256_mask_storeu_epi32(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - _mm256_mask_storeu_epi64(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(Vec256 v, Mask256 m, D /* tag */, - float* HWY_RESTRICT p) { - _mm256_mask_storeu_ps(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(Vec256 v, Mask256 m, D /* tag */, - double* HWY_RESTRICT p) { - _mm256_mask_storeu_pd(p, m.raw, v.raw); -} - -#else // AVX2 - -// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD -// allows AC# if "Alignment checking enabled and: 256-bit memory operand not -// 32-byte aligned". Fortunately AC# is not enabled by default and requires both -// OS support (CR0) and the application to set rflags.AC. 
We assume these remain -// disabled because x86/x64 code and compiler output often contain misaligned -// scalar accesses, which would also fault. -// -// Caveat: these are slow on AMD Jaguar/Bulldozer. - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - // There is no maskload_epi8/16. Blending is also unsafe because loading a - // full vector that crosses the array end causes asan faults. Resort to scalar - // code; the caller should instead use memcpy, assuming m is FirstN(d, n). - const RebindToUnsigned du; - using TU = TFromD; - alignas(32) TU buf[MaxLanes(d)]; - alignas(32) TU mask[MaxLanes(d)]; - Store(BitCast(du, v), du, buf); - Store(BitCast(du, VecFromMask(d, m)), du, mask); - for (size_t i = 0; i < MaxLanes(d); ++i) { - if (mask[i]) { - CopySameSize(buf + i, p + i); - } - } -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - _mm256_maskstore_epi32(pi, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - auto pi = reinterpret_cast(p); // NOLINT - _mm256_maskstore_epi64(pi, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(Vec256 v, Mask256 m, D d, - float* HWY_RESTRICT p) { - const Vec256 mi = - BitCast(RebindToSigned(), VecFromMask(d, m)); - _mm256_maskstore_ps(p, mi.raw, v.raw); -} - -template -HWY_API void BlendedStore(Vec256 v, Mask256 m, D d, - double* HWY_RESTRICT p) { - const Vec256 mi = - BitCast(RebindToSigned(), VecFromMask(d, m)); - _mm256_maskstore_pd(p, mi.raw, v.raw); -} - -#endif - -// ------------------------------ Non-temporal stores - -template -HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - const RebindToUnsigned du; // for float16_t - _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), BitCast(du, v).raw); -} -template -HWY_API void Stream(Vec256 v, D /* tag */, float* HWY_RESTRICT aligned) { - _mm256_stream_ps(aligned, v.raw); -} -template -HWY_API void Stream(Vec256 v, D /* tag */, - double* HWY_RESTRICT aligned) { - _mm256_stream_pd(aligned, v.raw); -} - -// ------------------------------ ScatterOffset - -// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
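// Editor's sketch (added for clarity; not part of the original header): in the
// Scatter/Gather ops of this section, an "offset" is a byte offset (intrinsic
// scale factor 1) while an "index" is a lane index (scale factor sizeof(T),
// i.e. 4 or 8). A scalar model of the two addressing modes; the names below
// are the editor's own.
#include <cstdint>
#include <cstddef>
#include <cstring>

static inline void ScatterOffsetScalar(const int32_t v[8], int32_t* base,
                                       const int32_t byte_offsets[8]) {
  for (size_t i = 0; i < 8; ++i) {
    // Offset is added to the base address in bytes.
    std::memcpy(reinterpret_cast<uint8_t*>(base) + byte_offsets[i], &v[i],
                sizeof(int32_t));
  }
}

static inline void ScatterIndexScalar(const int32_t v[8], int32_t* base,
                                      const int32_t indices[8]) {
  for (size_t i = 0; i < 8; ++i) {
    base[indices[i]] = v[i];  // Index is scaled by sizeof(int32_t).
  }
}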
-HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") - -#if HWY_TARGET <= HWY_AVX3 - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - Vec256 offset) { - _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1); -} - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - Vec256 offset) { - _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1); -} - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, float* HWY_RESTRICT base, - const Vec256 offset) { - _mm256_i32scatter_ps(base, offset.raw, v.raw, 1); -} - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, double* HWY_RESTRICT base, - const Vec256 offset) { - _mm256_i64scatter_pd(base, offset.raw, v.raw, 1); -} - -// ------------------------------ ScatterIndex - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm256_i32scatter_epi32(base, index.raw, v.raw, 4); -} - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm256_i64scatter_epi64(base, index.raw, v.raw, 8); -} - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, float* HWY_RESTRICT base, - VFromD> index) { - _mm256_i32scatter_ps(base, index.raw, v.raw, 4); -} - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, double* HWY_RESTRICT base, - VFromD> index) { - _mm256_i64scatter_pd(base, index.raw, v.raw, 8); -} - -// ------------------------------ MaskedScatterIndex - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm256_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, 4); -} - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm256_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, 8); -} - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - float* HWY_RESTRICT base, - VFromD> index) { - _mm256_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, 4); -} - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - double* HWY_RESTRICT base, - VFromD> index) { - _mm256_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, 8); -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Gather - -template -HWY_INLINE VFromD GatherOffset(D /* tag */, - const TFromD* HWY_RESTRICT base, - Vec256 offset) { - return VFromD{_mm256_i32gather_epi32( - reinterpret_cast(base), offset.raw, 1)}; -} -template -HWY_INLINE VFromD GatherIndex(D /* tag */, - const TFromD* HWY_RESTRICT base, - Vec256 index) { - return VFromD{_mm256_i32gather_epi32( - reinterpret_cast(base), index.raw, 4)}; -} - -template -HWY_INLINE VFromD GatherOffset(D /* tag */, - const TFromD* HWY_RESTRICT base, - Vec256 offset) { - return VFromD{_mm256_i64gather_epi64( - reinterpret_cast(base), offset.raw, 1)}; -} -template -HWY_INLINE VFromD GatherIndex(D /* tag */, - const TFromD* HWY_RESTRICT base, - Vec256 index) { - return VFromD{_mm256_i64gather_epi64( - reinterpret_cast(base), index.raw, 8)}; -} - -template -HWY_API Vec256 GatherOffset(D /* tag */, const float* HWY_RESTRICT base, - Vec256 offset) { - return Vec256{_mm256_i32gather_ps(base, offset.raw, 1)}; -} -template -HWY_API Vec256 GatherIndex(D /* tag */, const float* HWY_RESTRICT base, - Vec256 index) { - return Vec256{_mm256_i32gather_ps(base, index.raw, 4)}; -} -template -HWY_API Vec256 
GatherOffset(D /* tag */, - const double* HWY_RESTRICT base, - Vec256 offset) { - return Vec256{_mm256_i64gather_pd(base, offset.raw, 1)}; -} -template -HWY_API Vec256 GatherIndex(D /* tag */, const double* HWY_RESTRICT base, - Vec256 index) { - return Vec256{_mm256_i64gather_pd(base, index.raw, 8)}; -} - -// ------------------------------ MaskedGatherIndex - -template -HWY_INLINE VFromD MaskedGatherIndex(MFromD m, D d, - const TFromD* HWY_RESTRICT base, - Vec256 index) { -#if HWY_TARGET <= HWY_AVX3 - return VFromD{ - _mm256_mmask_i32gather_epi32(Zero(d).raw, m.raw, index.raw, - reinterpret_cast(base), 4)}; -#else - return VFromD{_mm256_mask_i32gather_epi32( - Zero(d).raw, reinterpret_cast(base), index.raw, m.raw, - 4)}; -#endif -} - -template -HWY_INLINE VFromD MaskedGatherIndex(MFromD m, D d, - const TFromD* HWY_RESTRICT base, - Vec256 index) { -#if HWY_TARGET <= HWY_AVX3 - return VFromD{_mm256_mmask_i64gather_epi64( - Zero(d).raw, m.raw, index.raw, - reinterpret_cast(base), 8)}; -#else - // For reasons unknown, _mm256_mask_i64gather_epi64 returns all-zeros. - const RebindToFloat df; - return BitCast(d, Vec256{_mm256_mask_i64gather_pd( - Zero(df).raw, reinterpret_cast(base), - index.raw, RebindMask(df, m).raw, 8)}); -#endif -} - -template -HWY_API Vec256 MaskedGatherIndex(MFromD m, D d, - const float* HWY_RESTRICT base, - Vec256 index) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{ - _mm256_mmask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, 4)}; -#else - return Vec256{ - _mm256_mask_i32gather_ps(Zero(d).raw, base, index.raw, m.raw, 4)}; -#endif -} - -template -HWY_API Vec256 MaskedGatherIndex(MFromD m, D d, - const double* HWY_RESTRICT base, - Vec256 index) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{ - _mm256_mmask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, 8)}; -#else - return Vec256{ - _mm256_mask_i64gather_pd(Zero(d).raw, base, index.raw, m.raw, 8)}; -#endif -} - -HWY_DIAGNOSTICS(pop) - -// ================================================== SWIZZLE - -// ------------------------------ LowerHalf - -template -HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { - return VFromD{_mm256_castsi256_si128(v.raw)}; -} -template -HWY_API Vec128 LowerHalf(D /* tag */, Vec256 v) { - return Vec128{_mm256_castsi256_si128(v.raw)}; -} -template -HWY_API Vec128 LowerHalf(D /* tag */, Vec256 v) { -#if HWY_HAVE_FLOAT16 - return Vec128{_mm256_castph256_ph128(v.raw)}; -#else - return Vec128{_mm256_castsi256_si128(v.raw)}; -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec128 LowerHalf(D /* tag */, Vec256 v) { - return Vec128{_mm256_castps256_ps128(v.raw)}; -} -template -HWY_API Vec128 LowerHalf(D /* tag */, Vec256 v) { - return Vec128{_mm256_castpd256_pd128(v.raw)}; -} - -template -HWY_API Vec128 LowerHalf(Vec256 v) { - const Full128 dh; - return LowerHalf(dh, v); -} - -// ------------------------------ UpperHalf - -template -HWY_API VFromD UpperHalf(D d, VFromD> v) { - const RebindToUnsigned du; // for float16_t - const Twice dut; - return BitCast(d, VFromD{ - _mm256_extracti128_si256(BitCast(dut, v).raw, 1)}); -} -template -HWY_API VFromD UpperHalf(D /* tag */, Vec256 v) { - return VFromD{_mm256_extractf128_ps(v.raw, 1)}; -} -template -HWY_API VFromD UpperHalf(D /* tag */, Vec256 v) { - return VFromD{_mm256_extractf128_pd(v.raw, 1)}; -} - -// ------------------------------ ExtractLane (Store) -template -HWY_API T ExtractLane(const Vec256 v, size_t i) { - const DFromV d; - HWY_DASSERT(i < Lanes(d)); - -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - constexpr size_t 
kLanesPerBlock = 16 / sizeof(T); - if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) { - return ExtractLane(LowerHalf(Half(), v), i); - } -#endif - - alignas(32) T lanes[32 / sizeof(T)]; - Store(v, d, lanes); - return lanes[i]; -} - -// ------------------------------ InsertLane (Store) -template -HWY_API Vec256 InsertLane(const Vec256 v, size_t i, T t) { - return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); -} - -// ------------------------------ GetLane (LowerHalf) -template -HWY_API T GetLane(const Vec256 v) { - return GetLane(LowerHalf(v)); -} - -// ------------------------------ ExtractBlock (LowerHalf, UpperHalf) - -template -HWY_API Vec128 ExtractBlock(Vec256 v) { - static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index"); - const Half> dh; - return (kBlockIdx == 0) ? LowerHalf(dh, v) : UpperHalf(dh, v); -} - -// ------------------------------ ZeroExtendVector - -// Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper -// bits undefined. Although it makes sense for them to be zero (VEX encoded -// 128-bit instructions zero the upper lanes to avoid large penalties), a -// compiler could decide to optimize out code that relies on this. -// -// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the -// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For -// older GCC, we can still obtain the desired code thanks to pattern -// recognition; note that the expensive insert instruction is not actually -// generated, see https://gcc.godbolt.org/z/1MKGaP. - -#if !defined(HWY_HAVE_ZEXT) -#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \ - (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \ - (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000) -#define HWY_HAVE_ZEXT 1 -#else -#define HWY_HAVE_ZEXT 0 -#endif -#endif // defined(HWY_HAVE_ZEXT) - -template -HWY_API VFromD ZeroExtendVector(D /* tag */, VFromD> lo) { -#if HWY_HAVE_ZEXT - return VFromD{_mm256_zextsi128_si256(lo.raw)}; -#else - return VFromD{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)}; -#endif -} -template -HWY_API Vec256 ZeroExtendVector(D d, Vec128 lo) { - (void)d; -#if HWY_HAVE_ZEXT - return VFromD{_mm256_zextsi128_si256(lo.raw)}; -#else - return VFromD{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)}; -#endif // HWY_HAVE_ZEXT -} -template -HWY_API Vec256 ZeroExtendVector(D d, Vec128 lo) { -#if HWY_HAVE_FLOAT16 -#if HWY_HAVE_ZEXT - (void)d; - return Vec256{_mm256_zextph128_ph256(lo.raw)}; -#else - const RebindToUnsigned du; - return BitCast(d, ZeroExtendVector(du, BitCast(du, lo))); -#endif // HWY_HAVE_ZEXT -#else - (void)d; -#if HWY_HAVE_ZEXT - return VFromD{_mm256_zextsi128_si256(lo.raw)}; -#else - return VFromD{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)}; -#endif // HWY_HAVE_ZEXT -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec256 ZeroExtendVector(D /* tag */, Vec128 lo) { -#if HWY_HAVE_ZEXT - return Vec256{_mm256_zextps128_ps256(lo.raw)}; -#else - return Vec256{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)}; -#endif -} -template -HWY_API Vec256 ZeroExtendVector(D /* tag */, Vec128 lo) { -#if HWY_HAVE_ZEXT - return Vec256{_mm256_zextpd128_pd256(lo.raw)}; -#else - return Vec256{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)}; -#endif -} - -// ------------------------------ ZeroExtendResizeBitCast - -namespace detail { - -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<32> /* to_size_tag */, - 
DTo d_to, DFrom d_from, VFromD v) { - const Twice dt_from; - const Twice dq_from; - return BitCast(d_to, ZeroExtendVector(dq_from, ZeroExtendVector(dt_from, v))); -} - -} // namespace detail - -// ------------------------------ Combine - -template -HWY_API VFromD Combine(D d, VFromD> hi, VFromD> lo) { - const auto lo256 = ZeroExtendVector(d, lo); - return VFromD{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)}; -} -template -HWY_API Vec256 Combine(D d, Vec128 hi, Vec128 lo) { - const auto lo256 = ZeroExtendVector(d, lo); - return Vec256{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)}; -} -template -HWY_API Vec256 Combine(D d, Vec128 hi, Vec128 lo) { - const auto lo256 = ZeroExtendVector(d, lo); - return Vec256{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)}; -} - -// ------------------------------ ShiftLeftBytes -template -HWY_API VFromD ShiftLeftBytes(D /* tag */, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - // This is the same operation as _mm256_bslli_epi128. - return VFromD{_mm256_slli_si256(v.raw, kBytes)}; -} - -// ------------------------------ ShiftRightBytes -template -HWY_API VFromD ShiftRightBytes(D /* tag */, VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - // This is the same operation as _mm256_bsrli_epi128. - return VFromD{_mm256_srli_si256(v.raw, kBytes)}; -} - -// ------------------------------ CombineShiftRightBytes -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - const Repartition d8; - return BitCast(d, Vec256{_mm256_alignr_epi8( - BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); -} - -// ------------------------------ Broadcast - -template -HWY_API Vec256 Broadcast(const Vec256 v) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const VU vu = BitCast(du, v); // for float16_t - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - if (kLane < 4) { - const __m256i lo = _mm256_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF); - return BitCast(d, VU{_mm256_unpacklo_epi64(lo, lo)}); - } else { - const __m256i hi = - _mm256_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF); - return BitCast(d, VU{_mm256_unpackhi_epi64(hi, hi)}); - } -} -template -HWY_API Vec256 Broadcast(const Vec256 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec256{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)}; -} - -template -HWY_API Vec256 Broadcast(const Vec256 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - return Vec256{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; -} - -template -HWY_API Vec256 Broadcast(Vec256 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; -} - -template -HWY_API Vec256 Broadcast(const Vec256 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - return Vec256{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)}; -} - -// ------------------------------ BroadcastBlock - -template -HWY_API Vec256 BroadcastBlock(Vec256 v) { - static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index"); - const DFromV d; - return (kBlockIdx == 0) ? 
ConcatLowerLower(d, v, v) - : ConcatUpperUpper(d, v, v); -} - -// ------------------------------ BroadcastLane - -namespace detail { - -template -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec256 v) { - const Half> dh; - return Vec256{_mm256_broadcastb_epi8(LowerHalf(dh, v).raw)}; -} - -template -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec256 v) { - const Half> dh; - return Vec256{_mm256_broadcastw_epi16(LowerHalf(dh, v).raw)}; -} - -template -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec256 v) { - const Half> dh; - return Vec256{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)}; -} - -template -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec256 v) { - const Half> dh; - return Vec256{_mm256_broadcastq_epi64(LowerHalf(dh, v).raw)}; -} - -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec256 v) { - const Half> dh; - return Vec256{_mm256_broadcastss_ps(LowerHalf(dh, v).raw)}; -} - -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec256 v) { - const Half> dh; - return Vec256{_mm256_broadcastsd_pd(LowerHalf(dh, v).raw)}; -} - -template * = nullptr, - HWY_IF_NOT_T_SIZE(T, 8)> -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag /* lane_idx_tag */, - Vec256 v) { - constexpr size_t kLanesPerBlock = 16 / sizeof(T); - constexpr int kBlockIdx = static_cast(kLaneIdx / kLanesPerBlock); - constexpr int kLaneInBlkIdx = - static_cast(kLaneIdx) & (kLanesPerBlock - 1); - return Broadcast(BroadcastBlock(v)); -} - -template * = nullptr, - HWY_IF_UI64(T)> -HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag /* lane_idx_tag */, - Vec256 v) { - static_assert(kLaneIdx <= 3, "Invalid lane"); - return Vec256{ - _mm256_permute4x64_epi64(v.raw, static_cast(0x55 * kLaneIdx))}; -} - -template * = nullptr> -HWY_INLINE Vec256 BroadcastLane( - hwy::SizeTag /* lane_idx_tag */, Vec256 v) { - static_assert(kLaneIdx <= 3, "Invalid lane"); - return Vec256{ - _mm256_permute4x64_pd(v.raw, static_cast(0x55 * kLaneIdx))}; -} - -} // namespace detail - -template -HWY_API Vec256 BroadcastLane(Vec256 v) { - static_assert(kLaneIdx >= 0, "Invalid lane"); - return detail::BroadcastLane(hwy::SizeTag(kLaneIdx)>(), - v); -} - -// ------------------------------ Hard-coded shuffles - -// Notation: let Vec256 have lanes 7,6,5,4,3,2,1,0 (0 is -// least-significant). Shuffle0321 rotates four-lane blocks one lane to the -// right (the previous least-significant lane is now most-significant => -// 47650321). These could also be implemented via CombineShiftRightBytes but -// the shuffle_abcd notation is more convenient. - -// Swap 32-bit halves in 64-bit halves. 
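// Editor's note (added for clarity; not part of the original header): the
// digits in the shuffle_abcd notation name the source lane for result lanes
// 3..0 within each 128-bit block, and also spell out the shuffle immediate.
// A constexpr sketch; the helper name is the editor's own.
#include <cstdint>

constexpr uint8_t ShuffleImm(int d3, int d2, int d1, int d0) {
  return static_cast<uint8_t>((d3 << 6) | (d2 << 4) | (d1 << 2) | d0);
}
// Shuffle2301 writes source lanes {2,3,0,1} to result lanes {3,2,1,0}:
static_assert(ShuffleImm(2, 3, 0, 1) == 0xB1, "matches _mm256_shuffle_epi32 below");
static_assert(ShuffleImm(1, 0, 3, 2) == 0x4E, "Shuffle1032 (swap 64-bit halves)");
static_assert(ShuffleImm(0, 3, 2, 1) == 0x39, "Shuffle0321 (rotate right 32 bits)");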
-template -HWY_API Vec256 Shuffle2301(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0xB1)}; -} -HWY_API Vec256 Shuffle2301(const Vec256 v) { - return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)}; -} - -// Used by generic_ops-inl.h -namespace detail { - -template -HWY_API Vec256 ShuffleTwo2301(const Vec256 a, const Vec256 b) { - const DFromV d; - const RebindToFloat df; - constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); - return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw, - BitCast(df, b).raw, m)}); -} -template -HWY_API Vec256 ShuffleTwo1230(const Vec256 a, const Vec256 b) { - const DFromV d; - const RebindToFloat df; - constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); - return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw, - BitCast(df, b).raw, m)}); -} -template -HWY_API Vec256 ShuffleTwo3012(const Vec256 a, const Vec256 b) { - const DFromV d; - const RebindToFloat df; - constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); - return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw, - BitCast(df, b).raw, m)}); -} - -} // namespace detail - -// Swap 64-bit halves -HWY_API Vec256 Shuffle1032(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec256 Shuffle1032(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec256 Shuffle1032(const Vec256 v) { - // Shorter encoding than _mm256_permute_ps. - return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)}; -} -HWY_API Vec256 Shuffle01(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec256 Shuffle01(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x4E)}; -} -HWY_API Vec256 Shuffle01(const Vec256 v) { - // Shorter encoding than _mm256_permute_pd. - return Vec256{_mm256_shuffle_pd(v.raw, v.raw, 5)}; -} - -// Rotate right 32 bits -HWY_API Vec256 Shuffle0321(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x39)}; -} -HWY_API Vec256 Shuffle0321(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x39)}; -} -HWY_API Vec256 Shuffle0321(const Vec256 v) { - return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x39)}; -} -// Rotate left 32 bits -HWY_API Vec256 Shuffle2103(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x93)}; -} -HWY_API Vec256 Shuffle2103(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x93)}; -} -HWY_API Vec256 Shuffle2103(const Vec256 v) { - return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x93)}; -} - -// Reverse -HWY_API Vec256 Shuffle0123(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x1B)}; -} -HWY_API Vec256 Shuffle0123(const Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, 0x1B)}; -} -HWY_API Vec256 Shuffle0123(const Vec256 v) { - return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)}; -} - -// ------------------------------ TableLookupLanes - -// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
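// Illustrative usage sketch (not from the original Highway source), assuming a
// fixed AVX2 build where Full256 is usable directly: indices are built once
// via SetTableIndices and reused; lane i of the result receives lane idx[i] of
// the input.
//
//   const Full256<uint32_t> d;
//   alignas(32) static constexpr int32_t kRotate[8] = {1, 2, 3, 4, 5, 6, 7, 0};
//   const auto v = Iota(d, 0);  // lanes 0,1,...,7
//   const auto r = TableLookupLanes(v, SetTableIndices(d, kRotate));
//   // r now holds 1,2,...,7,0: each lane's value moved one position down.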
-template -struct Indices256 { - __m256i raw; -}; - -// 8-bit lanes: indices remain unchanged -template -HWY_API Indices256> IndicesFromVec(D /* tag */, Vec256 vec) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const Full256 di; - HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && - AllTrue(di, Lt(vec, Set(di, static_cast(2 * Lanes(di)))))); -#endif - return Indices256>{vec.raw}; -} - -// 16-bit lanes: convert indices to 32x8 unless AVX3 is available -template -HWY_API Indices256> IndicesFromVec(D /* tag */, Vec256 vec) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); - const Full256 di; -#if HWY_IS_DEBUG_BUILD - HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && - AllTrue(di, Lt(vec, Set(di, static_cast(2 * Lanes(di)))))); -#endif - -#if HWY_TARGET <= HWY_AVX3 - (void)di; - return Indices256>{vec.raw}; -#else - const Repartition d8; - using V8 = VFromD; - alignas(32) static constexpr uint8_t kByteOffsets[32] = { - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; - - // Broadcast each lane index to all 2 bytes of T - alignas(32) static constexpr uint8_t kBroadcastLaneBytes[32] = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); - - // Shift to bytes - const Repartition d16; - const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); - - return Indices256>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; -#endif // HWY_TARGET <= HWY_AVX3 -} - -// Native 8x32 instruction: indices remain unchanged -template -HWY_API Indices256> IndicesFromVec(D /* tag */, Vec256 vec) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const Full256 di; - HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && - AllTrue(di, Lt(vec, Set(di, static_cast(2 * Lanes(di)))))); -#endif - return Indices256>{vec.raw}; -} - -// 64-bit lanes: convert indices to 8x32 unless AVX3 is available -template -HWY_API Indices256> IndicesFromVec(D d, Vec256 idx64) { - static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); - const Rebind di; - (void)di; // potentially unused -#if HWY_IS_DEBUG_BUILD - HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) && - AllTrue(di, Lt(idx64, Set(di, static_cast(2 * Lanes(di)))))); -#endif - -#if HWY_TARGET <= HWY_AVX3 - (void)d; - return Indices256>{idx64.raw}; -#else - const Repartition df; // 32-bit! - // Replicate 64-bit index into upper 32 bits - const Vec256 dup = - BitCast(di, Vec256{_mm256_moveldup_ps(BitCast(df, idx64).raw)}); - // For each idx64 i, idx32 are 2*i and 2*i+1. 
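  // (Illustrative note, not from the original source: after _mm256_moveldup_ps,
  // both 32-bit halves of each 64-bit lane hold i. The indices are small, so
  // dup + dup doubles both halves without carry between them, and adding
  // TI(1) << 32 then turns the upper half into 2*i + 1.)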
- const Vec256 idx32 = dup + dup + Set(di, TI(1) << 32); - return Indices256>{idx32.raw}; -#endif -} - -template -HWY_API Indices256> SetTableIndices(D d, const TI* idx) { - const Rebind di; - return IndicesFromVec(d, LoadU(di, idx)); -} - -template -HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_permutexvar_epi8(idx.raw, v.raw)}; -#else - const Vec256 idx_vec{idx.raw}; - const DFromV d; - const Repartition du16; - const auto sel_hi_mask = - MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); - - const auto a = ConcatLowerLower(d, v, v); - const auto b = ConcatUpperUpper(d, v, v); - const auto lo_lookup_result = TableLookupBytes(a, idx_vec); - -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_mask_shuffle_epi8( - lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; -#else - const auto hi_lookup_result = TableLookupBytes(b, idx_vec); - return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); -#endif // HWY_TARGET <= HWY_AVX3 -#endif // HWY_TARGET <= HWY_AVX3_DL -} - -template -HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutexvar_epi16(idx.raw, v.raw)}; -#else - const DFromV d; - const Repartition du8; - return BitCast( - d, TableLookupLanes(BitCast(du8, v), Indices256{idx.raw})); -#endif -} - -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 TableLookupLanes(Vec256 v, - Indices256 idx) { - return Vec256{_mm256_permutexvar_ph(idx.raw, v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { - return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; -} - -template -HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutexvar_epi64(idx.raw, v.raw)}; -#else - return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; -#endif -} - -HWY_API Vec256 TableLookupLanes(const Vec256 v, - const Indices256 idx) { - return Vec256{_mm256_permutevar8x32_ps(v.raw, idx.raw)}; -} - -HWY_API Vec256 TableLookupLanes(const Vec256 v, - const Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutexvar_pd(idx.raw, v.raw)}; -#else - const Full256 df; - const Full256 du; - return BitCast(df, Vec256{_mm256_permutevar8x32_epi32( - BitCast(du, v).raw, idx.raw)}); -#endif -} - -template -HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, - Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_permutex2var_epi8(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const auto sel_hi_mask = - MaskFromVec(BitCast(d, ShiftLeft<2>(Vec256{idx.raw}))); - const auto lo_lookup_result = TableLookupLanes(a, idx); - const auto hi_lookup_result = TableLookupLanes(b, idx); - return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); -#endif -} - -template -HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, - Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutex2var_epi16(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const Repartition du8; - return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), - Indices256{idx.raw})); -#endif -} - -template -HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, - Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutex2var_epi32(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const RebindToFloat df; - const Vec256 idx_vec{idx.raw}; - - const auto sel_hi_mask = MaskFromVec(BitCast(df, 
ShiftLeft<28>(idx_vec))); - const auto lo_lookup_result = BitCast(df, TableLookupLanes(a, idx)); - const auto hi_lookup_result = BitCast(df, TableLookupLanes(b, idx)); - return BitCast(d, - IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); -#endif -} - -#if HWY_HAVE_FLOAT16 -HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, - Vec256 b, - Indices256 idx) { - return Vec256{_mm256_permutex2var_ph(a.raw, idx.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, - Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutex2var_ps(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const auto sel_hi_mask = - MaskFromVec(BitCast(d, ShiftLeft<28>(Vec256{idx.raw}))); - const auto lo_lookup_result = TableLookupLanes(a, idx); - const auto hi_lookup_result = TableLookupLanes(b, idx); - return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); -#endif -} - -template -HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, - Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutex2var_epi64(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const Repartition du32; - return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b), - Indices256{idx.raw})); -#endif -} - -HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, - Indices256 idx) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_permutex2var_pd(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const Repartition du32; - return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b), - Indices256{idx.raw})); -#endif -} - -// ------------------------------ SwapAdjacentBlocks - -template -HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { - return Vec256{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(1, 0, 3, 2))}; -} - -HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { - return Vec256{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))}; -} - -HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { - // Assume no domain-crossing penalty between float/double (true on SKX). 
- const DFromV d; - const RepartitionToWide dw; - return BitCast(d, SwapAdjacentBlocks(BitCast(dw, v))); -} - -// ------------------------------ Reverse (RotateRight) - -template -HWY_API VFromD Reverse(D d, const VFromD v) { - alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0}; - return TableLookupLanes(v, SetTableIndices(d, kReverse)); -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { - alignas(32) static constexpr int64_t kReverse[4] = {3, 2, 1, 0}; - return TableLookupLanes(v, SetTableIndices(d, kReverse)); -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { -#if HWY_TARGET <= HWY_AVX3 - const RebindToSigned di; - alignas(32) static constexpr int16_t kReverse[16] = { - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - const Vec256 idx = Load(di, kReverse); - return BitCast(d, Vec256{ - _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); -#else - const RebindToSigned di; - alignas(16) static constexpr int16_t kShuffle[8] = { - 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; - const auto rev128 = TableLookupBytes(v, LoadDup128(di, kShuffle)); - return VFromD{ - _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))}; -#endif -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { -#if HWY_TARGET <= HWY_AVX3_DL - alignas(32) static constexpr TFromD kReverse[32] = { - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - return TableLookupLanes(v, SetTableIndices(d, kReverse)); -#else - // First reverse bytes within blocks via PSHUFB, then swap blocks. - alignas(32) static constexpr TFromD kReverse[32] = { - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - return SwapAdjacentBlocks(TableLookupBytes(v, Load(d, kReverse))); -#endif -} - -// ------------------------------ Reverse2 (in x86_128) - -// ------------------------------ Reverse4 (SwapAdjacentBlocks) - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - const RebindToSigned di; - alignas(16) static constexpr int16_t kShuffle[8] = { - 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908}; - return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); -} - -// 32 bit Reverse4 defined in x86_128. - -template -HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { - // Could also use _mm256_permute4x64_epi64. - return SwapAdjacentBlocks(Shuffle01(v)); -} - -// ------------------------------ Reverse8 - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - const RebindToSigned di; - alignas(16) static constexpr int16_t kShuffle[8] = { - 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100}; - return BitCast(d, TableLookupBytes(v, LoadDup128(di, kShuffle))); -} - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - return Reverse(d, v); -} - -template -HWY_API VFromD Reverse8(D /* tag */, const VFromD /* v */) { - HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes -} - -// ------------------------------ ReverseBits - -#if HWY_TARGET <= HWY_AVX3_DL -template , 32)> -HWY_API V ReverseBits(V v) { - const Full256 du64; - const auto affine_matrix = Set(du64, 0x8040201008040201u); - return V{_mm256_gf2p8affine_epi64_epi8(v.raw, affine_matrix.raw, 0)}; -} -#endif // HWY_TARGET <= HWY_AVX3_DL - -// ------------------------------ InterleaveLower - -// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides -// the least-significant lane) and "b". 
To concatenate two half-width integers -// into one, use ZipLower/Upper instead (also works with scalar). - -template -HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { - return Vec256{_mm256_unpacklo_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; // for float16_t - return BitCast( - d, VU{_mm256_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); -} -template -HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { - return Vec256{_mm256_unpacklo_epi32(a.raw, b.raw)}; -} -template -HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { - return Vec256{_mm256_unpacklo_epi64(a.raw, b.raw)}; -} - -HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { - return Vec256{_mm256_unpacklo_ps(a.raw, b.raw)}; -} -HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { - return Vec256{_mm256_unpacklo_pd(a.raw, b.raw)}; -} - -// ------------------------------ InterleaveUpper - -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm256_unpackhi_epi8(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { - const RebindToUnsigned du; - using VU = VFromD; // for float16_t - return BitCast( - d, VU{_mm256_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm256_unpackhi_epi32(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm256_unpackhi_epi64(a.raw, b.raw)}; -} - -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm256_unpackhi_ps(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm256_unpackhi_pd(a.raw, b.raw)}; -} - -// ------------------------------ Blocks (LowerHalf, ZeroExtendVector) - -// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL. -// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no -// extra cost) for LowerLower and UpperLower. 
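// Illustrative sketch (not from the original Highway source) for the
// InterleaveLower/InterleaveUpper ops above: unlike a full-width interleave,
// they operate within each 128-bit block. For 32-bit lanes with a = 7..0 and
// b = 15..8, InterleaveLower yields 13,5,12,4 | 9,1,8,0 (upper block | lower
// block, most-significant lane first). A scalar model of one block:
inline void InterleaveLowerBlock32(const uint32_t a[4], const uint32_t b[4],
                                   uint32_t out[4]) {
  out[0] = a[0];
  out[1] = b[0];
  out[2] = a[1];
  out[3] = b[1];
}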
- -// hiH,hiL loH,loL |-> hiL,loL (= lower halves) -template -HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { - const Half d2; - return VFromD{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)}; -} -template -HWY_API Vec256 ConcatLowerLower(D d, Vec256 hi, - Vec256 lo) { - const Half d2; - return Vec256{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)}; -} -template -HWY_API Vec256 ConcatLowerLower(D d, Vec256 hi, - Vec256 lo) { - const Half d2; - return Vec256{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)}; -} - -// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) -template -HWY_API VFromD ConcatLowerUpper(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)}; -} -template -HWY_API Vec256 ConcatLowerUpper(D /* tag */, Vec256 hi, - Vec256 lo) { - return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)}; -} -template -HWY_API Vec256 ConcatLowerUpper(D /* tag */, Vec256 hi, - Vec256 lo) { - return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)}; -} - -// hiH,hiL loH,loL |-> hiH,loL (= outer halves) -template -HWY_API VFromD ConcatUpperLower(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)}; -} -template -HWY_API Vec256 ConcatUpperLower(D /* tag */, Vec256 hi, - Vec256 lo) { - return Vec256{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)}; -} -template -HWY_API Vec256 ConcatUpperLower(D /* tag */, Vec256 hi, - Vec256 lo) { - return Vec256{_mm256_blend_pd(hi.raw, lo.raw, 3)}; -} - -// hiH,hiL loH,loL |-> hiH,loH (= upper halves) -template -HWY_API VFromD ConcatUpperUpper(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)}; -} -template -HWY_API Vec256 ConcatUpperUpper(D /* tag */, Vec256 hi, - Vec256 lo) { - return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)}; -} -template -HWY_API Vec256 ConcatUpperUpper(D /* tag */, Vec256 hi, - Vec256 lo) { - return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)}; -} - -// ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower) -template -HWY_API Vec256 InsertBlock(Vec256 v, Vec128 blk_to_insert) { - static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index"); - - const DFromV d; - const auto vec_to_insert = ResizeBitCast(d, blk_to_insert); - return (kBlockIdx == 0) ? ConcatUpperLower(d, v, vec_to_insert) - : ConcatLowerLower(d, vec_to_insert, v); -} - -// ------------------------------ ConcatOdd - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3_DL - alignas(32) static constexpr uint8_t kIdx[32] = { - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi8( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RepartitionToWide dw; - // Unsigned 8-bit shift so we can pack. 
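  // (Illustrative note, not from the original source: ShiftRight<8> moves each
  // odd byte into the low byte of its 16-bit lane; _mm256_packus_epi16 then
  // packs per 128-bit block, leaving the four 64-bit quarters in the order
  // lo0,hi0,lo1,hi1, and the final _mm256_permute4x64_epi64 with
  // _MM_SHUFFLE(3, 1, 2, 0) restores lo0,lo1,hi0,hi1.)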
- const Vec256 uH = ShiftRight<8>(BitCast(dw, hi)); - const Vec256 uL = ShiftRight<8>(BitCast(dw, lo)); - const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw); - return VFromD{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(32) static constexpr uint16_t kIdx[16] = { - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi16( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RepartitionToWide dw; - // Unsigned 16-bit shift so we can pack. - const Vec256 uH = ShiftRight<16>(BitCast(dw, hi)); - const Vec256 uL = ShiftRight<16>(BitCast(dw, lo)); - const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw); - return VFromD{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi32( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RebindToFloat df; - const Vec256 v3131{_mm256_shuffle_ps( - BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))}; - return VFromD{_mm256_permute4x64_epi64(BitCast(du, v3131).raw, - _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; - return VFromD{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; -#else - const VFromD v3131{ - _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))}; - return BitCast(d, Vec256{_mm256_permute4x64_epi64( - BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))}); -#endif -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi64( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RebindToFloat df; - const Vec256 v31{ - _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)}; - return VFromD{ - _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -template -HWY_API Vec256 ConcatOdd(D d, Vec256 hi, Vec256 lo) { -#if HWY_TARGET <= HWY_AVX3 - const RebindToUnsigned du; - alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7}; - return Vec256{ - _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; -#else - (void)d; - const Vec256 v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)}; - return Vec256{ - _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -// ------------------------------ ConcatEven - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3_DL - alignas(64) static constexpr uint8_t kIdx[32] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi8( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RepartitionToWide dw; - // Isolate lower 8 bits per u16 so we can pack. 
- const Vec256 mask = Set(dw, 0x00FF); - const Vec256 uH = And(BitCast(dw, hi), mask); - const Vec256 uL = And(BitCast(dw, lo), mask); - const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw); - return VFromD{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(64) static constexpr uint16_t kIdx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi16( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RepartitionToWide dw; - // Isolate lower 16 bits per u32 so we can pack. - const Vec256 mask = Set(dw, 0x0000FFFF); - const Vec256 uH = And(BitCast(dw, hi), mask); - const Vec256 uL = And(BitCast(dw, lo), mask); - const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw); - return VFromD{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi32( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RebindToFloat df; - const Vec256 v2020{_mm256_shuffle_ps( - BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))}; - return VFromD{_mm256_permute4x64_epi64(BitCast(du, v2020).raw, - _MM_SHUFFLE(3, 1, 2, 0))}; - -#endif -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; - return VFromD{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; -#else - const VFromD v2020{ - _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; - return BitCast(d, Vec256{_mm256_permute4x64_epi64( - BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))}); - -#endif -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3 - alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6}; - return BitCast( - d, Vec256{_mm256_permutex2var_epi64( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RebindToFloat df; - const Vec256 v20{ - _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)}; - return VFromD{ - _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))}; - -#endif -} - -template -HWY_API Vec256 ConcatEven(D d, Vec256 hi, Vec256 lo) { -#if HWY_TARGET <= HWY_AVX3 - const RebindToUnsigned du; - alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6}; - return Vec256{ - _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; -#else - (void)d; - const Vec256 v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)}; - return Vec256{ - _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))}; -#endif -} - -// ------------------------------ DupEven (InterleaveLower) - -template -HWY_API Vec256 DupEven(Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; -} -HWY_API Vec256 DupEven(Vec256 v) { - return Vec256{ - _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; -} - -template -HWY_API Vec256 DupEven(const Vec256 v) { - const DFromV d; - return InterleaveLower(d, v, v); -} - -// ------------------------------ DupOdd (InterleaveUpper) - -template -HWY_API 
Vec256 DupOdd(Vec256 v) { - return Vec256{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; -} -HWY_API Vec256 DupOdd(Vec256 v) { - return Vec256{ - _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; -} - -template -HWY_API Vec256 DupOdd(const Vec256 v) { - const DFromV d; - return InterleaveUpper(d, v, v); -} - -// ------------------------------ OddEven - -template -HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) { - const DFromV d; - const Full256 d8; - alignas(32) static constexpr uint8_t mask[16] = { - 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; - return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a); -} - -template -HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) { - const DFromV d; - const RebindToUnsigned du; // for float16_t - return BitCast(d, VFromD{_mm256_blend_epi16( - BitCast(du, a).raw, BitCast(du, b).raw, 0x55)}); -} - -#if HWY_HAVE_FLOAT16 -HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) { - return Vec256{_mm256_mask_blend_ph(a.raw, b.raw, 0x55)}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) { - return Vec256{_mm256_blend_epi32(a.raw, b.raw, 0x55)}; -} - -template -HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) { - return Vec256{_mm256_blend_epi32(a.raw, b.raw, 0x33)}; -} - -HWY_API Vec256 OddEven(Vec256 a, Vec256 b) { - return Vec256{_mm256_blend_ps(a.raw, b.raw, 0x55)}; -} - -HWY_API Vec256 OddEven(Vec256 a, Vec256 b) { - return Vec256{_mm256_blend_pd(a.raw, b.raw, 5)}; -} - -// ------------------------------ OddEvenBlocks - -template -Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) { - return Vec256{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)}; -} - -HWY_API Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) { - return Vec256{_mm256_blend_ps(odd.raw, even.raw, 0xFu)}; -} - -HWY_API Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) { - return Vec256{_mm256_blend_pd(odd.raw, even.raw, 0x3u)}; -} - -// ------------------------------ ReverseBlocks (SwapAdjacentBlocks) - -template -HWY_API VFromD ReverseBlocks(D /*d*/, VFromD v) { - return SwapAdjacentBlocks(v); -} - -// ------------------------------ TableLookupBytes (ZeroExtendVector) - -// Both full -template -HWY_API Vec256 TableLookupBytes(Vec256 bytes, Vec256 from) { - return Vec256{_mm256_shuffle_epi8(bytes.raw, from.raw)}; -} - -// Partial index vector -template -HWY_API Vec128 TableLookupBytes(Vec256 bytes, Vec128 from) { - const Full256 di; - const Half dih; - // First expand to full 128, then 256. - const auto from_256 = ZeroExtendVector(di, Vec128{from.raw}); - const auto tbl_full = TableLookupBytes(bytes, from_256); - // Shrink to 128, then partial. - return Vec128{LowerHalf(dih, tbl_full).raw}; -} - -// Partial table vector -template -HWY_API Vec256 TableLookupBytes(Vec128 bytes, Vec256 from) { - const Full256 d; - // First expand to full 128, then 256. - const auto bytes_256 = ZeroExtendVector(d, Vec128{bytes.raw}); - return TableLookupBytes(bytes_256, from); -} - -// Partial both are handled by x86_128. 
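// Illustrative sketch (not from the original Highway source): on x86,
// TableLookupBytes lowers to _mm256_shuffle_epi8, which shuffles independently
// within each 128-bit block and zeroes any byte whose index has the high bit
// set. A scalar model of one block:
inline void TableLookupBytesBlock(const uint8_t bytes[16],
                                  const uint8_t from[16], uint8_t out[16]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (from[i] & 0x80) ? uint8_t{0} : bytes[from[i] & 15];
  }
}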
- -// ------------------------------ I8/U8 Broadcast (TableLookupBytes) - -template -HWY_API Vec256 Broadcast(const Vec256 v) { - static_assert(0 <= kLane && kLane < 16, "Invalid lane"); - return TableLookupBytes(v, Set(Full256(), static_cast(kLane))); -} - -// ------------------------------ Per4LaneBlockShuffle - -namespace detail { - -template -HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, - const uint32_t x2, - const uint32_t x1, - const uint32_t x0) { - return BitCast(d, Vec256{_mm256_set_epi32( - static_cast(x3), static_cast(x2), - static_cast(x1), static_cast(x0), - static_cast(x3), static_cast(x2), - static_cast(x1), static_cast(x0))}); -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<4> /*lane_size_tag*/, - hwy::SizeTag<32> /*vect_size_tag*/, V v) { - return V{_mm256_shuffle_epi32(v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<4> /*lane_size_tag*/, - hwy::SizeTag<32> /*vect_size_tag*/, V v) { - return V{_mm256_shuffle_ps(v.raw, v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, - hwy::SizeTag<8> /*lane_size_tag*/, - hwy::SizeTag<32> /*vect_size_tag*/, V v) { - const DFromV d; - return ConcatLowerLower(d, v, v); -} - -template -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, - hwy::SizeTag<8> /*lane_size_tag*/, - hwy::SizeTag<32> /*vect_size_tag*/, V v) { - const DFromV d; - return ConcatUpperUpper(d, v, v); -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<8> /*lane_size_tag*/, - hwy::SizeTag<32> /*vect_size_tag*/, V v) { - return V{_mm256_permute4x64_epi64(v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<8> /*lane_size_tag*/, - hwy::SizeTag<32> /*vect_size_tag*/, V v) { - return V{_mm256_permute4x64_pd(v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -} // namespace detail - -// ------------------------------ SlideUpLanes - -namespace detail { - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_INLINE V CombineShiftRightI32Lanes(V hi, V lo) { - const DFromV d; - const Repartition du32; - return BitCast(d, - Vec256{_mm256_alignr_epi32( - BitCast(du32, hi).raw, BitCast(du32, lo).raw, kI32Lanes)}); -} - -template -HWY_INLINE V CombineShiftRightI64Lanes(V hi, V lo) { - const DFromV d; - const Repartition du64; - return BitCast(d, - Vec256{_mm256_alignr_epi64( - BitCast(du64, hi).raw, BitCast(du64, lo).raw, kI64Lanes)}); -} - -template -HWY_INLINE V SlideUpI64Lanes(V v) { - static_assert(0 <= kI64Lanes && kI64Lanes <= 3, - "kI64Lanes must be between 0 and 3"); - const DFromV d; - return CombineShiftRightI64Lanes<4 - kI64Lanes>(v, Zero(d)); -} -#else // AVX2 -template )> -HWY_INLINE V SlideUpI64Lanes(V v) { - static_assert(0 <= kI64Lanes && kI64Lanes <= 3, - "kI64Lanes must be between 0 and 3"); - constexpr int kIdx0 = (-kI64Lanes) & 3; - constexpr int kIdx1 = (-kI64Lanes + 1) & 3; - constexpr int kIdx2 = (-kI64Lanes + 2) & 3; - constexpr int kIdx3 = (-kI64Lanes + 3) & 3; - constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0); - constexpr int kBlendMask = (1 << (kI64Lanes * 2)) - 1; - - const DFromV d; - return V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210), - Zero(d).raw, kBlendMask)}; -} - -template )> -HWY_INLINE V SlideUpI64Lanes(V v) { - static_assert(0 <= kI64Lanes && 
kI64Lanes <= 3, - "kI64Lanes must be between 0 and 3"); - constexpr int kIdx0 = (-kI64Lanes) & 3; - constexpr int kIdx1 = (-kI64Lanes + 1) & 3; - constexpr int kIdx2 = (-kI64Lanes + 2) & 3; - constexpr int kIdx3 = (-kI64Lanes + 3) & 3; - constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0); - constexpr int kBlendMask = (1 << kI64Lanes) - 1; - - const DFromV d; - const Repartition dd; - return BitCast(d, Vec256{_mm256_blend_pd( - _mm256_permute4x64_pd(BitCast(dd, v).raw, kIdx3210), - Zero(dd).raw, kBlendMask)}); -} -#endif // HWY_TARGET <= HWY_AVX3 - -template HWY_AVX3) ? (1 << 2) : 0))> -HWY_INLINE VFromD TableLookupSlideUpLanes(D d, VFromD v, size_t amt) { - const Repartition du8; - - const auto idx_vec = - Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromD))); - const Indices256> idx{idx_vec.raw}; - -#if HWY_TARGET <= HWY_AVX3_DL - return TwoTablesLookupLanes(v, Zero(d), idx); -#else - return TableLookupLanes(v, idx); -#endif -} - -template -HWY_INLINE VFromD TableLookupSlideUpLanes(D d, VFromD v, size_t amt) { - const RebindToUnsigned du; - using TU = TFromD; - - const auto idx = Iota(du, static_cast(size_t{0} - amt)); -#if HWY_TARGET <= HWY_AVX3 - const auto masked_idx = - And(idx, Set(du, static_cast(MaxLanes(d) * 2 - 1))); - return TwoTablesLookupLanes(v, Zero(d), IndicesFromVec(d, masked_idx)); -#else - const auto masked_idx = And(idx, Set(du, static_cast(MaxLanes(d) - 1))); - return IfThenElseZero(RebindMask(d, idx == masked_idx), - TableLookupLanes(v, IndicesFromVec(d, masked_idx))); -#endif -} - -#if HWY_TARGET > HWY_AVX3 -template -HWY_INLINE VFromD TableLookupSlideUpLanes(D d, VFromD v, size_t amt) { - const RepartitionToNarrow dn; - return BitCast(d, TableLookupSlideUpLanes(dn, BitCast(dn, v), amt * 2)); -} -#endif // HWY_TARGET > HWY_AVX3 - -} // namespace detail - -template -HWY_API VFromD SlideUpBlocks(D d, VFromD v) { - static_assert(0 <= kBlocks && kBlocks <= 1, - "kBlocks must be between 0 and 1"); - return (kBlocks == 1) ? 
ConcatLowerLower(d, v, Zero(d)) : v; -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); - if (__builtin_constant_p(amt)) { - const auto v_lo = ConcatLowerLower(d, v, Zero(d)); - switch (amt * sizeof(TFromD)) { - case 0: - return v; - case 1: - return CombineShiftRightBytes<15>(d, v, v_lo); - case 2: - return CombineShiftRightBytes<14>(d, v, v_lo); - case 3: - return CombineShiftRightBytes<13>(d, v, v_lo); - case 4: -#if HWY_TARGET <= HWY_AVX3 - return detail::CombineShiftRightI32Lanes<7>(v, Zero(d)); -#else - return CombineShiftRightBytes<12>(d, v, v_lo); -#endif - case 5: - return CombineShiftRightBytes<11>(d, v, v_lo); - case 6: - return CombineShiftRightBytes<10>(d, v, v_lo); - case 7: - return CombineShiftRightBytes<9>(d, v, v_lo); - case 8: - return detail::SlideUpI64Lanes<1>(v); - case 9: - return CombineShiftRightBytes<7>(d, v, v_lo); - case 10: - return CombineShiftRightBytes<6>(d, v, v_lo); - case 11: - return CombineShiftRightBytes<5>(d, v, v_lo); - case 12: -#if HWY_TARGET <= HWY_AVX3 - return detail::CombineShiftRightI32Lanes<5>(v, Zero(d)); -#else - return CombineShiftRightBytes<4>(d, v, v_lo); -#endif - case 13: - return CombineShiftRightBytes<3>(d, v, v_lo); - case 14: - return CombineShiftRightBytes<2>(d, v, v_lo); - case 15: - return CombineShiftRightBytes<1>(d, v, v_lo); - case 16: - return ConcatLowerLower(d, v, Zero(d)); -#if HWY_TARGET <= HWY_AVX3 - case 20: - return detail::CombineShiftRightI32Lanes<3>(v, Zero(d)); -#endif - case 24: - return detail::SlideUpI64Lanes<3>(v); -#if HWY_TARGET <= HWY_AVX3 - case 28: - return detail::CombineShiftRightI32Lanes<1>(v, Zero(d)); -#endif - } - } - - if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) { - const Half dh; - return Combine(d, SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock), - Zero(dh)); - } -#endif - - return detail::TableLookupSlideUpLanes(d, v, amt); -} - -// ------------------------------ Slide1Up - -template -HWY_API VFromD Slide1Up(D d, VFromD v) { - const auto v_lo = ConcatLowerLower(d, v, Zero(d)); - return CombineShiftRightBytes<15>(d, v, v_lo); -} - -template -HWY_API VFromD Slide1Up(D d, VFromD v) { - const auto v_lo = ConcatLowerLower(d, v, Zero(d)); - return CombineShiftRightBytes<14>(d, v, v_lo); -} - -template -HWY_API VFromD Slide1Up(D d, VFromD v) { -#if HWY_TARGET <= HWY_AVX3 - return detail::CombineShiftRightI32Lanes<7>(v, Zero(d)); -#else - const auto v_lo = ConcatLowerLower(d, v, Zero(d)); - return CombineShiftRightBytes<12>(d, v, v_lo); -#endif -} - -template -HWY_API VFromD Slide1Up(D /*d*/, VFromD v) { - return detail::SlideUpI64Lanes<1>(v); -} - -// ------------------------------ SlideDownLanes - -namespace detail { - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_INLINE V SlideDownI64Lanes(V v) { - static_assert(0 <= kI64Lanes && kI64Lanes <= 3, - "kI64Lanes must be between 0 and 3"); - const DFromV d; - return CombineShiftRightI64Lanes(Zero(d), v); -} -#else // AVX2 -template )> -HWY_INLINE V SlideDownI64Lanes(V v) { - static_assert(0 <= kI64Lanes && kI64Lanes <= 3, - "kI64Lanes must be between 0 and 3"); - constexpr int kIdx1 = (kI64Lanes + 1) & 3; - constexpr int kIdx2 = (kI64Lanes + 2) & 3; - constexpr int kIdx3 = (kI64Lanes + 3) & 3; - constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes); - constexpr int kBlendMask = - static_cast((0xFFu << ((4 - kI64Lanes) * 2)) & 0xFFu); - - const DFromV d; - return 
V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210), - Zero(d).raw, kBlendMask)}; -} - -template )> -HWY_INLINE V SlideDownI64Lanes(V v) { - static_assert(0 <= kI64Lanes && kI64Lanes <= 3, - "kI64Lanes must be between 0 and 3"); - constexpr int kIdx1 = (kI64Lanes + 1) & 3; - constexpr int kIdx2 = (kI64Lanes + 2) & 3; - constexpr int kIdx3 = (kI64Lanes + 3) & 3; - constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes); - constexpr int kBlendMask = (0x0F << (4 - kI64Lanes)) & 0x0F; - - const DFromV d; - const Repartition dd; - return BitCast(d, Vec256{_mm256_blend_pd( - _mm256_permute4x64_pd(BitCast(dd, v).raw, kIdx3210), - Zero(dd).raw, kBlendMask)}); -} -#endif // HWY_TARGET <= HWY_AVX3 - -template HWY_AVX3) ? (1 << 2) : 0))> -HWY_INLINE VFromD TableLookupSlideDownLanes(D d, VFromD v, size_t amt) { - const Repartition du8; - - auto idx_vec = Iota(du8, static_cast(amt * sizeof(TFromD))); - -#if HWY_TARGET <= HWY_AVX3_DL - const auto result_mask = idx_vec < Set(du8, uint8_t{32}); - return VFromD{ - _mm256_maskz_permutexvar_epi8(result_mask.raw, idx_vec.raw, v.raw)}; -#else - const RebindToSigned di8; - idx_vec = - Or(idx_vec, BitCast(du8, VecFromMask(di8, BitCast(di8, idx_vec) > - Set(di8, int8_t{31})))); - return TableLookupLanes(v, Indices256>{idx_vec.raw}); -#endif -} - -template -HWY_INLINE VFromD TableLookupSlideDownLanes(D d, VFromD v, size_t amt) { - const RebindToUnsigned du; - using TU = TFromD; - - const auto idx = Iota(du, static_cast(amt)); - const auto masked_idx = And(idx, Set(du, static_cast(MaxLanes(d) - 1))); - - return IfThenElseZero(RebindMask(d, idx == masked_idx), - TableLookupLanes(v, IndicesFromVec(d, masked_idx))); -} - -#if HWY_TARGET > HWY_AVX3 -template -HWY_INLINE VFromD TableLookupSlideDownLanes(D d, VFromD v, size_t amt) { - const RepartitionToNarrow dn; - return BitCast(d, TableLookupSlideDownLanes(dn, BitCast(dn, v), amt * 2)); -} -#endif // HWY_TARGET > HWY_AVX3 - -} // namespace detail - -template -HWY_API VFromD SlideDownBlocks(D d, VFromD v) { - static_assert(0 <= kBlocks && kBlocks <= 1, - "kBlocks must be between 0 and 1"); - const Half dh; - return (kBlocks == 1) ? 
ZeroExtendVector(d, UpperHalf(dh, v)) : v; -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); - const Half dh; - if (__builtin_constant_p(amt)) { - const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v)); - switch (amt * sizeof(TFromD)) { - case 0: - return v; - case 1: - return CombineShiftRightBytes<1>(d, v_hi, v); - case 2: - return CombineShiftRightBytes<2>(d, v_hi, v); - case 3: - return CombineShiftRightBytes<3>(d, v_hi, v); - case 4: -#if HWY_TARGET <= HWY_AVX3 - return detail::CombineShiftRightI32Lanes<1>(Zero(d), v); -#else - return CombineShiftRightBytes<4>(d, v_hi, v); -#endif - case 5: - return CombineShiftRightBytes<5>(d, v_hi, v); - case 6: - return CombineShiftRightBytes<6>(d, v_hi, v); - case 7: - return CombineShiftRightBytes<7>(d, v_hi, v); - case 8: - return detail::SlideDownI64Lanes<1>(v); - case 9: - return CombineShiftRightBytes<9>(d, v_hi, v); - case 10: - return CombineShiftRightBytes<10>(d, v_hi, v); - case 11: - return CombineShiftRightBytes<11>(d, v_hi, v); - case 12: -#if HWY_TARGET <= HWY_AVX3 - return detail::CombineShiftRightI32Lanes<3>(Zero(d), v); -#else - return CombineShiftRightBytes<12>(d, v_hi, v); -#endif - case 13: - return CombineShiftRightBytes<13>(d, v_hi, v); - case 14: - return CombineShiftRightBytes<14>(d, v_hi, v); - case 15: - return CombineShiftRightBytes<15>(d, v_hi, v); - case 16: - return v_hi; -#if HWY_TARGET <= HWY_AVX3 - case 20: - return detail::CombineShiftRightI32Lanes<5>(Zero(d), v); -#endif - case 24: - return detail::SlideDownI64Lanes<3>(v); -#if HWY_TARGET <= HWY_AVX3 - case 28: - return detail::CombineShiftRightI32Lanes<7>(Zero(d), v); -#endif - } - } - - if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) { - return ZeroExtendVector( - d, SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock)); - } -#endif - - return detail::TableLookupSlideDownLanes(d, v, amt); -} - -// ------------------------------ Slide1Down - -template -HWY_API VFromD Slide1Down(D d, VFromD v) { - const Half dh; - const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v)); - return CombineShiftRightBytes<1>(d, v_hi, v); -} - -template -HWY_API VFromD Slide1Down(D d, VFromD v) { - const Half dh; - const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v)); - return CombineShiftRightBytes<2>(d, v_hi, v); -} - -template -HWY_API VFromD Slide1Down(D d, VFromD v) { -#if HWY_TARGET <= HWY_AVX3 - return detail::CombineShiftRightI32Lanes<1>(Zero(d), v); -#else - const Half dh; - const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v)); - return CombineShiftRightBytes<4>(d, v_hi, v); -#endif -} - -template -HWY_API VFromD Slide1Down(D /*d*/, VFromD v) { - return detail::SlideDownI64Lanes<1>(v); -} - -// ------------------------------ Shl (Mul, ZipLower) - -namespace detail { - -#if HWY_TARGET > HWY_AVX3 && !HWY_IDE // AVX2 or older -template -HWY_INLINE V AVX2ShlU16Vec256(V v, V bits) { - const DFromV d; - const Half dh; - const Rebind du32; - - const auto lo_shl_result = PromoteTo(du32, LowerHalf(dh, v)) - << PromoteTo(du32, LowerHalf(dh, bits)); - const auto hi_shl_result = PromoteTo(du32, UpperHalf(dh, v)) - << PromoteTo(du32, UpperHalf(dh, bits)); - return ConcatEven(d, BitCast(d, hi_shl_result), BitCast(d, lo_shl_result)); -} -#endif - -HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, - Vec256 bits) { -#if HWY_TARGET <= HWY_AVX3 || HWY_IDE - return Vec256{_mm256_sllv_epi16(v.raw, bits.raw)}; 
-#else - return AVX2ShlU16Vec256(v, bits); -#endif -} - -// 8-bit: may use the Shl overload for uint16_t. -HWY_API Vec256 Shl(hwy::UnsignedTag tag, Vec256 v, - Vec256 bits) { - const DFromV d; -#if HWY_TARGET <= HWY_AVX3_DL - (void)tag; - // kMask[i] = 0xFF >> i - alignas(16) static constexpr uint8_t kMasks[16] = { - 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; - // kShl[i] = 1 << i - alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10, - 0x20, 0x40, 0x80, 0x00}; - v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits)); - const VFromD mul = TableLookupBytes(LoadDup128(d, kShl), bits); - return VFromD{_mm256_gf2p8mul_epi8(v.raw, mul.raw)}; -#else - const Repartition dw; - using VW = VFromD; - const VW even_mask = Set(dw, 0x00FF); - const VW odd_mask = Set(dw, 0xFF00); - const VW vw = BitCast(dw, v); - const VW bits16 = BitCast(dw, bits); - // Shift even lanes in-place - const VW evens = Shl(tag, vw, And(bits16, even_mask)); - const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16)); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -#endif -} - -HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, - Vec256 bits) { - return Vec256{_mm256_sllv_epi32(v.raw, bits.raw)}; -} - -HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, - Vec256 bits) { - return Vec256{_mm256_sllv_epi64(v.raw, bits.raw)}; -} - -template -HWY_INLINE Vec256 Shl(hwy::SignedTag /*tag*/, Vec256 v, Vec256 bits) { - // Signed left shifts are the same as unsigned. - const Full256 di; - const Full256> du; - return BitCast(di, - Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); -} - -} // namespace detail - -template -HWY_API Vec256 operator<<(Vec256 v, Vec256 bits) { - return detail::Shl(hwy::TypeTag(), v, bits); -} - -// ------------------------------ Shr (MulHigh, IfThenElse, Not) - -#if HWY_TARGET > HWY_AVX3 // AVX2 -namespace detail { - -template -HWY_INLINE V AVX2ShrU16Vec256(V v, V bits) { - const DFromV d; - const Half dh; - const Rebind di32; - const Rebind du32; - - const auto lo_shr_result = - PromoteTo(du32, LowerHalf(dh, v)) >> PromoteTo(du32, LowerHalf(dh, bits)); - const auto hi_shr_result = - PromoteTo(du32, UpperHalf(dh, v)) >> PromoteTo(du32, UpperHalf(dh, bits)); - return OrderedDemote2To(d, BitCast(di32, lo_shr_result), - BitCast(di32, hi_shr_result)); -} - -} // namespace detail -#endif - -HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_srlv_epi16(v.raw, bits.raw)}; -#else - return detail::AVX2ShrU16Vec256(v, bits); -#endif -} - -// 8-bit uses 16-bit shifts. 
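// Illustrative sketch (not from the original Highway source) of that trick for
// one pair of 8-bit lanes held in a 16-bit word: the even (low) byte is masked
// before shifting so nothing leaks in from the odd byte, while the odd (high)
// byte is shifted as part of the whole 16-bit word and its low-byte garbage is
// discarded when the results are recombined via OddEven. Shift counts are 0-7.
inline void ShrU8Pair(uint8_t even, uint8_t odd, int even_bits, int odd_bits,
                      uint8_t* even_out, uint8_t* odd_out) {
  const uint16_t w = static_cast<uint16_t>(even | (odd << 8));
  *even_out = static_cast<uint8_t>((w & 0x00FF) >> even_bits);
  *odd_out = static_cast<uint8_t>((w >> odd_bits) >> 8);
}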
-HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { - const DFromV d; - const RepartitionToWide dw; - using VW = VFromD; - const VW mask = Set(dw, 0x00FF); - const VW vw = BitCast(dw, v); - const VW bits16 = BitCast(dw, bits); - const VW evens = And(vw, mask) >> And(bits16, mask); - // Shift odd lanes in-place - const VW odds = vw >> ShiftRight<8>(bits16); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -} - -HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { - return Vec256{_mm256_srlv_epi32(v.raw, bits.raw)}; -} - -HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { - return Vec256{_mm256_srlv_epi64(v.raw, bits.raw)}; -} - -#if HWY_TARGET > HWY_AVX3 // AVX2 -namespace detail { - -template -HWY_INLINE V AVX2ShrI16Vec256(V v, V bits) { - const DFromV d; - const Half dh; - const Rebind di32; - - const auto lo_shr_result = - PromoteTo(di32, LowerHalf(dh, v)) >> PromoteTo(di32, LowerHalf(dh, bits)); - const auto hi_shr_result = - PromoteTo(di32, UpperHalf(dh, v)) >> PromoteTo(di32, UpperHalf(dh, bits)); - return OrderedDemote2To(d, lo_shr_result, hi_shr_result); -} - -} // namespace detail -#endif - -HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_srav_epi16(v.raw, bits.raw)}; -#else - return detail::AVX2ShrI16Vec256(v, bits); -#endif -} - -// 8-bit uses 16-bit shifts. -HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { - const DFromV d; - const RepartitionToWide dw; - const RebindToUnsigned dw_u; - using VW = VFromD; - const VW mask = Set(dw, 0x00FF); - const VW vw = BitCast(dw, v); - const VW bits16 = BitCast(dw, bits); - const VW evens = ShiftRight<8>(ShiftLeft<8>(vw)) >> And(bits16, mask); - // Shift odd lanes in-place - const VW odds = vw >> BitCast(dw, ShiftRight<8>(BitCast(dw_u, bits16))); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -} - -HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { - return Vec256{_mm256_srav_epi32(v.raw, bits.raw)}; -} - -HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { -#if HWY_TARGET <= HWY_AVX3 - return Vec256{_mm256_srav_epi64(v.raw, bits.raw)}; -#else - const DFromV d; - return detail::SignedShr(d, v, bits); -#endif -} - -HWY_INLINE Vec256 MulEven(const Vec256 a, - const Vec256 b) { - const Full256 du64; - const RepartitionToNarrow du32; - const auto maskL = Set(du64, 0xFFFFFFFFULL); - const auto a32 = BitCast(du32, a); - const auto b32 = BitCast(du32, b); - // Inputs for MulEven: we only need the lower 32 bits - const auto aH = Shuffle2301(a32); - const auto bH = Shuffle2301(b32); - - // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need - // the even (lower 64 bits of every 128-bit block) results. 
See - // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat - const auto aLbL = MulEven(a32, b32); - const auto w3 = aLbL & maskL; - - const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); - const auto w2 = t2 & maskL; - const auto w1 = ShiftRight<32>(t2); - - const auto t = MulEven(a32, bH) + w2; - const auto k = ShiftRight<32>(t); - - const auto mulH = MulEven(aH, bH) + w1 + k; - const auto mulL = ShiftLeft<32>(t) + w3; - return InterleaveLower(mulL, mulH); -} - -HWY_INLINE Vec256 MulOdd(const Vec256 a, - const Vec256 b) { - const Full256 du64; - const RepartitionToNarrow du32; - const auto maskL = Set(du64, 0xFFFFFFFFULL); - const auto a32 = BitCast(du32, a); - const auto b32 = BitCast(du32, b); - // Inputs for MulEven: we only need bits [95:64] (= upper half of input) - const auto aH = Shuffle2301(a32); - const auto bH = Shuffle2301(b32); - - // Same as above, but we're using the odd results (upper 64 bits per block). - const auto aLbL = MulEven(a32, b32); - const auto w3 = aLbL & maskL; - - const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); - const auto w2 = t2 & maskL; - const auto w1 = ShiftRight<32>(t2); - - const auto t = MulEven(a32, bH) + w2; - const auto k = ShiftRight<32>(t); - - const auto mulH = MulEven(aH, bH) + w1 + k; - const auto mulL = ShiftLeft<32>(t) + w3; - return InterleaveUpper(du64, mulL, mulH); -} - -// ------------------------------ WidenMulPairwiseAdd -template -HWY_API VFromD WidenMulPairwiseAdd(D /*d32*/, Vec256 a, - Vec256 b) { - return VFromD{_mm256_madd_epi16(a.raw, b.raw)}; -} - -// ------------------------------ SatWidenMulPairwiseAdd - -template -HWY_API VFromD SatWidenMulPairwiseAdd( - DI16 /* tag */, VFromD> a, - VFromD> b) { - return VFromD{_mm256_maddubs_epi16(a.raw, b.raw)}; -} - -// ------------------------------ ReorderWidenMulAccumulate -template -HWY_API VFromD ReorderWidenMulAccumulate(D d, Vec256 a, - Vec256 b, - const VFromD sum0, - VFromD& /*sum1*/) { - (void)d; -#if HWY_TARGET <= HWY_AVX3_DL - return VFromD{_mm256_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; -#else - return sum0 + WidenMulPairwiseAdd(d, a, b); -#endif -} - -// ------------------------------ RearrangeToOddPlusEven -HWY_API Vec256 RearrangeToOddPlusEven(const Vec256 sum0, - Vec256 /*sum1*/) { - return sum0; // invariant already holds -} - -HWY_API Vec256 RearrangeToOddPlusEven(const Vec256 sum0, - Vec256 /*sum1*/) { - return sum0; // invariant already holds -} - -// ------------------------------ SumOfMulQuadAccumulate - -#if HWY_TARGET <= HWY_AVX3_DL - -template -HWY_API VFromD SumOfMulQuadAccumulate( - DI32 /*di32*/, VFromD> a_u, - VFromD> b_i, VFromD sum) { - return VFromD{_mm256_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; -} - -#endif - -// ================================================== CONVERT - -// ------------------------------ Promotions (part w/ narrow lanes -> full) - -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtps_pd(v.raw)}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepi32_pd(v.raw)}; -} - -// Unsigned: zero-extend. -// Note: these have 3 cycle latency; if inputs are already split across the -// 128 bit blocks (in their upper/lower halves), then Zip* would be faster. 
-template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepu8_epi16(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepu8_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepu16_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepu32_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec64 v) { - return VFromD{_mm256_cvtepu16_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec32 v) { - return VFromD{_mm256_cvtepu8_epi64(v.raw)}; -} - -// Signed: replicate sign bit. -// Note: these have 3 cycle latency; if inputs are already split across the -// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by -// signed shift would be faster. -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepi8_epi16(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepi8_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepi16_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm256_cvtepi32_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec64 v) { - return VFromD{_mm256_cvtepi16_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec32 v) { - return VFromD{_mm256_cvtepi8_epi64(v.raw)}; -} - -// ------------------------------ Demotions (full -> part w/ narrow lanes) - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw); - // Concatenating lower halves of both 128-bit blocks afterward is more - // efficient than an extra input with low block = high block of v. 
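  // (Illustrative note, not from the original source: 0x88 is
  // _MM_SHUFFLE(2, 0, 2, 0), so the low 128 bits of the permute hold the lower
  // 64-bit half of each block, which _mm256_castsi256_si128 then extracts.)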
- return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))}; -} - -template -HWY_API VFromD DemoteTo(D dn, Vec256 v) { - const DFromV d; - const RebindToSigned di; - return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw); - return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw); - // Concatenate lower 64 bits of each 128-bit block - const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88); - const __m128i i16 = _mm256_castsi256_si128(i16_concat); - return VFromD{_mm_packus_epi16(i16, i16)}; -} - -template -HWY_API VFromD DemoteTo(D dn, Vec256 v) { -#if HWY_TARGET <= HWY_AVX3 - (void)dn; - return VFromD{_mm256_cvtusepi32_epi8(v.raw)}; -#else - const DFromV d; - const RebindToSigned di; - return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); -#endif -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw); - return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))}; -} - -template -HWY_API VFromD DemoteTo(D dn, Vec256 v) { - const DFromV d; - const RebindToSigned di; - return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFu)))); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw); - // Concatenate lower 64 bits of each 128-bit block - const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88); - const __m128i i16 = _mm256_castsi256_si128(i16_concat); - return VFromD{_mm_packs_epi16(i16, i16)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw); - return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; -} - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtsepi64_epi32(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtsepi64_epi16(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtsepi64_epi8(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtusepi64_epi32(v.raw)}; -} -template -HWY_API 
VFromD DemoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtusepi64_epi16(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtusepi64_epi8(v.raw)}; -} -#endif // HWY_TARGET <= HWY_AVX3 - -#ifndef HWY_DISABLE_F16C - -// Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". -// 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") - -template -HWY_API VFromD DemoteTo(D df16, Vec256 v) { - (void)df16; - return VFromD{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; -} - -HWY_DIAGNOSTICS(pop) - -#endif // HWY_DISABLE_F16C - -template -HWY_API VFromD DemoteTo(D dbf16, Vec256 v) { - // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16. - const Rebind di32; - const Rebind du32; // for logical shift right - const Rebind du16; - const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); - return BitCast(dbf16, DemoteTo(du16, bits_in_32)); -} - -template -HWY_API VFromD ReorderDemote2To(D dbf16, Vec256 a, Vec256 b) { - // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16. - const RebindToUnsigned du16; - const Repartition du32; - const Vec256 b_in_even = ShiftRight<16>(BitCast(du32, b)); - return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); -} - -template -HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, - Vec256 b) { - return VFromD{_mm256_packs_epi32(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, - Vec256 b) { - return VFromD{_mm256_packus_epi32(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec256 a, - Vec256 b) { - const DFromV d; - const RebindToSigned di; - const auto max_i32 = Set(d, 0x7FFFFFFFu); - return ReorderDemote2To(dn, BitCast(di, Min(a, max_i32)), - BitCast(di, Min(b, max_i32))); -} - -template -HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, - Vec256 b) { - return VFromD{_mm256_packs_epi16(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, - Vec256 b) { - return VFromD{_mm256_packus_epi16(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec256 a, - Vec256 b) { - const DFromV d; - const RebindToSigned di; - const auto max_i16 = Set(d, 0x7FFFu); - return ReorderDemote2To(dn, BitCast(di, Min(a, max_i16)), - BitCast(di, Min(b, max_i16))); -} - -#if HWY_TARGET > HWY_AVX3 -template -HWY_API Vec256 ReorderDemote2To(D dn, Vec256 a, - Vec256 b) { - const DFromV di64; - const RebindToUnsigned du64; - const Half dnh; - const Repartition dn_f; - - // Negative values are saturated by first saturating their bitwise inverse - // and then inverting the saturation result - const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); - const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); - const auto saturated_a = Xor( - invert_mask_a, - detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); - const auto saturated_b = Xor( - invert_mask_b, - detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); - - return BitCast(dn, - Vec256{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, - BitCast(dn_f, saturated_b).raw, - _MM_SHUFFLE(2, 0, 2, 0))}); -} - -template -HWY_API Vec256 ReorderDemote2To(D dn, Vec256 a, - Vec256 b) { - const DFromV di64; - const RebindToUnsigned du64; - const Half dnh; - const Repartition dn_f; - - const auto saturated_a = detail::DemoteFromU64Saturate( - dnh, BitCast(du64, 
AndNot(BroadcastSignBit(a), a))); - const auto saturated_b = detail::DemoteFromU64Saturate( - dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); - - return BitCast(dn, - Vec256{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, - BitCast(dn_f, saturated_b).raw, - _MM_SHUFFLE(2, 0, 2, 0))}); -} - -template -HWY_API Vec256 ReorderDemote2To(D dn, Vec256 a, - Vec256 b) { - const Half dnh; - const Repartition dn_f; - - const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); - const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); - - return BitCast(dn, - Vec256{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, - BitCast(dn_f, saturated_b).raw, - _MM_SHUFFLE(2, 0, 2, 0))}); -} -#endif // HWY_TARGET > HWY_AVX3 - -template ), - HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2), - HWY_IF_T_SIZE_ONE_OF_V(V, - (1 << 1) | (1 << 2) | (1 << 4) | - ((HWY_TARGET > HWY_AVX3) ? (1 << 8) : 0))> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - return VFromD{_mm256_permute4x64_epi64(ReorderDemote2To(d, a, b).raw, - _MM_SHUFFLE(3, 1, 2, 0))}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtpd_ps(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { - const Full256 d64; - const auto clamped = detail::ClampF64ToI32Max(d64, v); - return VFromD{_mm256_cvttpd_epi32(clamped.raw)}; -} - -// For already range-limited input [0, 255]. -HWY_API Vec128 U8FromU32(const Vec256 v) { - const Full256 d32; - const Full64 d8; - alignas(32) static constexpr uint32_t k8From32[8] = { - 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u}; - // Place first four bytes in lo[0], remaining 4 in hi[1]. - const auto quad = TableLookupBytes(v, Load(d32, k8From32)); - // Interleave both quadruplets - OR instead of unpack reduces port5 pressure. - const auto lo = LowerHalf(quad); - const auto hi = UpperHalf(Half(), quad); - return BitCast(d8, LowerHalf(lo | hi)); -} - -// ------------------------------ Truncations - -namespace detail { - -// LO and HI each hold four indices of bytes within a 128-bit block. -template -HWY_INLINE Vec128 LookupAndConcatHalves(Vec256 v) { - const Full256 d32; - -#if HWY_TARGET <= HWY_AVX3_DL - alignas(32) static constexpr uint32_t kMap[8] = { - LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0}; - const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw); -#else - alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u, - ~0u, ~0u, LO, HI}; - const auto quad = TableLookupBytes(v, Load(d32, kMap)); - const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC); - // Possible alternative: - // const auto lo = LowerHalf(quad); - // const auto hi = UpperHalf(Half(), quad); - // const auto result = lo | hi; -#endif - - return Vec128{_mm256_castsi256_si128(result)}; -} - -// LO and HI each hold two indices of bytes within a 128-bit block. 
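The two-input demotions above pack a pair of full vectors into one narrower vector. A hedged sketch of how they are typically called (helper and buffer names are illustrative; standard hwy/highway.h static-dispatch setup assumed):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Narrow two i32 vectors into one i16 vector. ReorderDemote2To maps directly
// to _mm256_packs_epi32, so its output interleaves 128-bit blocks;
// OrderedDemote2To adds the permute4x64 fixup so lanes stay in input order.
void Demote2Sketch(const int32_t* HWY_RESTRICT in, int16_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int32_t> d32;
  const hn::Repartition<int16_t, decltype(d32)> d16;  // same bytes, 2x lanes
  const auto a = hn::LoadU(d32, in);
  const auto b = hn::LoadU(d32, in + hn::Lanes(d32));
  hn::StoreU(hn::OrderedDemote2To(d16, a, b), d16, out);
}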
-template -HWY_INLINE Vec128 LookupAndConcatQuarters(Vec256 v) { - const Full256 d16; - -#if HWY_TARGET <= HWY_AVX3_DL - alignas(32) static constexpr uint16_t kMap[16] = { - LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - const auto result = _mm256_permutexvar_epi8(Load(d16, kMap).raw, v.raw); - return LowerHalf(Vec128{_mm256_castsi256_si128(result)}); -#else - constexpr uint16_t ff = static_cast(~0u); - alignas(32) static constexpr uint16_t kMap[16] = { - LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff}; - const auto quad = TableLookupBytes(v, Load(d16, kMap)); - const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC); - const auto half = _mm256_castsi256_si128(mixed); - return LowerHalf(Vec128{_mm_packus_epi32(half, half)}); -#endif -} - -} // namespace detail - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { - const Full256 d32; -#if HWY_TARGET <= HWY_AVX3_DL - alignas(32) static constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, - 0, 0, 0, 0}; - const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw); - return LowerHalf(LowerHalf(LowerHalf(Vec256{result}))); -#else - alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u, - 0x0800FFFFu, ~0u, ~0u, ~0u}; - const auto quad = TableLookupBytes(v, Load(d32, kMap)); - const auto lo = LowerHalf(quad); - const auto hi = UpperHalf(Half(), quad); - const auto result = lo | hi; - return LowerHalf(LowerHalf(Vec128{result.raw})); -#endif -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { - const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v); - return VFromD{result.raw}; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { - const Full256 d32; - alignas(32) static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6}; - const auto v32 = - TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven)); - return LowerHalf(Vec256{v32.raw}); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { - const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v); - return VFromD{full.raw}; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { - const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v); - return VFromD{full.raw}; -} - -template -HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { - const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v); - return VFromD{full.raw}; -} - -// ------------------------------ Integer <=> fp (ShiftRight, OddEven) - -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD ConvertTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtepu16_ph(v.raw)}; -} -template -HWY_API VFromD ConvertTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtepi16_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API VFromD ConvertTo(D /* tag */, Vec256 v) { - return VFromD{_mm256_cvtepi32_ps(v.raw)}; -} - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API VFromD ConvertTo(D /*df*/, Vec256 v) { - return VFromD{_mm256_cvtepu32_ps(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D /*dd*/, Vec256 v) { - return VFromD{_mm256_cvtepi64_pd(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D /*dd*/, Vec256 v) { - return VFromD{_mm256_cvtepu64_pd(v.raw)}; -} -#endif // HWY_TARGET <= HWY_AVX3 - -// Truncates (rounds toward zero). 
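DemoteTo and TruncateTo differ in how they narrow: the former saturates to the target range, the latter keeps only the low bits. A small sketch under the usual static-dispatch assumptions (function and pointer names are illustrative):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// For an input lane of 70000: DemoteTo yields 65535 (saturated),
// TruncateTo yields 4464 (70000 & 0xFFFF).
void NarrowSketch(const uint32_t* HWY_RESTRICT in,
                  uint16_t* HWY_RESTRICT saturated,
                  uint16_t* HWY_RESTRICT truncated) {
  const hn::ScalableTag<uint32_t> d32;
  const hn::Rebind<uint16_t, decltype(d32)> d16;  // half as many bytes
  const auto v = hn::LoadU(d32, in);
  hn::StoreU(hn::DemoteTo(d16, v), d16, saturated);
  hn::StoreU(hn::TruncateTo(d16, v), d16, truncated);
}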
- -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD ConvertTo(D d, Vec256 v) { - return detail::FixConversionOverflow(d, v, - VFromD{_mm256_cvttph_epi16(v.raw)}); -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API VFromD ConvertTo(D d, Vec256 v) { - return detail::FixConversionOverflow(d, v, - VFromD{_mm256_cvttps_epi32(v.raw)}); -} - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API VFromD ConvertTo(D di, Vec256 v) { - return detail::FixConversionOverflow(di, v, - VFromD{_mm256_cvttpd_epi64(v.raw)}); -} -#endif // HWY_TARGET <= HWY_AVX3 - -HWY_API Vec256 NearestInt(const Vec256 v) { - const Full256 di; - return detail::FixConversionOverflow( - di, v, Vec256{_mm256_cvtps_epi32(v.raw)}); -} - -#ifndef HWY_DISABLE_F16C - -template -HWY_API VFromD PromoteTo(D df32, Vec128 v) { - (void)df32; -#if HWY_HAVE_FLOAT16 - const RebindToUnsigned> du16; - return VFromD{_mm256_cvtph_ps(BitCast(du16, v).raw)}; -#else - return VFromD{_mm256_cvtph_ps(v.raw)}; -#endif // HWY_HAVE_FLOAT16 -} - -#endif // HWY_DISABLE_F16C - -template -HWY_API VFromD PromoteTo(D df32, Vec128 v) { - const Rebind du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); -} - -// ================================================== CRYPTO - -#if !defined(HWY_DISABLE_PCLMUL_AES) - -HWY_API Vec256 AESRound(Vec256 state, - Vec256 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_aesenc_epi128(state.raw, round_key.raw)}; -#else - const Full256 d; - const Half d2; - return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESRound(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -HWY_API Vec256 AESLastRound(Vec256 state, - Vec256 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_aesenclast_epi128(state.raw, round_key.raw)}; -#else - const Full256 d; - const Half d2; - return Combine(d, - AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESLastRound(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -HWY_API Vec256 AESRoundInv(Vec256 state, - Vec256 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_aesdec_epi128(state.raw, round_key.raw)}; -#else - const Full256 d; - const Half d2; - return Combine(d, AESRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESRoundInv(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -HWY_API Vec256 AESLastRoundInv(Vec256 state, - Vec256 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_aesdeclast_epi128(state.raw, round_key.raw)}; -#else - const Full256 d; - const Half d2; - return Combine( - d, AESLastRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESLastRoundInv(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -template )> -HWY_API V AESInvMixColumns(V state) { - const DFromV d; -#if HWY_TARGET <= HWY_AVX3_DL - // On AVX3_DL, it is more efficient to do an InvMixColumns operation for a - // 256-bit or 512-bit vector by doing a AESLastRound operation - // (_mm256_aesenclast_epi128/_mm512_aesenclast_epi128) followed by a - // AESRoundInv operation (_mm256_aesdec_epi128/_mm512_aesdec_epi128) than to - // split the vector into 128-bit vectors, carrying out multiple - // _mm_aesimc_si128 operations, and then combining the _mm_aesimc_si128 - // results back into a 256-bit or 512-bit vector. 
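Returning to the float/integer conversions defined just above: ConvertTo toward an integer type truncates toward zero and runs the FixConversionOverflow fixup, while NearestInt rounds to nearest. A usage sketch (illustrative names, standard static-dispatch setup assumed):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ConvertSketch(const float* HWY_RESTRICT in, int32_t* HWY_RESTRICT trunc,
                   int32_t* HWY_RESTRICT rounded) {
  const hn::ScalableTag<float> df;
  const hn::RebindToSigned<decltype(df)> di;  // i32 with the same lane count
  const auto v = hn::LoadU(df, in);
  hn::StoreU(hn::ConvertTo(di, v), di, trunc);  // truncates toward zero
  hn::StoreU(hn::NearestInt(v), di, rounded);   // _mm256_cvtps_epi32 rounding
}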
- const auto zero = Zero(d); - return AESRoundInv(AESLastRound(state, zero), zero); -#else - const Half dh; - return Combine(d, AESInvMixColumns(UpperHalf(dh, state)), - AESInvMixColumns(LowerHalf(dh, state))); -#endif -} - -template -HWY_API Vec256 AESKeyGenAssist(Vec256 v) { - const Full256 d; -#if HWY_TARGET <= HWY_AVX3_DL - alignas(16) static constexpr uint8_t kRconXorMask[16] = { - 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; - alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { - 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; - const Repartition du32; - const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); - const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask)); - return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle)); -#else - const Half d2; - return Combine(d, AESKeyGenAssist(UpperHalf(d2, v)), - AESKeyGenAssist(LowerHalf(v))); -#endif -} - -HWY_API Vec256 CLMulLower(Vec256 a, Vec256 b) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)}; -#else - const Full256 d; - const Half d2; - return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)), - CLMulLower(LowerHalf(a), LowerHalf(b))); -#endif -} - -HWY_API Vec256 CLMulUpper(Vec256 a, Vec256 b) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec256{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)}; -#else - const Full256 d; - const Half d2; - return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)), - CLMulUpper(LowerHalf(a), LowerHalf(b))); -#endif -} - -#endif // HWY_DISABLE_PCLMUL_AES - -// ================================================== MISC - -#if HWY_TARGET <= HWY_AVX3 - -// ------------------------------ LoadMaskBits - -// `p` points to at least 8 readable bytes, not all of which need be valid. -template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - constexpr size_t kN = MaxLanes(d); - constexpr size_t kNumBytes = (kN + 7) / 8; - - uint64_t mask_bits = 0; - CopyBytes(bits, &mask_bits); - - if (kN < 8) { - mask_bits &= (1ull << kN) - 1; - } - - return MFromD::FromBits(mask_bits); -} - -// ------------------------------ StoreMaskBits - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - constexpr size_t kN = MaxLanes(d); - constexpr size_t kNumBytes = (kN + 7) / 8; - - CopyBytes(&mask.raw, bits); - - // Non-full byte, need to clear the undefined upper bits. - if (kN < 8) { - const int mask_bits = static_cast((1ull << kN) - 1); - bits[0] = static_cast(bits[0] & mask_bits); - } - return kNumBytes; -} - -// ------------------------------ Mask testing - -template -HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(static_cast(mask.raw)); -} - -template -HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - return Num0BitsBelowLS1Bit_Nonzero32(mask.raw); -} - -template -HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { - return mask.raw ? static_cast(FindKnownFirstTrue(d, mask)) - : intptr_t{-1}; -} - -template -HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask.raw); -} - -template -HWY_API intptr_t FindLastTrue(D d, MFromD mask) { - return mask.raw ? static_cast(FindKnownLastTrue(d, mask)) - : intptr_t{-1}; -} - -// Beware: the suffix indicates the number of mask bits, not lane size! 
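Before the mask utilities, a sketch of how the AES and carry-less multiply wrappers above are used. On AVX3_DL they act on the whole 256-bit vector (VAES/VPCLMULQDQ); otherwise, as the code above shows, they split the work into two 128-bit halves. This assumes a build where these ops are available; names are illustrative.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// One AES encryption round on two 16-byte blocks held in one 256-bit vector.
void AesRoundSketch(const uint8_t* HWY_RESTRICT state_in,
                    const uint8_t* HWY_RESTRICT round_key,
                    uint8_t* HWY_RESTRICT state_out) {
  const hn::ScalableTag<uint8_t> d;
  const auto state = hn::LoadU(d, state_in);
  const auto key = hn::LoadU(d, round_key);
  hn::StoreU(hn::AESRound(state, key), d, state_out);
}

// Carry-less multiply of the lower 64-bit lane of each 128-bit block.
void ClMulSketch(const uint64_t* HWY_RESTRICT a, const uint64_t* HWY_RESTRICT b,
                 uint64_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint64_t> d;
  hn::StoreU(hn::CLMulLower(hn::LoadU(d, a), hn::LoadU(d, b)), d, out);
}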
- -namespace detail { - -template -HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestz_mask32_u8(mask.raw, mask.raw); -#else - return mask.raw == 0; -#endif -} -template -HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestz_mask16_u8(mask.raw, mask.raw); -#else - return mask.raw == 0; -#endif -} -template -HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestz_mask8_u8(mask.raw, mask.raw); -#else - return mask.raw == 0; -#endif -} -template -HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256 mask) { - return (uint64_t{mask.raw} & 0xF) == 0; -} - -} // namespace detail - -template -HWY_API bool AllFalse(D /* tag */, MFromD mask) { - return detail::AllFalse(hwy::SizeTag)>(), mask); -} - -namespace detail { - -template -HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestc_mask32_u8(mask.raw, mask.raw); -#else - return mask.raw == 0xFFFFFFFFu; -#endif -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestc_mask16_u8(mask.raw, mask.raw); -#else - return mask.raw == 0xFFFFu; -#endif -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestc_mask8_u8(mask.raw, mask.raw); -#else - return mask.raw == 0xFFu; -#endif -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256 mask) { - // Cannot use _kortestc because we have less than 8 mask bits. - return mask.raw == 0xFu; -} - -} // namespace detail - -template -HWY_API bool AllTrue(D /* tag */, const MFromD mask) { - return detail::AllTrue(hwy::SizeTag)>(), mask); -} - -// ------------------------------ Compress - -// 16-bit is defined in x86_512 so we can use 512-bit vectors. - -template -HWY_API Vec256 Compress(Vec256 v, Mask256 mask) { - return Vec256{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; -} - -HWY_API Vec256 Compress(Vec256 v, Mask256 mask) { - return Vec256{_mm256_maskz_compress_ps(mask.raw, v.raw)}; -} - -template -HWY_API Vec256 Compress(Vec256 v, Mask256 mask) { - // See CompressIsPartition. - alignas(16) static constexpr uint64_t packed_array[16] = { - // PrintCompress64x4NibbleTables - 0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120, - 0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310, - 0x00001032, 0x00001320, 0x00000321, 0x00003210}; - - // For lane i, shift the i-th 4-bit index down to bits [0, 2) - - // _mm256_permutexvar_epi64 will ignore the upper bits. - const DFromV d; - const RebindToUnsigned du64; - const auto packed = Set(du64, packed_array[mask.raw]); - alignas(64) static constexpr uint64_t shifts[4] = {0, 4, 8, 12}; - const auto indices = Indices256{(packed >> Load(du64, shifts)).raw}; - return TableLookupLanes(v, indices); -} - -// ------------------------------ CompressNot (Compress) - -// Implemented in x86_512 for lane size != 8. - -template -HWY_API Vec256 CompressNot(Vec256 v, Mask256 mask) { - // See CompressIsPartition. 
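Typical use of the mask tests defined here (kortest-based on AVX3, movemask bits on AVX2). Sketch only; the predicate and names are illustrative.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

size_t CountNegativeSketch(const float* HWY_RESTRICT p) {
  const hn::ScalableTag<float> d;
  const auto m = hn::Lt(hn::LoadU(d, p), hn::Zero(d));
  if (hn::AllFalse(d, m)) return 0;            // early-out: no negatives
  if (hn::AllTrue(d, m)) return hn::Lanes(d);  // every lane negative
  return hn::CountTrue(d, m);                  // popcount of the mask bits
}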
- alignas(16) static constexpr uint64_t packed_array[16] = { - // PrintCompressNot64x4NibbleTables - 0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031, - 0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102, - 0x00003210, 0x00003201, 0x00003210, 0x00003210}; - - // For lane i, shift the i-th 4-bit index down to bits [0, 2) - - // _mm256_permutexvar_epi64 will ignore the upper bits. - const DFromV d; - const RebindToUnsigned du64; - const auto packed = Set(du64, packed_array[mask.raw]); - alignas(32) static constexpr uint64_t shifts[4] = {0, 4, 8, 12}; - const auto indices = Indices256{(packed >> Load(du64, shifts)).raw}; - return TableLookupLanes(v, indices); -} - -// ------------------------------ CompressStore (defined in x86_512) -// ------------------------------ CompressBlendedStore (defined in x86_512) -// ------------------------------ CompressBitsStore (defined in x86_512) - -#else // AVX2 - -// ------------------------------ LoadMaskBits (TestBit) - -namespace detail { - -// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_V_SIZE. -template -HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) { - const Full256 d; - const RebindToUnsigned du; - const Repartition du32; - const auto vbits = BitCast(du, Set(du32, static_cast(mask_bits))); - - // Replicate bytes 8x such that each byte contains the bit that governs it. - const Repartition du64; - alignas(32) static constexpr uint64_t kRep8[4] = { - 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull, - 0x0303030303030303ull}; - const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8))); - - alignas(32) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, - 1, 2, 4, 8, 16, 32, 64, 128}; - return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); -} - -template -HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) { - const Full256 d; - const RebindToUnsigned du; - alignas(32) static constexpr uint16_t kBit[16] = { - 1, 2, 4, 8, 16, 32, 64, 128, - 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; - const auto vmask_bits = Set(du, static_cast(mask_bits)); - return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); -} - -template -HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) { - const Full256 d; - const RebindToUnsigned du; - alignas(32) static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - const auto vmask_bits = Set(du, static_cast(mask_bits)); - return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); -} - -template -HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) { - const Full256 d; - const RebindToUnsigned du; - alignas(32) static constexpr uint64_t kBit[8] = {1, 2, 4, 8}; - return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); -} - -} // namespace detail - -// `p` points to at least 8 readable bytes, not all of which need be valid. -template -HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { - constexpr size_t kN = MaxLanes(d); - constexpr size_t kNumBytes = (kN + 7) / 8; - - uint64_t mask_bits = 0; - CopyBytes(bits, &mask_bits); - - if (kN < 8) { - mask_bits &= (1ull << kN) - 1; - } - - return detail::LoadMaskBits256>(mask_bits); -} - -// ------------------------------ StoreMaskBits - -namespace detail { - -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - const Full256 d; - const Full256 d8; - const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw; - // Prevent sign-extension of 32-bit masks because the intrinsic returns int. 
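The bit-serialization helpers in this section pack one mask bit per lane, lane 0 in the LSB. A round-trip sketch, assuming the standard static-dispatch setup; per the comments in this section, the byte buffer must provide at least 8 readable/writable bytes.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void MaskBitsSketch(const int32_t* HWY_RESTRICT p) {
  const hn::ScalableTag<int32_t> d;
  const auto m = hn::Gt(hn::LoadU(d, p), hn::Zero(d));
  uint8_t bits[8] = {0};                      // at least 8 writable bytes
  const size_t bytes = hn::StoreMaskBits(d, m, bits);  // (kN + 7) / 8
  (void)bytes;
  const auto m2 = hn::LoadMaskBits(d, bits);  // reconstruct the same mask
  const intptr_t first = hn::FindFirstTrue(d, m2);  // first set lane, or -1
  (void)first;
}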
- return static_cast(_mm256_movemask_epi8(sign_bits)); -} - -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { -#if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2) - const Full256 d; - const Full256 d8; - const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - const uint64_t sign_bits8 = BitsFromMask(mask8); - // Skip the bits from the lower byte of each u16 (better not to use the - // same packs_epi16 as SSE4, because that requires an extra swizzle here). - return _pext_u32(static_cast(sign_bits8), 0xAAAAAAAAu); -#else - // Slow workaround for when BMI2 is disabled - // Remove useless lower half of each u16 while preserving the sign bit. - // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes. - const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256()); - // Move odd qwords (value zero) to top so they don't affect the mask value. - const auto compressed = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0))); - return static_cast(_mm_movemask_epi8(compressed)); -#endif // HWY_ARCH_X86_64 -} - -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - const Full256 d; - const Full256 df; - const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; - return static_cast(_mm256_movemask_ps(sign_bits)); -} - -template -HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { - const Full256 d; - const Full256 df; - const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw; - return static_cast(_mm256_movemask_pd(sign_bits)); -} - -} // namespace detail - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { - constexpr size_t N = Lanes(d); - constexpr size_t kNumBytes = (N + 7) / 8; - - const uint64_t mask_bits = detail::BitsFromMask(mask); - CopyBytes(&mask_bits, bits); - return kNumBytes; -} - -// ------------------------------ Mask testing - -// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask -// lane is 0 or ~0. -template -HWY_API bool AllFalse(D d, MFromD mask) { - const Repartition d8; - const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return detail::BitsFromMask(mask8) == 0; -} - -template -HWY_API bool AllFalse(D /* tag */, MFromD mask) { - // Cheaper than PTEST, which is 2 uop / 3L. - return detail::BitsFromMask(mask) == 0; -} - -template -HWY_API bool AllTrue(D d, MFromD mask) { - const Repartition d8; - const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return detail::BitsFromMask(mask8) == (1ull << 32) - 1; -} -template -HWY_API bool AllTrue(D d, MFromD mask) { - constexpr uint64_t kAllBits = (1ull << Lanes(d)) - 1; - return detail::BitsFromMask(mask) == kAllBits; -} - -template -HWY_API size_t CountTrue(D d, MFromD mask) { - const Repartition d8; - const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask))); - return PopCount(detail::BitsFromMask(mask8)) >> 1; -} -template -HWY_API size_t CountTrue(D /* tag */, MFromD mask) { - return PopCount(detail::BitsFromMask(mask)); -} - -template -HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); - return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); -} - -template -HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); - return mask_bits ? 
intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; -} - -template -HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); - return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); -} - -template -HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { - const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); - return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) - : -1; -} - -// ------------------------------ Compress, CompressBits - -namespace detail { - -template -HWY_INLINE Vec256 IndicesFromBits256(uint64_t mask_bits) { - const Full256 d32; - // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT - // of SetTableIndices would require 8 KiB, a large part of L1D. The other - // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles) - // and unavailable in 32-bit builds. We instead compress each index into 4 - // bits, for a total of 1 KiB. - alignas(16) static constexpr uint32_t packed_array[256] = { - // PrintCompress32x8Tables - 0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8, - 0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98, - 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8, - 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98, - 0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8, - 0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98, - 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8, - 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98, - 0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8, - 0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98, - 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8, - 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98, - 0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8, - 0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98, - 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8, - 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98, - 0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8, - 0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98, - 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8, - 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98, - 0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8, - 0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98, - 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8, - 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98, - 0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8, - 0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98, - 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8, - 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98, - 0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8, - 0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98, - 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8, - 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98, - 0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8, - 
0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98, - 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8, - 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98, - 0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8, - 0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98, - 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8, - 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98, - 0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8, - 0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98, - 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98}; - - // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31. - // Just shift each copy of the 32 bit LUT to extract its 4-bit fields. - // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing - // latency, it may be faster to use LoadDup128 and PSHUFB. - const auto packed = Set(d32, packed_array[mask_bits]); - alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12, - 16, 20, 24, 28}; - return packed >> Load(d32, shifts); -} - -template -HWY_INLINE Vec256 IndicesFromBits256(uint64_t mask_bits) { - const Full256 d32; - - // For 64-bit, we still need 32-bit indices because there is no 64-bit - // permutevar, but there are only 4 lanes, so we can afford to skip the - // unpacking and load the entire index vector directly. - alignas(32) static constexpr uint32_t u32_indices[128] = { - // PrintCompress64x4PairTables - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, - 10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7, - 12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7, - 10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7, - 14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5, - 10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5, - 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3, - 10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15}; - return Load(d32, u32_indices + 8 * mask_bits); -} - -template -HWY_INLINE Vec256 IndicesFromNotBits256(uint64_t mask_bits) { - const Full256 d32; - // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT - // of SetTableIndices would require 8 KiB, a large part of L1D. The other - // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles) - // and unavailable in 32-bit builds. We instead compress each index into 4 - // bits, for a total of 1 KiB. 
- alignas(16) static constexpr uint32_t packed_array[256] = { - // PrintCompressNot32x8Tables - 0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9, - 0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca, - 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9, - 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb, - 0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9, - 0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba, - 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9, - 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec, - 0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9, - 0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea, - 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9, - 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb, - 0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9, - 0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba, - 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9, - 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd, - 0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9, - 0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca, - 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9, - 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb, - 0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9, - 0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba, - 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9, - 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc, - 0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9, - 0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda, - 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9, - 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb, - 0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9, - 0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba, - 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9, - 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e, - 0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9, - 0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca, - 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9, - 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db, - 0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9, - 0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba, - 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9, - 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c, - 0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9, - 0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a, - 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98}; - - // No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31. - // Just shift each copy of the 32 bit LUT to extract its 4-bit fields. - // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing - // latency, it may be faster to use LoadDup128 and PSHUFB. 
- const Vec256 packed = Set(d32, packed_array[mask_bits]); - alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12, - 16, 20, 24, 28}; - return packed >> Load(d32, shifts); -} - -template -HWY_INLINE Vec256 IndicesFromNotBits256(uint64_t mask_bits) { - const Full256 d32; - - // For 64-bit, we still need 32-bit indices because there is no 64-bit - // permutevar, but there are only 4 lanes, so we can afford to skip the - // unpacking and load the entire index vector directly. - alignas(32) static constexpr uint32_t u32_indices[128] = { - // PrintCompressNot64x4PairTables - 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, - 8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, - 8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13, - 8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, - 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15, - 8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15, - 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15}; - return Load(d32, u32_indices + 8 * mask_bits); -} - -template -HWY_INLINE Vec256 Compress(Vec256 v, const uint64_t mask_bits) { - const DFromV d; - const Repartition du32; - - HWY_DASSERT(mask_bits < (1ull << Lanes(d))); - // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is - // no instruction for 4x64). - const Indices256 indices{IndicesFromBits256(mask_bits).raw}; - return BitCast(d, TableLookupLanes(BitCast(du32, v), indices)); -} - -// LUTs are infeasible for 2^16 possible masks, so splice together two -// half-vector Compress. -template -HWY_INLINE Vec256 Compress(Vec256 v, const uint64_t mask_bits) { - const DFromV d; - const RebindToUnsigned du; - const auto vu16 = BitCast(du, v); // (required for float16_t inputs) - const Half duh; - const auto half0 = LowerHalf(duh, vu16); - const auto half1 = UpperHalf(duh, vu16); - - const uint64_t mask_bits0 = mask_bits & 0xFF; - const uint64_t mask_bits1 = mask_bits >> 8; - const auto compressed0 = detail::CompressBits(half0, mask_bits0); - const auto compressed1 = detail::CompressBits(half1, mask_bits1); - - alignas(32) uint16_t all_true[16] = {}; - // Store mask=true lanes, left to right. - const size_t num_true0 = PopCount(mask_bits0); - Store(compressed0, duh, all_true); - StoreU(compressed1, duh, all_true + num_true0); - - if (hwy::HWY_NAMESPACE::CompressIsPartition::value) { - // Store mask=false lanes, right to left. The second vector fills the upper - // half with right-aligned false lanes. The first vector is shifted - // rightwards to overwrite the true lanes of the second. - alignas(32) uint16_t all_false[16] = {}; - const size_t num_true1 = PopCount(mask_bits1); - Store(compressed1, duh, all_false + 8); - StoreU(compressed0, duh, all_false + num_true1); - - const auto mask = FirstN(du, num_true0 + num_true1); - return BitCast(d, - IfThenElse(mask, Load(du, all_true), Load(du, all_false))); - } else { - // Only care about the mask=true lanes. - return BitCast(d, Load(du, all_true)); - } -} - -template -HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) { - const DFromV d; - const Repartition du32; - - HWY_DASSERT(mask_bits < (1ull << Lanes(d))); - // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is - // no instruction for 4x64). 
- const Indices256 indices{IndicesFromNotBits256(mask_bits).raw}; - return BitCast(d, TableLookupLanes(BitCast(du32, v), indices)); -} - -// LUTs are infeasible for 2^16 possible masks, so splice together two -// half-vector Compress. -template -HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) { - // Compress ensures only the lower 16 bits are set, so flip those. - return Compress(v, mask_bits ^ 0xFFFF); -} - -} // namespace detail - -template -HWY_API Vec256 Compress(Vec256 v, Mask256 m) { - return detail::Compress(v, detail::BitsFromMask(m)); -} - -template -HWY_API Vec256 CompressNot(Vec256 v, Mask256 m) { - return detail::CompressNot(v, detail::BitsFromMask(m)); -} - -HWY_API Vec256 CompressBlocksNot(Vec256 v, - Mask256 mask) { - return CompressNot(v, mask); -} - -template -HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { - constexpr size_t N = 32 / sizeof(T); - constexpr size_t kNumBytes = (N + 7) / 8; - - uint64_t mask_bits = 0; - CopyBytes(bits, &mask_bits); - - if (N < 8) { - mask_bits &= (1ull << N) - 1; - } - - return detail::Compress(v, mask_bits); -} - -// ------------------------------ CompressStore, CompressBitsStore - -template -HWY_API size_t CompressStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); - const size_t count = PopCount(mask_bits); - StoreU(detail::Compress(v, mask_bits), d, unaligned); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); - const size_t count = PopCount(mask_bits); - - const RebindToUnsigned du; - const Repartition du32; - HWY_DASSERT(mask_bits < (1ull << Lanes(d))); - // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is - // no instruction for 4x64). Nibble MSB encodes FirstN. - const Vec256 idx_mask = - detail::IndicesFromBits256>(mask_bits); - // Shift nibble MSB into MSB - const Mask256 mask32 = MaskFromVec(ShiftLeft<28>(idx_mask)); - // First cast to unsigned (RebindMask cannot change lane size) - const MFromD mask_u{mask32.raw}; - const MFromD mask = RebindMask(d, mask_u); - const VFromD compressed = BitCast( - d, - TableLookupLanes(BitCast(du32, v), Indices256{idx_mask.raw})); - - BlendedStore(compressed, mask, d, unaligned); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits = detail::BitsFromMask(m); - const size_t count = PopCount(mask_bits); - const VFromD compressed = detail::Compress(v, mask_bits); - -#if HWY_MEM_OPS_MIGHT_FAULT // true if HWY_IS_MSAN - // BlendedStore tests mask for each lane, but we know that the mask is - // FirstN, so we can just copy. 
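Usage sketch for the compression ops: Compress packs the mask-selected lanes to the front; CompressStore writes a whole vector's worth of data, whereas CompressBlendedStore (used here) only touches the first `count` output elements. Names are illustrative and the usual static-dispatch setup is assumed.

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Gather the positive elements of one vector to the front of `out` and
// return how many there were. Only out[0, count) is modified.
size_t KeepPositiveSketch(const float* HWY_RESTRICT in,
                          float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const auto v = hn::LoadU(d, in);
  const auto m = hn::Gt(v, hn::Zero(d));
  return hn::CompressBlendedStore(v, m, d, out);
}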
- alignas(32) TFromD buf[16]; - Store(compressed, d, buf); - CopyBytes(buf, unaligned, count * sizeof(TFromD)); -#else - BlendedStore(compressed, FirstN(d, count), d, unaligned); -#endif - return count; -} - -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - constexpr size_t N = Lanes(d); - constexpr size_t kNumBytes = (N + 7) / 8; - - uint64_t mask_bits = 0; - CopyBytes(bits, &mask_bits); - - if (N < 8) { - mask_bits &= (1ull << N) - 1; - } - const size_t count = PopCount(mask_bits); - - StoreU(detail::Compress(v, mask_bits), d, unaligned); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Expand - -// Always define Expand/LoadExpand because generic_ops only does so for Vec128. - -namespace detail { - -#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 - -HWY_INLINE Vec256 NativeExpand(Vec256 v, - Mask256 mask) { - return Vec256{_mm256_maskz_expand_epi8(mask.raw, v.raw)}; -} - -HWY_INLINE Vec256 NativeExpand(Vec256 v, - Mask256 mask) { - return Vec256{_mm256_maskz_expand_epi16(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint8_t* HWY_RESTRICT unaligned) { - return VFromD{_mm256_maskz_expandloadu_epi8(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint16_t* HWY_RESTRICT unaligned) { - return VFromD{_mm256_maskz_expandloadu_epi16(mask.raw, unaligned)}; -} - -#endif // HWY_TARGET <= HWY_AVX3_DL -#if HWY_TARGET <= HWY_AVX3 || HWY_IDE - -HWY_INLINE Vec256 NativeExpand(Vec256 v, - Mask256 mask) { - return Vec256{_mm256_maskz_expand_epi32(mask.raw, v.raw)}; -} - -HWY_INLINE Vec256 NativeExpand(Vec256 v, - Mask256 mask) { - return Vec256{_mm256_maskz_expand_epi64(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint32_t* HWY_RESTRICT unaligned) { - return VFromD{_mm256_maskz_expandloadu_epi32(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, - const uint64_t* HWY_RESTRICT unaligned) { - return VFromD{_mm256_maskz_expandloadu_epi64(mask.raw, unaligned)}; -} - -#endif // HWY_TARGET <= HWY_AVX3 - -} // namespace detail - -template -HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { - const DFromV d; -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - const RebindToUnsigned du; - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); -#else - // LUTs are infeasible for so many mask combinations, so Combine two - // half-vector Expand. - const Half dh; - const uint64_t mask_bits = detail::BitsFromMask(mask); - constexpr size_t N = 32 / sizeof(T); - const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1)); - const Mask128 maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask))); - const Vec128 expandL = Expand(LowerHalf(v), maskL); - // We have to shift the input by a variable number of bytes, but there isn't - // a table-driven option for that until VBMI, and CPUs with that likely also - // have VBMI2 and thus native Expand. - alignas(32) T lanes[N]; - Store(v, d, lanes); - const Mask128 maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask))); - const Vec128 expandH = Expand(LoadU(dh, lanes + countL), maskH); - return Combine(d, expandH, expandL); -#endif -} - -// If AVX3, this is already implemented by x86_512. 
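Expand is the inverse of Compress: consecutive source lanes are scattered to the positions where the mask is true, and the remaining lanes are zeroed. LoadExpand, defined a little further below, fuses the load on VBMI2-capable targets. A sketch with illustrative names:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void ExpandSketch(const float* HWY_RESTRICT packed,
                  const float* HWY_RESTRICT reference,
                  float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  const auto m = hn::Gt(hn::LoadU(d, reference), hn::Zero(d));
  const auto v = hn::LoadU(d, packed);   // first CountTrue(d, m) lanes are used
  hn::StoreU(hn::Expand(v, m), d, out);  // lanes with a false mask become zero
}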
-#if HWY_TARGET != HWY_AVX3 - -template -HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { - const Full256 d; -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - const RebindToUnsigned du; - return BitCast(d, detail::NativeExpand(BitCast(du, v), RebindMask(du, mask))); -#else // AVX2 - // LUTs are infeasible for 2^16 possible masks, so splice together two - // half-vector Expand. - const Half dh; - const Mask128 maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask))); - const Vec128 expandL = Expand(LowerHalf(v), maskL); - // We have to shift the input by a variable number of u16. permutevar_epi16 - // requires AVX3 and if we had that, we'd use native u32 Expand. The only - // alternative is re-loading, which incurs a store to load forwarding stall. - alignas(32) T lanes[32 / sizeof(T)]; - Store(v, d, lanes); - const Vec128 vH = LoadU(dh, lanes + CountTrue(dh, maskL)); - const Mask128 maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask))); - const Vec128 expandH = Expand(vH, maskH); - return Combine(d, expandH, expandL); -#endif // AVX2 -} - -#endif // HWY_TARGET != HWY_AVX3 - -template -HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { - const Full256 d; -#if HWY_TARGET <= HWY_AVX3 - const RebindToUnsigned du; - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); -#else - const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); - - alignas(16) constexpr uint32_t packed_array[256] = { - // PrintExpand32x8Nibble. - 0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0, - 0xfffff10f, 0xfffff210, 0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10, - 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210, 0xfff0ffff, 0xfff1fff0, - 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210, - 0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0, - 0xfff3210f, 0xfff43210, 0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10, - 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210, 0xff1f0fff, 0xff2f1ff0, - 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210, - 0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0, - 0xff32f10f, 0xff43f210, 0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10, - 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210, 0xf0ffffff, 0xf1fffff0, - 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210, - 0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0, - 0xf3ff210f, 0xf4ff3210, 0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10, - 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210, 0xf2f10fff, 0xf3f21ff0, - 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210, - 0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0, - 0xf32ff10f, 0xf43ff210, 0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10, - 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210, 0xf210ffff, 0xf321fff0, - 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210, - 0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0, - 0xf543210f, 0xf6543210, 0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10, - 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210, 0x1fff0fff, 0x2fff1ff0, - 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210, - 0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0, - 0x3ff2f10f, 0x4ff3f210, 0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10, - 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210, 0x1f0fffff, 0x2f1ffff0, - 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 
0x3f2ff10f, 0x4f3ff210, - 0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0, - 0x4f3f210f, 0x5f4f3210, 0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10, - 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210, 0x3f210fff, 0x4f321ff0, - 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210, - 0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0, - 0x32fff10f, 0x43fff210, 0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10, - 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210, 0x21f0ffff, 0x32f1fff0, - 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210, - 0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0, - 0x54f3210f, 0x65f43210, 0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10, - 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210, 0x321f0fff, 0x432f1ff0, - 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210, - 0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0, - 0x5432f10f, 0x6543f210, 0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10, - 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210, - }; - - // For lane i, shift the i-th 4-bit index down to bits [0, 3). - const Vec256 packed = Set(du, packed_array[mask_bits]); - alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; - // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec. - const Indices256 indices{(packed >> Load(du, shifts)).raw}; - const Vec256 expand = TableLookupLanes(BitCast(du, v), indices); - // TableLookupLanes cannot also zero masked-off lanes, so do that now. - return IfThenElseZero(mask, BitCast(d, expand)); -#endif -} - -template -HWY_API Vec256 Expand(Vec256 v, Mask256 mask) { - const Full256 d; -#if HWY_TARGET <= HWY_AVX3 - const RebindToUnsigned du; - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); -#else - const RebindToUnsigned du; - const uint64_t mask_bits = detail::BitsFromMask(mask); - - alignas(16) constexpr uint64_t packed_array[16] = { - // PrintExpand64x4Nibble. - 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, - 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, - 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; - - // For lane i, shift the i-th 4-bit index down to bits [0, 2). - const Vec256 packed = Set(du, packed_array[mask_bits]); - alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28}; -#if HWY_TARGET <= HWY_AVX3 // native 64-bit TableLookupLanes - // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec. - const Indices256 indices{(packed >> Load(du, shifts)).raw}; -#else - // 64-bit TableLookupLanes on AVX2 requires IndicesFromVec, which checks - // bounds, so clear the upper bits. - const Vec256 masked = And(packed >> Load(du, shifts), Set(du, 3)); - const Indices256 indices = IndicesFromVec(du, masked); -#endif - const Vec256 expand = TableLookupLanes(BitCast(du, v), indices); - // TableLookupLanes cannot also zero masked-off lanes, so do that now. 
- return IfThenElseZero(mask, BitCast(d, expand)); -#endif -} - -// ------------------------------ LoadExpand - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - const RebindToUnsigned du; - using TU = TFromD; - const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); -#else - return Expand(LoadU(d, unaligned), mask); -#endif -} - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET <= HWY_AVX3 - const RebindToUnsigned du; - using TU = TFromD; - const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); -#else - return Expand(LoadU(d, unaligned), mask); -#endif -} - -// ------------------------------ LoadInterleaved3/4 - -// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4. - -namespace detail { -// Input: -// 1 0 (<- first block of unaligned) -// 3 2 -// 5 4 -// Output: -// 3 0 -// 4 1 -// 5 2 -template -HWY_API void LoadTransposedBlocks3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& A, VFromD& B, VFromD& C) { - constexpr size_t N = Lanes(d); - const VFromD v10 = LoadU(d, unaligned + 0 * N); // 1 0 - const VFromD v32 = LoadU(d, unaligned + 1 * N); - const VFromD v54 = LoadU(d, unaligned + 2 * N); - - A = ConcatUpperLower(d, v32, v10); - B = ConcatLowerUpper(d, v54, v10); - C = ConcatUpperLower(d, v54, v32); -} - -// Input (128-bit blocks): -// 1 0 (first block of unaligned) -// 3 2 -// 5 4 -// 7 6 -// Output: -// 4 0 (LSB of vA) -// 5 1 -// 6 2 -// 7 3 -template -HWY_API void LoadTransposedBlocks4(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& vA, VFromD& vB, VFromD& vC, - VFromD& vD) { - constexpr size_t N = Lanes(d); - const VFromD v10 = LoadU(d, unaligned + 0 * N); - const VFromD v32 = LoadU(d, unaligned + 1 * N); - const VFromD v54 = LoadU(d, unaligned + 2 * N); - const VFromD v76 = LoadU(d, unaligned + 3 * N); - - vA = ConcatLowerLower(d, v54, v10); - vB = ConcatUpperUpper(d, v54, v10); - vC = ConcatLowerLower(d, v76, v32); - vD = ConcatUpperUpper(d, v76, v32); -} -} // namespace detail - -// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower) - -// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. 
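The block-transpose overloads in this section only rearrange whole 128-bit blocks; generic_ops supplies the per-lane shuffles on top of them. End-user code goes through LoadInterleaved3 / StoreInterleaved3, roughly like this (sketch; names are illustrative):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// De-interleave packed RGB bytes into three planar vectors and re-interleave.
void RgbRoundTripSketch(const uint8_t* HWY_RESTRICT rgb,
                        uint8_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint8_t> d;
  hn::Vec<decltype(d)> r, g, b;
  hn::LoadInterleaved3(d, rgb, r, g, b);
  hn::StoreInterleaved3(r, g, b, d, out);
}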
- -namespace detail { -// Input (128-bit blocks): -// 2 0 (LSB of i) -// 3 1 -// Output: -// 1 0 -// 3 2 -template -HWY_API void StoreTransposedBlocks2(VFromD i, VFromD j, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t N = Lanes(d); - const auto out0 = ConcatLowerLower(d, j, i); - const auto out1 = ConcatUpperUpper(d, j, i); - StoreU(out0, d, unaligned + 0 * N); - StoreU(out1, d, unaligned + 1 * N); -} - -// Input (128-bit blocks): -// 3 0 (LSB of i) -// 4 1 -// 5 2 -// Output: -// 1 0 -// 3 2 -// 5 4 -template -HWY_API void StoreTransposedBlocks3(VFromD i, VFromD j, VFromD k, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t N = Lanes(d); - const auto out0 = ConcatLowerLower(d, j, i); - const auto out1 = ConcatUpperLower(d, i, k); - const auto out2 = ConcatUpperUpper(d, k, j); - StoreU(out0, d, unaligned + 0 * N); - StoreU(out1, d, unaligned + 1 * N); - StoreU(out2, d, unaligned + 2 * N); -} - -// Input (128-bit blocks): -// 4 0 (LSB of i) -// 5 1 -// 6 2 -// 7 3 -// Output: -// 1 0 -// 3 2 -// 5 4 -// 7 6 -template -HWY_API void StoreTransposedBlocks4(VFromD i, VFromD j, VFromD k, - VFromD l, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t N = Lanes(d); - // Write lower halves, then upper. - const auto out0 = ConcatLowerLower(d, j, i); - const auto out1 = ConcatLowerLower(d, l, k); - StoreU(out0, d, unaligned + 0 * N); - StoreU(out1, d, unaligned + 1 * N); - const auto out2 = ConcatUpperUpper(d, j, i); - const auto out3 = ConcatUpperUpper(d, l, k); - StoreU(out2, d, unaligned + 2 * N); - StoreU(out3, d, unaligned + 3 * N); -} -} // namespace detail - -// ------------------------------ Additional mask logical operations - -#if HWY_TARGET <= HWY_AVX3 -template -HWY_API Mask256 SetAtOrAfterFirst(Mask256 mask) { - constexpr size_t N = Lanes(Full256()); - constexpr uint32_t kActiveElemMask = - static_cast((uint64_t{1} << N) - 1); - return Mask256{static_cast::Raw>( - (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)}; -} -template -HWY_API Mask256 SetBeforeFirst(Mask256 mask) { - constexpr size_t N = Lanes(Full256()); - constexpr uint32_t kActiveElemMask = - static_cast((uint64_t{1} << N) - 1); - return Mask256{static_cast::Raw>( - (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)}; -} -template -HWY_API Mask256 SetAtOrBeforeFirst(Mask256 mask) { - constexpr size_t N = Lanes(Full256()); - constexpr uint32_t kActiveElemMask = - static_cast((uint64_t{1} << N) - 1); - return Mask256{static_cast::Raw>( - detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)}; -} -template -HWY_API Mask256 SetOnlyFirst(Mask256 mask) { - return Mask256{ - static_cast::Raw>(detail::AVX3Blsi(mask.raw))}; -} -#else // AVX2 -template -HWY_API Mask256 SetAtOrAfterFirst(Mask256 mask) { - const Full256 d; - const Repartition di64; - const Repartition df32; - const Repartition di32; - const Half dh_i64; - const Half dh_i32; - using VF32 = VFromD; - - auto vmask = BitCast(di64, VecFromMask(d, mask)); - vmask = Or(vmask, Neg(vmask)); - - // Copy the sign bit of the even int64_t lanes to the odd int64_t lanes - const auto vmask2 = BitCast( - di32, VF32{_mm256_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw, - _MM_SHUFFLE(1, 1, 0, 0))}); - vmask = Or(vmask, BitCast(di64, BroadcastSignBit(vmask2))); - - // Copy the sign bit of the lower 128-bit half to the upper 128-bit half - const auto vmask3 = - BroadcastSignBit(Broadcast<3>(BitCast(dh_i32, LowerHalf(dh_i64, vmask)))); - vmask = Or(vmask, BitCast(di64, Combine(di32, vmask3, Zero(dh_i32)))); - return MaskFromVec(BitCast(d, vmask)); -} - 
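The mask scan helpers in this section (SetOnlyFirst, SetBeforeFirst, SetAtOrAfterFirst, SetAtOrBeforeFirst) derive new masks relative to the first set lane, which is handy for isolating the first match of a search. Sketch with illustrative names:

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

void FirstMatchSketch(const uint8_t* HWY_RESTRICT haystack, uint8_t needle) {
  const hn::ScalableTag<uint8_t> d;
  const auto eq = hn::Eq(hn::LoadU(d, haystack), hn::Set(d, needle));
  const auto only_first = hn::SetOnlyFirst(eq);        // just the first match
  const auto before = hn::SetBeforeFirst(eq);          // lanes before it
  const auto at_or_after = hn::SetAtOrAfterFirst(eq);  // it and all later lanes
  (void)only_first;
  (void)before;
  (void)at_or_after;
}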
-template -HWY_API Mask256 SetBeforeFirst(Mask256 mask) { - return Not(SetAtOrAfterFirst(mask)); -} - -template -HWY_API Mask256 SetOnlyFirst(Mask256 mask) { - const Full256 d; - const RebindToSigned di; - const Repartition di64; - const Half dh_i64; - - const auto zero = Zero(di64); - const auto vmask = BitCast(di64, VecFromMask(d, mask)); - - const auto vmask_eq_0 = VecFromMask(di64, vmask == zero); - auto vmask2_lo = LowerHalf(dh_i64, vmask_eq_0); - auto vmask2_hi = UpperHalf(dh_i64, vmask_eq_0); - - vmask2_lo = And(vmask2_lo, InterleaveLower(vmask2_lo, vmask2_lo)); - vmask2_hi = And(ConcatLowerUpper(dh_i64, vmask2_hi, vmask2_lo), - InterleaveUpper(dh_i64, vmask2_lo, vmask2_lo)); - vmask2_lo = InterleaveLower(Set(dh_i64, int64_t{-1}), vmask2_lo); - - const auto vmask2 = Combine(di64, vmask2_hi, vmask2_lo); - const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); - return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); -} - -template -HWY_API Mask256 SetAtOrBeforeFirst(Mask256 mask) { - const Full256 d; - constexpr size_t kLanesPerBlock = MaxLanes(d) / 2; - - const auto vmask = VecFromMask(d, mask); - const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d)); - return SetBeforeFirst( - MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>( - d, vmask, vmask_lo))); -} -#endif // HWY_TARGET <= HWY_AVX3 - -// ------------------------------ Reductions - -namespace detail { - -// These functions start with each lane per 128-bit block being reduced with the -// corresponding lane in the other block, so we use the same logic as x86_128 -// but running on both blocks at the same time. There are two (64-bit) to eight -// (16-bit) lanes per block. -template -HWY_INLINE Vec256 SumOfLanes(Vec256 v10) { - const DFromV d; - return Add(v10, Reverse2(d, v10)); -} -template -HWY_INLINE Vec256 MinOfLanes(Vec256 v10) { - const DFromV d; - return Min(v10, Reverse2(d, v10)); -} -template -HWY_INLINE Vec256 MaxOfLanes(Vec256 v10) { - const DFromV d; - return Max(v10, Reverse2(d, v10)); -} - -template -HWY_INLINE Vec256 SumOfLanes(Vec256 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Add(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Add(v03_12_12_03, v12_03_03_12); -} -template -HWY_INLINE Vec256 MinOfLanes(Vec256 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Min(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Min(v03_12_12_03, v12_03_03_12); -} -template -HWY_INLINE Vec256 MaxOfLanes(Vec256 v3210) { - using V = decltype(v3210); - const DFromV d; - const V v0123 = Reverse4(d, v3210); - const V v03_12_12_03 = Max(v3210, v0123); - const V v12_03_03_12 = Reverse2(d, v03_12_12_03); - return Max(v03_12_12_03, v12_03_03_12); -} - -template -HWY_INLINE Vec256 SumOfLanes(Vec256 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. - const V v34_25_16_07 = Add(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Add(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Add(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} -template -HWY_INLINE Vec256 MinOfLanes(Vec256 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. 
- const V v34_25_16_07 = Min(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Min(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Min(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} -template -HWY_INLINE Vec256 MaxOfLanes(Vec256 v76543210) { - using V = decltype(v76543210); - const DFromV d; - // The upper half is reversed from the lower half; omit for brevity. - const V v34_25_16_07 = Max(v76543210, Reverse8(d, v76543210)); - const V v0347_1625_1625_0347 = Max(v34_25_16_07, Reverse4(d, v34_25_16_07)); - return Max(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); -} - -} // namespace detail - -// Supported for >8-bit types. Returns the broadcasted result. -template -HWY_API VFromD SumOfLanes(D /*d*/, VFromD vHL) { - const VFromD vLH = SwapAdjacentBlocks(vHL); - return detail::SumOfLanes(Add(vLH, vHL)); -} -template -HWY_API TFromD ReduceSum(D d, VFromD v) { - return GetLane(SumOfLanes(d, v)); -} -#if HWY_HAVE_FLOAT16 -template -HWY_API float16_t ReduceSum(D, VFromD v) { - return _mm256_reduce_add_ph(v.raw); -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API VFromD MinOfLanes(D /*d*/, VFromD vHL) { - const VFromD vLH = SwapAdjacentBlocks(vHL); - return detail::MinOfLanes(Min(vLH, vHL)); -} -template -HWY_API VFromD MaxOfLanes(D /*d*/, VFromD vHL) { - const VFromD vLH = SwapAdjacentBlocks(vHL); - return detail::MaxOfLanes(Max(vLH, vHL)); -} - -// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex - -#if HWY_TARGET <= HWY_AVX3 -template ), HWY_IF_V_SIZE_D(DFromV, 32)> -HWY_API V LeadingZeroCount(V v) { - return V{_mm256_lzcnt_epi32(v.raw)}; -} - -template ), HWY_IF_V_SIZE_D(DFromV, 32)> -HWY_API V LeadingZeroCount(V v) { - return V{_mm256_lzcnt_epi64(v.raw)}; -} -#endif // HWY_TARGET <= HWY_AVX3 - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); - -// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - -// the warning seems to be issued at the call site of intrinsics, i.e. our code. -HWY_DIAGNOSTICS(pop) diff --git a/deps/highway/include/hwy/ops/x86_512-inl.h b/deps/highway/include/hwy/ops/x86_512-inl.h deleted file mode 100644 index 189d58dc..00000000 --- a/deps/highway/include/hwy/ops/x86_512-inl.h +++ /dev/null @@ -1,6733 +0,0 @@ -// Copyright 2019 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// 512-bit AVX512 vectors and operations. -// External include guard in highway.h - see comment there. - -// WARNING: most operations do not cross 128-bit block boundaries. In -// particular, "Broadcast", pack and zip behavior may be surprising. 
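// Illustrative sketch (standalone, not from this header): a scalar model of
// the block-wise behavior the warning above refers to. With 16 u32 lanes per
// 512-bit vector and 4 lanes per 128-bit block, a per-block broadcast of lane
// 0 repeats lane 0 of *each block*, not lane 0 of the whole vector. The
// function name is hypothetical and only for illustration.
#include <array>
#include <cstddef>
#include <cstdint>

std::array<uint32_t, 16> BroadcastLane0PerBlock(std::array<uint32_t, 16> v) {
  constexpr size_t kLanesPerBlock = 4;  // 128 bits / 32-bit lanes
  for (size_t i = 0; i < v.size(); ++i) {
    v[i] = v[i - (i % kLanesPerBlock)];  // first lane of the enclosing block
  }
  return v;
}
// For input {0, 1, ..., 15} this yields {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12},
// i.e. each 128-bit block is handled independently.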
- -// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL -#include "hwy/base.h" - -// Avoid uninitialized warnings in GCC's avx512fintrin.h - see -// https://github.com/google/highway/issues/710) -HWY_DIAGNOSTICS(push) -#if HWY_COMPILER_GCC_ACTUAL -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") -HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494, - ignored "-Wmaybe-uninitialized") -#endif - -#include // AVX2+ - -#if HWY_COMPILER_CLANGCL -// Including should be enough, but Clang's headers helpfully skip -// including these headers when _MSC_VER is defined, like when using clang-cl. -// Include these directly here. -// clang-format off -#include - -#include -// avxintrin defines __m256i and must come before avx2intrin. -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#if HWY_TARGET <= HWY_AVX3_DL -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -// Must come after avx512fintrin, else will not define 512-bit intrinsics. -#include -#include -#include -#endif // HWY_TARGET <= HWY_AVX3_DL - -#if HWY_TARGET <= HWY_AVX3_SPR -#include -#include -#endif // HWY_TARGET <= HWY_AVX3_SPR - -// clang-format on -#endif // HWY_COMPILER_CLANGCL - -// For half-width vectors. Already includes base.h and shared-inl.h. -#include "hwy/ops/x86_256-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -namespace detail { - -template -struct Raw512 { - using type = __m512i; -}; -#if HWY_HAVE_FLOAT16 -template <> -struct Raw512 { - using type = __m512h; -}; -#endif // HWY_HAVE_FLOAT16 -template <> -struct Raw512 { - using type = __m512; -}; -template <> -struct Raw512 { - using type = __m512d; -}; - -// Template arg: sizeof(lane type) -template -struct RawMask512 {}; -template <> -struct RawMask512<1> { - using type = __mmask64; -}; -template <> -struct RawMask512<2> { - using type = __mmask32; -}; -template <> -struct RawMask512<4> { - using type = __mmask16; -}; -template <> -struct RawMask512<8> { - using type = __mmask8; -}; - -} // namespace detail - -template -class Vec512 { - using Raw = typename detail::Raw512::type; - - public: - using PrivateT = T; // only for DFromV - static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromV - - // Compound assignment. Only usable if there is a corresponding non-member - // binary operator overload. For example, only f32 and f64 support division. - HWY_INLINE Vec512& operator*=(const Vec512 other) { - return *this = (*this * other); - } - HWY_INLINE Vec512& operator/=(const Vec512 other) { - return *this = (*this / other); - } - HWY_INLINE Vec512& operator+=(const Vec512 other) { - return *this = (*this + other); - } - HWY_INLINE Vec512& operator-=(const Vec512 other) { - return *this = (*this - other); - } - HWY_INLINE Vec512& operator&=(const Vec512 other) { - return *this = (*this & other); - } - HWY_INLINE Vec512& operator|=(const Vec512 other) { - return *this = (*this | other); - } - HWY_INLINE Vec512& operator^=(const Vec512 other) { - return *this = (*this ^ other); - } - - Raw raw; -}; - -// Mask register: one bit per lane. 
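// For 64-byte vectors the mask width therefore equals the lane count (see
// RawMask512 above): sizeof(T) == 1 -> 64 lanes -> __mmask64,
// sizeof(T) == 2 -> 32 lanes -> __mmask32, sizeof(T) == 4 -> 16 lanes ->
// __mmask16, sizeof(T) == 8 -> 8 lanes -> __mmask8.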
-template -struct Mask512 { - using Raw = typename detail::RawMask512::type; - Raw raw; -}; - -template -using Full512 = Simd; - -// ------------------------------ BitCast - -namespace detail { - -HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; } -#if HWY_HAVE_FLOAT16 -HWY_INLINE __m512i BitCastToInteger(__m512h v) { - return _mm512_castph_si512(v); -} -#endif // HWY_HAVE_FLOAT16 -HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); } -HWY_INLINE __m512i BitCastToInteger(__m512d v) { - return _mm512_castpd_si512(v); -} - -template -HWY_INLINE Vec512 BitCastToByte(Vec512 v) { - return Vec512{BitCastToInteger(v.raw)}; -} - -// Cannot rely on function overloading because return types differ. -template -struct BitCastFromInteger512 { - HWY_INLINE __m512i operator()(__m512i v) { return v; } -}; -#if HWY_HAVE_FLOAT16 -template <> -struct BitCastFromInteger512 { - HWY_INLINE __m512h operator()(__m512i v) { return _mm512_castsi512_ph(v); } -}; -#endif // HWY_HAVE_FLOAT16 -template <> -struct BitCastFromInteger512 { - HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); } -}; -template <> -struct BitCastFromInteger512 { - HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); } -}; - -template -HWY_INLINE VFromD BitCastFromByte(D /* tag */, Vec512 v) { - return VFromD{BitCastFromInteger512>()(v.raw)}; -} - -} // namespace detail - -template -HWY_API VFromD BitCast(D d, Vec512 v) { - return detail::BitCastFromByte(d, detail::BitCastToByte(v)); -} - -// ------------------------------ Set - -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm512_set1_epi8(static_cast(t))}; // NOLINT -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm512_set1_epi16(static_cast(t))}; // NOLINT -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm512_set1_epi32(static_cast(t))}; -} -template -HWY_API VFromD Set(D /* tag */, TFromD t) { - return VFromD{_mm512_set1_epi64(static_cast(t))}; // NOLINT -} -// bfloat16_t is handled by x86_128-inl.h. -#if HWY_HAVE_FLOAT16 -template -HWY_API Vec512 Set(D /* tag */, float16_t t) { - return Vec512{_mm512_set1_ph(t)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec512 Set(D /* tag */, float t) { - return Vec512{_mm512_set1_ps(t)}; -} -template -HWY_API Vec512 Set(D /* tag */, double t) { - return Vec512{_mm512_set1_pd(t)}; -} - -// ------------------------------ Zero (Set) - -// GCC pre-9.1 lacked setzero, so use Set instead. -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 - -// Cannot use VFromD here because it is defined in terms of Zero. -template -HWY_API Vec512> Zero(D d) { - return Set(d, TFromD{0}); -} -// BitCast is defined below, but the Raw type is the same, so use that. 
-template -HWY_API Vec512 Zero(D /* tag */) { - const RebindToUnsigned du; - return Vec512{Set(du, 0).raw}; -} -template -HWY_API Vec512 Zero(D /* tag */) { - const RebindToUnsigned du; - return Vec512{Set(du, 0).raw}; -} - -#else - -template -HWY_API Vec512> Zero(D /* tag */) { - return Vec512>{_mm512_setzero_si512()}; -} -template -HWY_API Vec512 Zero(D /* tag */) { - return Vec512{_mm512_setzero_si512()}; -} -template -HWY_API Vec512 Zero(D /* tag */) { -#if HWY_HAVE_FLOAT16 - return Vec512{_mm512_setzero_ph()}; -#else - return Vec512{_mm512_setzero_si512()}; -#endif -} -template -HWY_API Vec512 Zero(D /* tag */) { - return Vec512{_mm512_setzero_ps()}; -} -template -HWY_API Vec512 Zero(D /* tag */) { - return Vec512{_mm512_setzero_pd()}; -} - -#endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 - -// ------------------------------ Undefined - -HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") - -// Returns a vector with uninitialized elements. -template -HWY_API Vec512> Undefined(D /* tag */) { - // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC - // generate an XOR instruction. - return Vec512>{_mm512_undefined_epi32()}; -} -template -HWY_API Vec512 Undefined(D /* tag */) { - return Vec512{_mm512_undefined_epi32()}; -} -template -HWY_API Vec512 Undefined(D /* tag */) { -#if HWY_HAVE_FLOAT16 - return Vec512{_mm512_undefined_ph()}; -#else - return Vec512{_mm512_undefined_epi32()}; -#endif -} -template -HWY_API Vec512 Undefined(D /* tag */) { - return Vec512{_mm512_undefined_ps()}; -} -template -HWY_API Vec512 Undefined(D /* tag */) { - return Vec512{_mm512_undefined_pd()}; -} - -HWY_DIAGNOSTICS(pop) - -// ------------------------------ ResizeBitCast - -// 64-byte vector to 16-byte vector -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - return BitCast(d, Vec128{_mm512_castsi512_si128( - BitCast(Full512(), v).raw)}); -} - -// <= 16-byte vector to 64-byte vector -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - return BitCast(d, Vec512{_mm512_castsi128_si512( - ResizeBitCast(Full128(), v).raw)}); -} - -// 32-byte vector to 64-byte vector -template -HWY_API VFromD ResizeBitCast(D d, FromV v) { - return BitCast(d, Vec512{_mm512_castsi256_si512( - BitCast(Full256(), v).raw)}); -} - -// ----------------------------- Iota - -namespace detail { - -template -HWY_INLINE VFromD Iota0(D d) { -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 - // Missing set_epi8/16. 
- alignas(64) static constexpr TFromD kIota[64] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; - return Load(d, kIota); -#else - (void)d; - return VFromD{_mm512_set_epi8( - static_cast(63), static_cast(62), static_cast(61), - static_cast(60), static_cast(59), static_cast(58), - static_cast(57), static_cast(56), static_cast(55), - static_cast(54), static_cast(53), static_cast(52), - static_cast(51), static_cast(50), static_cast(49), - static_cast(48), static_cast(47), static_cast(46), - static_cast(45), static_cast(44), static_cast(43), - static_cast(42), static_cast(41), static_cast(40), - static_cast(39), static_cast(38), static_cast(37), - static_cast(36), static_cast(35), static_cast(34), - static_cast(33), static_cast(32), static_cast(31), - static_cast(30), static_cast(29), static_cast(28), - static_cast(27), static_cast(26), static_cast(25), - static_cast(24), static_cast(23), static_cast(22), - static_cast(21), static_cast(20), static_cast(19), - static_cast(18), static_cast(17), static_cast(16), - static_cast(15), static_cast(14), static_cast(13), - static_cast(12), static_cast(11), static_cast(10), - static_cast(9), static_cast(8), static_cast(7), - static_cast(6), static_cast(5), static_cast(4), - static_cast(3), static_cast(2), static_cast(1), - static_cast(0))}; -#endif -} - -template -HWY_INLINE VFromD Iota0(D d) { -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 900 - // Missing set_epi8/16. - alignas(64) static constexpr TFromD kIota[32] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - return Load(d, kIota); -#else - (void)d; - return VFromD{_mm512_set_epi16( - int16_t{31}, int16_t{30}, int16_t{29}, int16_t{28}, int16_t{27}, - int16_t{26}, int16_t{25}, int16_t{24}, int16_t{23}, int16_t{22}, - int16_t{21}, int16_t{20}, int16_t{19}, int16_t{18}, int16_t{17}, - int16_t{16}, int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, - int16_t{11}, int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, - int16_t{5}, int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})}; -#endif -} - -#if HWY_HAVE_FLOAT16 -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm512_set_ph( - float16_t{31}, float16_t{30}, float16_t{29}, float16_t{28}, float16_t{27}, - float16_t{26}, float16_t{25}, float16_t{24}, float16_t{23}, float16_t{22}, - float16_t{21}, float16_t{20}, float16_t{19}, float16_t{18}, float16_t{17}, - float16_t{16}, float16_t{15}, float16_t{14}, float16_t{13}, float16_t{12}, - float16_t{11}, float16_t{10}, float16_t{9}, float16_t{8}, float16_t{7}, - float16_t{6}, float16_t{5}, float16_t{4}, float16_t{3}, float16_t{2}, - float16_t{1}, float16_t{0})}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm512_set_epi32( - int32_t{15}, int32_t{14}, int32_t{13}, int32_t{12}, int32_t{11}, - int32_t{10}, int32_t{9}, int32_t{8}, int32_t{7}, int32_t{6}, int32_t{5}, - int32_t{4}, int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm512_set_epi64(int64_t{7}, int64_t{6}, int64_t{5}, - int64_t{4}, int64_t{3}, int64_t{2}, - int64_t{1}, int64_t{0})}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 
- 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, - 0.0f)}; -} - -template -HWY_INLINE VFromD Iota0(D /*d*/) { - return VFromD{_mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)}; -} - -} // namespace detail - -template -HWY_API VFromD Iota(D d, const T2 first) { - return detail::Iota0(d) + Set(d, static_cast>(first)); -} - -// ================================================== LOGICAL - -// ------------------------------ Not - -template -HWY_API Vec512 Not(const Vec512 v) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m512i vu = BitCast(du, v).raw; - return BitCast(d, VU{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)}); -} - -// ------------------------------ And - -template -HWY_API Vec512 And(const Vec512 a, const Vec512 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm512_and_si512(a.raw, b.raw)}); -} - -HWY_API Vec512 And(const Vec512 a, const Vec512 b) { - return Vec512{_mm512_and_ps(a.raw, b.raw)}; -} -HWY_API Vec512 And(const Vec512 a, const Vec512 b) { - return Vec512{_mm512_and_pd(a.raw, b.raw)}; -} - -// ------------------------------ AndNot - -// Returns ~not_mask & mask. -template -HWY_API Vec512 AndNot(const Vec512 not_mask, const Vec512 mask) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast( - d, VFromD{_mm512_andnot_si512(not_mask.raw, mask.raw)}); -} -HWY_API Vec512 AndNot(const Vec512 not_mask, - const Vec512 mask) { - return Vec512{_mm512_andnot_ps(not_mask.raw, mask.raw)}; -} -HWY_API Vec512 AndNot(const Vec512 not_mask, - const Vec512 mask) { - return Vec512{_mm512_andnot_pd(not_mask.raw, mask.raw)}; -} - -// ------------------------------ Or - -template -HWY_API Vec512 Or(const Vec512 a, const Vec512 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm512_or_si512(a.raw, b.raw)}); -} - -HWY_API Vec512 Or(const Vec512 a, const Vec512 b) { - return Vec512{_mm512_or_ps(a.raw, b.raw)}; -} -HWY_API Vec512 Or(const Vec512 a, const Vec512 b) { - return Vec512{_mm512_or_pd(a.raw, b.raw)}; -} - -// ------------------------------ Xor - -template -HWY_API Vec512 Xor(const Vec512 a, const Vec512 b) { - const DFromV d; // for float16_t - const RebindToUnsigned du; - return BitCast(d, VFromD{_mm512_xor_si512(a.raw, b.raw)}); -} - -HWY_API Vec512 Xor(const Vec512 a, const Vec512 b) { - return Vec512{_mm512_xor_ps(a.raw, b.raw)}; -} -HWY_API Vec512 Xor(const Vec512 a, const Vec512 b) { - return Vec512{_mm512_xor_pd(a.raw, b.raw)}; -} - -// ------------------------------ Xor3 -template -HWY_API Vec512 Xor3(Vec512 x1, Vec512 x2, Vec512 x3) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m512i ret = _mm512_ternarylogic_epi64( - BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); - return BitCast(d, VU{ret}); -} - -// ------------------------------ Or3 -template -HWY_API Vec512 Or3(Vec512 o1, Vec512 o2, Vec512 o3) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m512i ret = _mm512_ternarylogic_epi64( - BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); - return BitCast(d, VU{ret}); -} - -// ------------------------------ OrAnd -template -HWY_API Vec512 OrAnd(Vec512 o, Vec512 a1, Vec512 a2) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const __m512i ret = _mm512_ternarylogic_epi64( - BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); - return BitCast(d, VU{ret}); -} - -// 
------------------------------ IfVecThenElse -template -HWY_API Vec512 IfVecThenElse(Vec512 mask, Vec512 yes, Vec512 no) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw, - BitCast(du, yes).raw, - BitCast(du, no).raw, 0xCA)}); -} - -// ------------------------------ Operator overloads (internal-only if float) - -template -HWY_API Vec512 operator&(const Vec512 a, const Vec512 b) { - return And(a, b); -} - -template -HWY_API Vec512 operator|(const Vec512 a, const Vec512 b) { - return Or(a, b); -} - -template -HWY_API Vec512 operator^(const Vec512 a, const Vec512 b) { - return Xor(a, b); -} - -// ------------------------------ PopulationCount - -// 8/16 require BITALG, 32/64 require VPOPCNTDQ. -#if HWY_TARGET <= HWY_AVX3_DL - -#ifdef HWY_NATIVE_POPCNT -#undef HWY_NATIVE_POPCNT -#else -#define HWY_NATIVE_POPCNT -#endif - -namespace detail { - -template -HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<1> /* tag */, Vec512 v) { - return Vec512{_mm512_popcnt_epi8(v.raw)}; -} -template -HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<2> /* tag */, Vec512 v) { - return Vec512{_mm512_popcnt_epi16(v.raw)}; -} -template -HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<4> /* tag */, Vec512 v) { - return Vec512{_mm512_popcnt_epi32(v.raw)}; -} -template -HWY_INLINE Vec512 PopulationCount(hwy::SizeTag<8> /* tag */, Vec512 v) { - return Vec512{_mm512_popcnt_epi64(v.raw)}; -} - -} // namespace detail - -template -HWY_API Vec512 PopulationCount(Vec512 v) { - return detail::PopulationCount(hwy::SizeTag(), v); -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -// ================================================== MASK - -// ------------------------------ FirstN - -// Possibilities for constructing a bitmask of N ones: -// - kshift* only consider the lowest byte of the shift count, so they would -// not correctly handle large n. -// - Scalar shifts >= 64 are UB. -// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However, -// we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds. - -#if HWY_ARCH_X86_32 -namespace detail { - -// 32 bit mask is sufficient for lane size >= 2. -template -HWY_INLINE Mask512 FirstN(size_t n) { - Mask512 m; - const uint32_t all = ~uint32_t{0}; - // BZHI only looks at the lower 8 bits of n! - m.raw = static_cast((n > 255) ? all : _bzhi_u32(all, n)); - return m; -} - -#if HWY_COMPILER_MSVC >= 1920 || HWY_COMPILER_GCC_ACTUAL >= 900 || \ - HWY_COMPILER_CLANG || HWY_COMPILER_ICC -template -HWY_INLINE Mask512 FirstN(size_t n) { - uint32_t lo_mask; - uint32_t hi_mask; - uint32_t hi_mask_len; -#if HWY_COMPILER_GCC - if (__builtin_constant_p(n >= 32) && n >= 32) { - if (__builtin_constant_p(n >= 64) && n >= 64) { - hi_mask_len = 32u; - } else { - hi_mask_len = ((n <= 287) ? static_cast(n) : 287u) - 32u; - } - lo_mask = hi_mask = 0xFFFFFFFFu; - } else // NOLINT(readability/braces) -#endif - { - const uint32_t lo_mask_len = (n <= 255) ? 
static_cast(n) : 255u; - lo_mask = _bzhi_u32(0xFFFFFFFFu, lo_mask_len); - -#if HWY_COMPILER_GCC - if (__builtin_constant_p(lo_mask_len <= 32) && lo_mask_len <= 32) { - return Mask512{static_cast<__mmask64>(lo_mask)}; - } -#endif - - _addcarry_u32(_subborrow_u32(0, lo_mask_len, 32u, &hi_mask_len), - 0xFFFFFFFFu, 0u, &hi_mask); - } - hi_mask = _bzhi_u32(hi_mask, hi_mask_len); -#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC - if (__builtin_constant_p((static_cast(hi_mask) << 32) | lo_mask)) -#endif - return Mask512{static_cast<__mmask64>( - (static_cast(hi_mask) << 32) | lo_mask)}; -#if HWY_COMPILER_GCC && !HWY_COMPILER_ICC - else - return Mask512{_mm512_kunpackd(static_cast<__mmask64>(hi_mask), - static_cast<__mmask64>(lo_mask))}; -#endif -} -#else -template -HWY_INLINE Mask512 FirstN(size_t n) { - const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0}; - return Mask512{static_cast<__mmask64>(bits)}; -} -#endif -} // namespace detail -#endif // HWY_ARCH_X86_32 - -template -HWY_API MFromD FirstN(D /* tag */, size_t n) { -#if HWY_ARCH_X86_64 - MFromD m; - const uint64_t all = ~uint64_t{0}; - // BZHI only looks at the lower 8 bits of n! - m.raw = static_cast((n > 255) ? all : _bzhi_u64(all, n)); - return m; -#else - return detail::FirstN(n); -#endif // HWY_ARCH_X86_64 -} - -// ------------------------------ IfThenElse - -// Returns mask ? b : a. - -namespace detail { - -// Templates for signed/unsigned integer of a particular size. -template -HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<1> /* tag */, - const Mask512 mask, const Vec512 yes, - const Vec512 no) { - return Vec512{_mm512_mask_blend_epi8(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<2> /* tag */, - const Mask512 mask, const Vec512 yes, - const Vec512 no) { - return Vec512{_mm512_mask_blend_epi16(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<4> /* tag */, - const Mask512 mask, const Vec512 yes, - const Vec512 no) { - return Vec512{_mm512_mask_blend_epi32(mask.raw, no.raw, yes.raw)}; -} -template -HWY_INLINE Vec512 IfThenElse(hwy::SizeTag<8> /* tag */, - const Mask512 mask, const Vec512 yes, - const Vec512 no) { - return Vec512{_mm512_mask_blend_epi64(mask.raw, no.raw, yes.raw)}; -} - -} // namespace detail - -template -HWY_API Vec512 IfThenElse(const Mask512 mask, const Vec512 yes, - const Vec512 no) { - return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); -} -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 IfThenElse(Mask512 mask, - Vec512 yes, - Vec512 no) { - return Vec512{_mm512_mask_blend_ph(mask.raw, no.raw, yes.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 IfThenElse(Mask512 mask, Vec512 yes, - Vec512 no) { - return Vec512{_mm512_mask_blend_ps(mask.raw, no.raw, yes.raw)}; -} -HWY_API Vec512 IfThenElse(Mask512 mask, Vec512 yes, - Vec512 no) { - return Vec512{_mm512_mask_blend_pd(mask.raw, no.raw, yes.raw)}; -} - -namespace detail { - -template -HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<1> /* tag */, - const Mask512 mask, - const Vec512 yes) { - return Vec512{_mm512_maskz_mov_epi8(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<2> /* tag */, - const Mask512 mask, - const Vec512 yes) { - return Vec512{_mm512_maskz_mov_epi16(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<4> /* tag */, - const Mask512 mask, - const Vec512 yes) { - return Vec512{_mm512_maskz_mov_epi32(mask.raw, yes.raw)}; -} -template -HWY_INLINE Vec512 IfThenElseZero(hwy::SizeTag<8> /* tag */, - const Mask512 mask, - 
const Vec512 yes) { - return Vec512{_mm512_maskz_mov_epi64(mask.raw, yes.raw)}; -} - -} // namespace detail - -template -HWY_API Vec512 IfThenElseZero(const Mask512 mask, const Vec512 yes) { - return detail::IfThenElseZero(hwy::SizeTag(), mask, yes); -} -HWY_API Vec512 IfThenElseZero(Mask512 mask, Vec512 yes) { - return Vec512{_mm512_maskz_mov_ps(mask.raw, yes.raw)}; -} -HWY_API Vec512 IfThenElseZero(Mask512 mask, - Vec512 yes) { - return Vec512{_mm512_maskz_mov_pd(mask.raw, yes.raw)}; -} - -namespace detail { - -template -HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<1> /* tag */, - const Mask512 mask, const Vec512 no) { - // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. - return Vec512{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<2> /* tag */, - const Mask512 mask, const Vec512 no) { - return Vec512{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<4> /* tag */, - const Mask512 mask, const Vec512 no) { - return Vec512{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; -} -template -HWY_INLINE Vec512 IfThenZeroElse(hwy::SizeTag<8> /* tag */, - const Mask512 mask, const Vec512 no) { - return Vec512{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; -} - -} // namespace detail - -template -HWY_API Vec512 IfThenZeroElse(const Mask512 mask, const Vec512 no) { - return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); -} -HWY_API Vec512 IfThenZeroElse(Mask512 mask, Vec512 no) { - return Vec512{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; -} -HWY_API Vec512 IfThenZeroElse(Mask512 mask, Vec512 no) { - return Vec512{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; -} - -template -HWY_API Vec512 IfNegativeThenElse(Vec512 v, Vec512 yes, Vec512 no) { - static_assert(IsSigned(), "Only works for signed/float"); - // AVX3 MaskFromVec only looks at the MSB - return IfThenElse(MaskFromVec(v), yes, no); -} - -template -HWY_API Vec512 ZeroIfNegative(const Vec512 v) { - // AVX3 MaskFromVec only looks at the MSB - return IfThenZeroElse(MaskFromVec(v), v); -} - -// ================================================== ARITHMETIC - -// ------------------------------ Addition - -// Unsigned -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi64(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_ps(a.raw, b.raw)}; -} -HWY_API Vec512 operator+(Vec512 a, Vec512 b) { - return Vec512{_mm512_add_pd(a.raw, b.raw)}; -} - -// ------------------------------ Subtraction - -// Unsigned -HWY_API Vec512 
operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi64(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_ps(a.raw, b.raw)}; -} -HWY_API Vec512 operator-(Vec512 a, Vec512 b) { - return Vec512{_mm512_sub_pd(a.raw, b.raw)}; -} - -// ------------------------------ SumsOf8 -HWY_API Vec512 SumsOf8(const Vec512 v) { - const Full512 d; - return Vec512{_mm512_sad_epu8(v.raw, Zero(d).raw)}; -} - -HWY_API Vec512 SumsOf8AbsDiff(Vec512 a, Vec512 b) { - return Vec512{_mm512_sad_epu8(a.raw, b.raw)}; -} - -// ------------------------------ SaturatedAdd - -// Returns a + b clamped to the destination range. - -// Unsigned -HWY_API Vec512 SaturatedAdd(Vec512 a, Vec512 b) { - return Vec512{_mm512_adds_epu8(a.raw, b.raw)}; -} -HWY_API Vec512 SaturatedAdd(Vec512 a, Vec512 b) { - return Vec512{_mm512_adds_epu16(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec512 SaturatedAdd(Vec512 a, Vec512 b) { - return Vec512{_mm512_adds_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 SaturatedAdd(Vec512 a, Vec512 b) { - return Vec512{_mm512_adds_epi16(a.raw, b.raw)}; -} - -// ------------------------------ SaturatedSub - -// Returns a - b clamped to the destination range. - -// Unsigned -HWY_API Vec512 SaturatedSub(Vec512 a, Vec512 b) { - return Vec512{_mm512_subs_epu8(a.raw, b.raw)}; -} -HWY_API Vec512 SaturatedSub(Vec512 a, Vec512 b) { - return Vec512{_mm512_subs_epu16(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec512 SaturatedSub(Vec512 a, Vec512 b) { - return Vec512{_mm512_subs_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 SaturatedSub(Vec512 a, Vec512 b) { - return Vec512{_mm512_subs_epi16(a.raw, b.raw)}; -} - -// ------------------------------ Average - -// Returns (a + b + 1) / 2 - -// Unsigned -HWY_API Vec512 AverageRound(Vec512 a, Vec512 b) { - return Vec512{_mm512_avg_epu8(a.raw, b.raw)}; -} -HWY_API Vec512 AverageRound(Vec512 a, Vec512 b) { - return Vec512{_mm512_avg_epu16(a.raw, b.raw)}; -} - -// ------------------------------ Abs (Sub) - -// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. -HWY_API Vec512 Abs(const Vec512 v) { -#if HWY_COMPILER_MSVC - // Workaround for incorrect codegen? 
(untested due to internal compiler error) - const DFromV d; - const auto zero = Zero(d); - return Vec512{_mm512_max_epi8(v.raw, (zero - v).raw)}; -#else - return Vec512{_mm512_abs_epi8(v.raw)}; -#endif -} -HWY_API Vec512 Abs(const Vec512 v) { - return Vec512{_mm512_abs_epi16(v.raw)}; -} -HWY_API Vec512 Abs(const Vec512 v) { - return Vec512{_mm512_abs_epi32(v.raw)}; -} -HWY_API Vec512 Abs(const Vec512 v) { - return Vec512{_mm512_abs_epi64(v.raw)}; -} - -// These aren't native instructions, they also involve AND with constant. -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Abs(const Vec512 v) { - return Vec512{_mm512_abs_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Vec512 Abs(const Vec512 v) { - return Vec512{_mm512_abs_ps(v.raw)}; -} -HWY_API Vec512 Abs(const Vec512 v) { -// Workaround: _mm512_abs_pd expects __m512, so implement it ourselves. -#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 803 - const DFromV d; - const RebindToUnsigned du; - return And(v, BitCast(d, Set(du, 0x7FFFFFFFFFFFFFFFULL))); -#else - return Vec512{_mm512_abs_pd(v.raw)}; -#endif -} -// ------------------------------ ShiftLeft - -template -HWY_API Vec512 ShiftLeft(const Vec512 v) { - return Vec512{_mm512_slli_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftLeft(const Vec512 v) { - return Vec512{_mm512_slli_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftLeft(const Vec512 v) { - return Vec512{_mm512_slli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftLeft(const Vec512 v) { - return Vec512{_mm512_slli_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftLeft(const Vec512 v) { - return Vec512{_mm512_slli_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftLeft(const Vec512 v) { - return Vec512{_mm512_slli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftLeft(const Vec512 v) { - const DFromV d8; - const RepartitionToWide d16; - const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v))); - return kBits == 1 - ? (v + v) - : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); -} - -// ------------------------------ ShiftRight - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - return Vec512{_mm512_srli_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - return Vec512{_mm512_srli_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - return Vec512{_mm512_srli_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - const DFromV d8; - // Use raw instead of BitCast to support N=1. 
- const Vec512 shifted{ShiftRight(Vec512{v.raw}).raw}; - return shifted & Set(d8, 0xFF >> kBits); -} - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - return Vec512{_mm512_srai_epi16(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - return Vec512{_mm512_srai_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - return Vec512{_mm512_srai_epi64(v.raw, kBits)}; -} - -template -HWY_API Vec512 ShiftRight(const Vec512 v) { - const DFromV di; - const RebindToUnsigned du; - const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); - const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// ------------------------------ RotateRight - -template -HWY_API Vec512 RotateRight(const Vec512 v) { - constexpr size_t kSizeInBits = sizeof(T) * 8; - static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); - if (kBits == 0) return v; - // AVX3 does not support 8/16-bit. - return Or(ShiftRight(v), - ShiftLeft(v)); -} - -template -HWY_API Vec512 RotateRight(const Vec512 v) { - static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); - if (kBits == 0) return v; - return Vec512{_mm512_ror_epi32(v.raw, kBits)}; -} - -template -HWY_API Vec512 RotateRight(const Vec512 v) { - static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); - if (kBits == 0) return v; - return Vec512{_mm512_ror_epi64(v.raw, kBits)}; -} - -// ------------------------------ ShiftLeftSame - -// GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512 -// shift-with-immediate: the counts should all be unsigned int. -#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100 -using Shift16Count = int; -using Shift3264Count = int; -#elif HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400 -// GCC 11.0 requires these, prior versions used a macro+cast and don't care. -using Shift16Count = int; -using Shift3264Count = unsigned int; -#else -// Assume documented behavior. Clang 11, GCC 14 and MSVC 14.28.29910 match this. 
-using Shift16Count = unsigned int; -using Shift3264Count = unsigned int; -#endif - -HWY_API Vec512 ShiftLeftSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_slli_epi16(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec512 ShiftLeftSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_slli_epi32(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec512 ShiftLeftSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_slli_epi64(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_slli_epi16(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_slli_epi32(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_slli_epi64(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -template -HWY_API Vec512 ShiftLeftSame(const Vec512 v, const int bits) { - const DFromV d8; - const RepartitionToWide d16; - const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits)); - return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); -} - -// ------------------------------ ShiftRightSame - -HWY_API Vec512 ShiftRightSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_srli_epi16(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec512 ShiftRightSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_srli_epi32(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec512 ShiftRightSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_srli_epi64(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { - const DFromV d8; - const RepartitionToWide d16; - const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits)); - return shifted & Set(d8, static_cast(0xFF >> bits)); -} - -HWY_API Vec512 ShiftRightSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_srai_epi16(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec512 ShiftRightSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_srai_epi32(v.raw, 
static_cast(bits))}; - } -#endif - return Vec512{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; -} -HWY_API Vec512 ShiftRightSame(const Vec512 v, - const int bits) { -#if HWY_COMPILER_GCC - if (__builtin_constant_p(bits)) { - return Vec512{ - _mm512_srai_epi64(v.raw, static_cast(bits))}; - } -#endif - return Vec512{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; -} - -HWY_API Vec512 ShiftRightSame(Vec512 v, const int bits) { - const DFromV di; - const RebindToUnsigned du; - const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); - const auto shifted_sign = - BitCast(di, Set(du, static_cast(0x80 >> bits))); - return (shifted ^ shifted_sign) - shifted_sign; -} - -// ------------------------------ Minimum - -// Unsigned -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epu8(a.raw, b.raw)}; -} -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epu16(a.raw, b.raw)}; -} -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epu32(a.raw, b.raw)}; -} -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epu64(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_ps(a.raw, b.raw)}; -} -HWY_API Vec512 Min(Vec512 a, Vec512 b) { - return Vec512{_mm512_min_pd(a.raw, b.raw)}; -} - -// ------------------------------ Maximum - -// Unsigned -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epu8(a.raw, b.raw)}; -} -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epu16(a.raw, b.raw)}; -} -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epu32(a.raw, b.raw)}; -} -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epu64(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epi8(a.raw, b.raw)}; -} -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_epi64(a.raw, b.raw)}; -} - -// Float -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_ps(a.raw, b.raw)}; -} -HWY_API Vec512 Max(Vec512 a, Vec512 b) { - return Vec512{_mm512_max_pd(a.raw, b.raw)}; -} - -// ------------------------------ Integer multiplication - -// Per-target flag to prevent generic_ops-inl.h from defining 64-bit operator*. 
-#ifdef HWY_NATIVE_MUL_64 -#undef HWY_NATIVE_MUL_64 -#else -#define HWY_NATIVE_MUL_64 -#endif - -// Unsigned -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mullo_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mullo_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mullo_epi64(a.raw, b.raw)}; -} -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(Vec128 a, - Vec128 b) { - return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; -} - -// Signed -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mullo_epi16(a.raw, b.raw)}; -} -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mullo_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mullo_epi64(a.raw, b.raw)}; -} -HWY_API Vec256 operator*(Vec256 a, Vec256 b) { - return Vec256{_mm256_mullo_epi64(a.raw, b.raw)}; -} -template -HWY_API Vec128 operator*(Vec128 a, - Vec128 b) { - return Vec128{_mm_mullo_epi64(a.raw, b.raw)}; -} -// Returns the upper 16 bits of a * b in each lane. -HWY_API Vec512 MulHigh(Vec512 a, Vec512 b) { - return Vec512{_mm512_mulhi_epu16(a.raw, b.raw)}; -} -HWY_API Vec512 MulHigh(Vec512 a, Vec512 b) { - return Vec512{_mm512_mulhi_epi16(a.raw, b.raw)}; -} - -HWY_API Vec512 MulFixedPoint15(Vec512 a, Vec512 b) { - return Vec512{_mm512_mulhrs_epi16(a.raw, b.raw)}; -} - -// Multiplies even lanes (0, 2 ..) and places the double-wide result into -// even and the upper half into its odd neighbor lane. -HWY_API Vec512 MulEven(Vec512 a, Vec512 b) { - return Vec512{_mm512_mul_epi32(a.raw, b.raw)}; -} -HWY_API Vec512 MulEven(Vec512 a, Vec512 b) { - return Vec512{_mm512_mul_epu32(a.raw, b.raw)}; -} - -// ------------------------------ Neg (Sub) - -template -HWY_API Vec512 Neg(const Vec512 v) { - const DFromV d; - return Xor(v, SignBit(d)); -} - -template -HWY_API Vec512 Neg(const Vec512 v) { - const DFromV d; - return Zero(d) - v; -} - -// ------------------------------ Floating-point mul / div - -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mul_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mul_ps(a.raw, b.raw)}; -} -HWY_API Vec512 operator*(Vec512 a, Vec512 b) { - return Vec512{_mm512_mul_pd(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 operator/(Vec512 a, Vec512 b) { - return Vec512{_mm512_div_ph(a.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 operator/(Vec512 a, Vec512 b) { - return Vec512{_mm512_div_ps(a.raw, b.raw)}; -} -HWY_API Vec512 operator/(Vec512 a, Vec512 b) { - return Vec512{_mm512_div_pd(a.raw, b.raw)}; -} - -// Approximate reciprocal -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 ApproximateReciprocal(const Vec512 v) { - return Vec512{_mm512_rcp_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 ApproximateReciprocal(const Vec512 v) { - return Vec512{_mm512_rcp14_ps(v.raw)}; -} - -HWY_API Vec512 ApproximateReciprocal(Vec512 v) { - return Vec512{_mm512_rcp14_pd(v.raw)}; -} - -// ------------------------------ Floating-point multiply-add variants - -#if HWY_HAVE_FLOAT16 - -HWY_API Vec512 MulAdd(Vec512 mul, Vec512 x, - Vec512 add) { - return Vec512{_mm512_fmadd_ph(mul.raw, x.raw, add.raw)}; -} - -HWY_API Vec512 NegMulAdd(Vec512 mul, Vec512 x, - Vec512 add) { - return Vec512{_mm512_fnmadd_ph(mul.raw, x.raw, 
add.raw)}; -} - -HWY_API Vec512 MulSub(Vec512 mul, Vec512 x, - Vec512 sub) { - return Vec512{_mm512_fmsub_ph(mul.raw, x.raw, sub.raw)}; -} - -HWY_API Vec512 NegMulSub(Vec512 mul, Vec512 x, - Vec512 sub) { - return Vec512{_mm512_fnmsub_ph(mul.raw, x.raw, sub.raw)}; -} - -#endif // HWY_HAVE_FLOAT16 - -// Returns mul * x + add -HWY_API Vec512 MulAdd(Vec512 mul, Vec512 x, - Vec512 add) { - return Vec512{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)}; -} -HWY_API Vec512 MulAdd(Vec512 mul, Vec512 x, - Vec512 add) { - return Vec512{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)}; -} - -// Returns add - mul * x -HWY_API Vec512 NegMulAdd(Vec512 mul, Vec512 x, - Vec512 add) { - return Vec512{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)}; -} -HWY_API Vec512 NegMulAdd(Vec512 mul, Vec512 x, - Vec512 add) { - return Vec512{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)}; -} - -// Returns mul * x - sub -HWY_API Vec512 MulSub(Vec512 mul, Vec512 x, - Vec512 sub) { - return Vec512{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)}; -} -HWY_API Vec512 MulSub(Vec512 mul, Vec512 x, - Vec512 sub) { - return Vec512{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)}; -} - -// Returns -mul * x - sub -HWY_API Vec512 NegMulSub(Vec512 mul, Vec512 x, - Vec512 sub) { - return Vec512{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)}; -} -HWY_API Vec512 NegMulSub(Vec512 mul, Vec512 x, - Vec512 sub) { - return Vec512{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)}; -} - -// ------------------------------ Floating-point square root - -// Full precision square root -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Sqrt(const Vec512 v) { - return Vec512{_mm512_sqrt_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 Sqrt(const Vec512 v) { - return Vec512{_mm512_sqrt_ps(v.raw)}; -} -HWY_API Vec512 Sqrt(const Vec512 v) { - return Vec512{_mm512_sqrt_pd(v.raw)}; -} - -// Approximate reciprocal square root -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 ApproximateReciprocalSqrt(Vec512 v) { - return Vec512{_mm512_rsqrt_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 ApproximateReciprocalSqrt(Vec512 v) { - return Vec512{_mm512_rsqrt14_ps(v.raw)}; -} - -HWY_API Vec512 ApproximateReciprocalSqrt(Vec512 v) { - return Vec512{_mm512_rsqrt14_pd(v.raw)}; -} - -// ------------------------------ Floating-point rounding - -// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
-HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") - -// Toward nearest integer, tie to even -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Round(Vec512 v) { - return Vec512{_mm512_roundscale_ph( - v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 Round(Vec512 v) { - return Vec512{_mm512_roundscale_ps( - v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec512 Round(Vec512 v) { - return Vec512{_mm512_roundscale_pd( - v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; -} - -// Toward zero, aka truncate -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Trunc(Vec512 v) { - return Vec512{ - _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 Trunc(Vec512 v) { - return Vec512{ - _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec512 Trunc(Vec512 v) { - return Vec512{ - _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; -} - -// Toward +infinity, aka ceiling -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Ceil(Vec512 v) { - return Vec512{ - _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 Ceil(Vec512 v) { - return Vec512{ - _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec512 Ceil(Vec512 v) { - return Vec512{ - _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; -} - -// Toward -infinity, aka floor -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 Floor(Vec512 v) { - return Vec512{ - _mm512_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 Floor(Vec512 v) { - return Vec512{ - _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} -HWY_API Vec512 Floor(Vec512 v) { - return Vec512{ - _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; -} - -HWY_DIAGNOSTICS(pop) - -// ================================================== COMPARE - -// Comparisons set a mask bit to 1 if the condition is true, else 0. 
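// Illustrative sketch (standalone, not from this header): because AVX-512
// comparisons yield a bitmask rather than a full-width vector, results can be
// consumed with ordinary integer operations. The helper names and the 16-lane
// mask value below are made-up examples (e.g. for 16 f32 lanes).
#include <bit>      // C++20: std::popcount, std::countr_zero
#include <cstdint>

int CountMatches(uint16_t mask_bits) { return std::popcount(mask_bits); }
int FirstMatchLane(uint16_t mask_bits) {  // returns 16 if no lane matched
  return std::countr_zero(mask_bits);
}
// e.g. mask_bits = 0b0000'0010'0100'0000 -> 2 matches, first at lane 6.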
- -template -HWY_API MFromD RebindMask(DTo /*tag*/, Mask512 m) { - static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); - return MFromD{m.raw}; -} - -namespace detail { - -template -HWY_INLINE Mask512 TestBit(hwy::SizeTag<1> /*tag*/, Vec512 v, - Vec512 bit) { - return Mask512{_mm512_test_epi8_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask512 TestBit(hwy::SizeTag<2> /*tag*/, Vec512 v, - Vec512 bit) { - return Mask512{_mm512_test_epi16_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask512 TestBit(hwy::SizeTag<4> /*tag*/, Vec512 v, - Vec512 bit) { - return Mask512{_mm512_test_epi32_mask(v.raw, bit.raw)}; -} -template -HWY_INLINE Mask512 TestBit(hwy::SizeTag<8> /*tag*/, Vec512 v, - Vec512 bit) { - return Mask512{_mm512_test_epi64_mask(v.raw, bit.raw)}; -} - -} // namespace detail - -template -HWY_API Mask512 TestBit(const Vec512 v, const Vec512 bit) { - static_assert(!hwy::IsFloat(), "Only integer vectors supported"); - return detail::TestBit(hwy::SizeTag(), v, bit); -} - -// ------------------------------ Equality - -template -HWY_API Mask512 operator==(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpeq_epi8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask512 operator==(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpeq_epi16_mask(a.raw, b.raw)}; -} -template -HWY_API Mask512 operator==(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpeq_epi32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask512 operator==(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpeq_epi64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Mask512 operator==(Vec512 a, - Vec512 b) { - return Mask512{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Mask512 operator==(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -HWY_API Mask512 operator==(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; -} - -// ------------------------------ Inequality - -template -HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpneq_epi8_mask(a.raw, b.raw)}; -} -template -HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpneq_epi16_mask(a.raw, b.raw)}; -} -template -HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpneq_epi32_mask(a.raw, b.raw)}; -} -template -HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpneq_epi64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Mask512 operator!=(Vec512 a, - Vec512 b) { - return Mask512{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} - -HWY_API Mask512 operator!=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; -} - -// ------------------------------ Strict inequality - -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpgt_epu8_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpgt_epu16_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpgt_epu32_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpgt_epu64_mask(a.raw, b.raw)}; -} - -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpgt_epi8_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return 
Mask512{_mm512_cmpgt_epi16_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpgt_epi32_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpgt_epi64_mask(a.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} -HWY_API Mask512 operator>(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; -} - -// ------------------------------ Weak inequality - -#if HWY_HAVE_FLOAT16 -HWY_API Mask512 operator>=(Vec512 a, - Vec512 b) { - return Mask512{_mm512_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; -} - -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epu8_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epu16_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epu32_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epu64_mask(a.raw, b.raw)}; -} - -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epi8_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epi16_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epi32_mask(a.raw, b.raw)}; -} -HWY_API Mask512 operator>=(Vec512 a, Vec512 b) { - return Mask512{_mm512_cmpge_epi64_mask(a.raw, b.raw)}; -} - -// ------------------------------ Reversed comparisons - -template -HWY_API Mask512 operator<(Vec512 a, Vec512 b) { - return b > a; -} - -template -HWY_API Mask512 operator<=(Vec512 a, Vec512 b) { - return b >= a; -} - -// ------------------------------ Mask - -namespace detail { - -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<1> /*tag*/, Vec512 v) { - return Mask512{_mm512_movepi8_mask(v.raw)}; -} -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<2> /*tag*/, Vec512 v) { - return Mask512{_mm512_movepi16_mask(v.raw)}; -} -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<4> /*tag*/, Vec512 v) { - return Mask512{_mm512_movepi32_mask(v.raw)}; -} -template -HWY_INLINE Mask512 MaskFromVec(hwy::SizeTag<8> /*tag*/, Vec512 v) { - return Mask512{_mm512_movepi64_mask(v.raw)}; -} - -} // namespace detail - -template -HWY_API Mask512 MaskFromVec(Vec512 v) { - return detail::MaskFromVec(hwy::SizeTag(), v); -} -template -HWY_API Mask512 MaskFromVec(Vec512 v) { - const RebindToSigned> di; - return Mask512{MaskFromVec(BitCast(di, v)).raw}; -} - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi8(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi8(v.raw)}; -} - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi16(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi16(v.raw)}; -} -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_ph(_mm512_movm_epi16(v.raw))}; 
-} -#endif // HWY_HAVE_FLOAT16 - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi32(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi32(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))}; -} - -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi64(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_movm_epi64(v.raw)}; -} -HWY_API Vec512 VecFromMask(Mask512 v) { - return Vec512{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))}; -} - -// ------------------------------ Mask logical - -namespace detail { - -template -HWY_INLINE Mask512 Not(hwy::SizeTag<1> /*tag*/, Mask512 m) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_knot_mask64(m.raw)}; -#else - return Mask512{~m.raw}; -#endif -} -template -HWY_INLINE Mask512 Not(hwy::SizeTag<2> /*tag*/, Mask512 m) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_knot_mask32(m.raw)}; -#else - return Mask512{~m.raw}; -#endif -} -template -HWY_INLINE Mask512 Not(hwy::SizeTag<4> /*tag*/, Mask512 m) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_knot_mask16(m.raw)}; -#else - return Mask512{static_cast(~m.raw & 0xFFFF)}; -#endif -} -template -HWY_INLINE Mask512 Not(hwy::SizeTag<8> /*tag*/, Mask512 m) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_knot_mask8(m.raw)}; -#else - return Mask512{static_cast(~m.raw & 0xFF)}; -#endif -} - -template -HWY_INLINE Mask512 And(hwy::SizeTag<1> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kand_mask64(a.raw, b.raw)}; -#else - return Mask512{a.raw & b.raw}; -#endif -} -template -HWY_INLINE Mask512 And(hwy::SizeTag<2> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kand_mask32(a.raw, b.raw)}; -#else - return Mask512{a.raw & b.raw}; -#endif -} -template -HWY_INLINE Mask512 And(hwy::SizeTag<4> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kand_mask16(a.raw, b.raw)}; -#else - return Mask512{static_cast(a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask512 And(hwy::SizeTag<8> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kand_mask8(a.raw, b.raw)}; -#else - return Mask512{static_cast(a.raw & b.raw)}; -#endif -} - -template -HWY_INLINE Mask512 AndNot(hwy::SizeTag<1> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kandn_mask64(a.raw, b.raw)}; -#else - return Mask512{~a.raw & b.raw}; -#endif -} -template -HWY_INLINE Mask512 AndNot(hwy::SizeTag<2> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kandn_mask32(a.raw, b.raw)}; -#else - return Mask512{~a.raw & b.raw}; -#endif -} -template -HWY_INLINE Mask512 AndNot(hwy::SizeTag<4> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kandn_mask16(a.raw, b.raw)}; -#else - return Mask512{static_cast(~a.raw & b.raw)}; -#endif -} -template -HWY_INLINE Mask512 AndNot(hwy::SizeTag<8> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kandn_mask8(a.raw, b.raw)}; -#else - return Mask512{static_cast(~a.raw & b.raw)}; -#endif -} - -template -HWY_INLINE Mask512 Or(hwy::SizeTag<1> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kor_mask64(a.raw, b.raw)}; -#else - return Mask512{a.raw | b.raw}; -#endif -} -template -HWY_INLINE 
Mask512 Or(hwy::SizeTag<2> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kor_mask32(a.raw, b.raw)}; -#else - return Mask512{a.raw | b.raw}; -#endif -} -template -HWY_INLINE Mask512 Or(hwy::SizeTag<4> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kor_mask16(a.raw, b.raw)}; -#else - return Mask512{static_cast(a.raw | b.raw)}; -#endif -} -template -HWY_INLINE Mask512 Or(hwy::SizeTag<8> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kor_mask8(a.raw, b.raw)}; -#else - return Mask512{static_cast(a.raw | b.raw)}; -#endif -} - -template -HWY_INLINE Mask512 Xor(hwy::SizeTag<1> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxor_mask64(a.raw, b.raw)}; -#else - return Mask512{a.raw ^ b.raw}; -#endif -} -template -HWY_INLINE Mask512 Xor(hwy::SizeTag<2> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxor_mask32(a.raw, b.raw)}; -#else - return Mask512{a.raw ^ b.raw}; -#endif -} -template -HWY_INLINE Mask512 Xor(hwy::SizeTag<4> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxor_mask16(a.raw, b.raw)}; -#else - return Mask512{static_cast(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask512 Xor(hwy::SizeTag<8> /*tag*/, Mask512 a, Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxor_mask8(a.raw, b.raw)}; -#else - return Mask512{static_cast(a.raw ^ b.raw)}; -#endif -} - -template -HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxnor_mask64(a.raw, b.raw)}; -#else - return Mask512{~(a.raw ^ b.raw)}; -#endif -} -template -HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<2> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxnor_mask32(a.raw, b.raw)}; -#else - return Mask512{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)}; -#endif -} -template -HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxnor_mask16(a.raw, b.raw)}; -#else - return Mask512{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; -#endif -} -template -HWY_INLINE Mask512 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, Mask512 a, - Mask512 b) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return Mask512{_kxnor_mask8(a.raw, b.raw)}; -#else - return Mask512{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; -#endif -} - -} // namespace detail - -template -HWY_API Mask512 Not(Mask512 m) { - return detail::Not(hwy::SizeTag(), m); -} - -template -HWY_API Mask512 And(Mask512 a, Mask512 b) { - return detail::And(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask512 AndNot(Mask512 a, Mask512 b) { - return detail::AndNot(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask512 Or(Mask512 a, Mask512 b) { - return detail::Or(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask512 Xor(Mask512 a, Mask512 b) { - return detail::Xor(hwy::SizeTag(), a, b); -} - -template -HWY_API Mask512 ExclusiveNeither(Mask512 a, Mask512 b) { - return detail::ExclusiveNeither(hwy::SizeTag(), a, b); -} - -// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) - -HWY_API Vec512 BroadcastSignBit(Vec512 v) { - const DFromV d; - return VecFromMask(v < Zero(d)); -} - -HWY_API Vec512 BroadcastSignBit(Vec512 v) { - return ShiftRight<15>(v); -} - -HWY_API Vec512 
BroadcastSignBit(Vec512 v) { - return ShiftRight<31>(v); -} - -HWY_API Vec512 BroadcastSignBit(Vec512 v) { - return Vec512{_mm512_srai_epi64(v.raw, 63)}; -} - -// ------------------------------ Floating-point classification (Not) - -#if HWY_HAVE_FLOAT16 || HWY_IDE - -HWY_API Mask512 IsNaN(Vec512 v) { - return Mask512{_mm512_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -} - -HWY_API Mask512 IsInf(Vec512 v) { - return Mask512{_mm512_fpclass_ph_mask(v.raw, 0x18)}; -} - -// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for -// positive, so we have to check for inf/NaN and negate. -HWY_API Mask512 IsFinite(Vec512 v) { - return Not(Mask512{_mm512_fpclass_ph_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} - -#endif // HWY_HAVE_FLOAT16 - -HWY_API Mask512 IsNaN(Vec512 v) { - return Mask512{_mm512_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -} -HWY_API Mask512 IsNaN(Vec512 v) { - return Mask512{_mm512_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; -} - -HWY_API Mask512 IsInf(Vec512 v) { - return Mask512{_mm512_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} -HWY_API Mask512 IsInf(Vec512 v) { - return Mask512{_mm512_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; -} - -// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for -// positive, so we have to check for inf/NaN and negate. -HWY_API Mask512 IsFinite(Vec512 v) { - return Not(Mask512{_mm512_fpclass_ps_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} -HWY_API Mask512 IsFinite(Vec512 v) { - return Not(Mask512{_mm512_fpclass_pd_mask( - v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | - HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); -} - -// ================================================== MEMORY - -// ------------------------------ Load - -template -HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) { - return VFromD{_mm512_load_si512(aligned)}; -} -// bfloat16_t is handled by x86_128-inl.h. -template -HWY_API Vec512 Load(D d, const float16_t* HWY_RESTRICT aligned) { -#if HWY_HAVE_FLOAT16 - (void)d; - return Vec512{_mm512_load_ph(aligned)}; -#else - const RebindToUnsigned du; - return BitCast(d, Load(du, reinterpret_cast(aligned))); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec512 Load(D /* tag */, const float* HWY_RESTRICT aligned) { - return Vec512{_mm512_load_ps(aligned)}; -} -template -HWY_API VFromD Load(D /* tag */, const double* HWY_RESTRICT aligned) { - return VFromD{_mm512_load_pd(aligned)}; -} - -template -HWY_API VFromD LoadU(D /* tag */, const TFromD* HWY_RESTRICT p) { - return VFromD{_mm512_loadu_si512(p)}; -} - -// bfloat16_t is handled by x86_128-inl.h. 
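A usage note on the two load flavors just defined: Load requires the pointer to be aligned to the full 64-byte vector, while LoadU accepts any address. A small sketch, assuming the public hwy::AllocateAligned helper to obtain suitably aligned storage:

#include <cstddef>
#include <cstdint>

#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int32_t SumOfIota() {
  const hn::ScalableTag<int32_t> d;
  const size_t N = hn::Lanes(d);
  // AllocateAligned returns 64-byte-aligned memory, so the aligned Store/Load
  // below are valid; StoreU/LoadU would be needed for arbitrary addresses.
  const auto buf = hwy::AllocateAligned<int32_t>(N);
  hn::Store(hn::Iota(d, 0), d, buf.get());
  const auto v = hn::Load(d, buf.get());
  return hn::GetLane(hn::SumOfLanes(d, v));  // 0 + 1 + ... + (N - 1)
}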
-template -HWY_API Vec512 LoadU(D d, const float16_t* HWY_RESTRICT p) { -#if HWY_HAVE_FLOAT16 - (void)d; - return Vec512{_mm512_loadu_ph(p)}; -#else - const RebindToUnsigned du; - return BitCast(d, LoadU(du, reinterpret_cast(p))); -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API Vec512 LoadU(D /* tag */, const float* HWY_RESTRICT p) { - return Vec512{_mm512_loadu_ps(p)}; -} -template -HWY_API VFromD LoadU(D /* tag */, const double* HWY_RESTRICT p) { - return VFromD{_mm512_loadu_pd(p)}; -} - -// ------------------------------ MaskedLoad - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm512_maskz_loadu_epi8(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - return BitCast(d, VFromD{_mm512_maskz_loadu_epi16( - m.raw, reinterpret_cast(p))}); -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm512_maskz_loadu_epi32(m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm512_maskz_loadu_epi64(m.raw, p)}; -} - -template -HWY_API Vec512 MaskedLoad(Mask512 m, D /* tag */, - const float* HWY_RESTRICT p) { - return Vec512{_mm512_maskz_loadu_ps(m.raw, p)}; -} - -template -HWY_API Vec512 MaskedLoad(Mask512 m, D /* tag */, - const double* HWY_RESTRICT p) { - return Vec512{_mm512_maskz_loadu_pd(m.raw, p)}; -} - -// ------------------------------ MaskedLoadOr - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm512_mask_loadu_epi8(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, - const TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - return VFromD{_mm512_mask_loadu_epi16( - BitCast(du, v).raw, m.raw, reinterpret_cast(p))}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm512_mask_loadu_epi32(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT p) { - return VFromD{_mm512_mask_loadu_epi64(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, Mask512 m, D /* tag */, - const float* HWY_RESTRICT p) { - return VFromD{_mm512_mask_loadu_ps(v.raw, m.raw, p)}; -} - -template -HWY_API VFromD MaskedLoadOr(VFromD v, Mask512 m, D /* tag */, - const double* HWY_RESTRICT p) { - return VFromD{_mm512_mask_loadu_pd(v.raw, m.raw, p)}; -} - -// ------------------------------ LoadDup128 - -// Loads 128 bit and duplicates into both 128-bit halves. This avoids the -// 3-cycle cost of moving data between 128-bit halves and avoids port 5. 
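Returning to the masked loads defined above: combined with FirstN they handle loop remainders without a scalar tail, and BlendedStore (defined just below) is the matching partial store. A sketch against the public API (op names assumed from hwy/highway.h):

#include <cstddef>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Adds 1.0f to x[0, n) for arbitrary n. MaskedLoad zero-fills inactive lanes,
// and BlendedStore writes only the lanes selected by the mask.
void AddOne(float* HWY_RESTRICT x, size_t n) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  size_t i = 0;
  for (; i + N <= n; i += N) {
    hn::StoreU(hn::Add(hn::LoadU(d, x + i), hn::Set(d, 1.0f)), d, x + i);
  }
  if (i < n) {
    const auto m = hn::FirstN(d, n - i);  // first (n - i) lanes active
    const auto v = hn::MaskedLoad(m, d, x + i);
    hn::BlendedStore(hn::Add(v, hn::Set(d, 1.0f)), m, d, x + i);
  }
}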
-template -HWY_API VFromD LoadDup128(D /* tag */, - const TFromD* const HWY_RESTRICT p) { - const Full128> d128; - return VFromD{_mm512_broadcast_i32x4(LoadU(d128, p).raw)}; -} -template -HWY_API VFromD LoadDup128(D /* tag */, const float* HWY_RESTRICT p) { - const __m128 x4 = _mm_loadu_ps(p); - return VFromD{_mm512_broadcast_f32x4(x4)}; -} - -template -HWY_API VFromD LoadDup128(D /* tag */, const double* HWY_RESTRICT p) { - const __m128d x2 = _mm_loadu_pd(p); - return VFromD{_mm512_broadcast_f64x2(x2)}; -} - -// ------------------------------ Store - -template -HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { - _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw); -} -// bfloat16_t is handled by x86_128-inl.h. -template -HWY_API void Store(Vec512 v, D /* tag */, - float16_t* HWY_RESTRICT aligned) { -#if HWY_HAVE_FLOAT16 - _mm512_store_ph(aligned, v.raw); -#else - _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw); -#endif -} -template -HWY_API void Store(Vec512 v, D /* tag */, float* HWY_RESTRICT aligned) { - _mm512_store_ps(aligned, v.raw); -} -template -HWY_API void Store(VFromD v, D /* tag */, double* HWY_RESTRICT aligned) { - _mm512_store_pd(aligned, v.raw); -} - -template -HWY_API void StoreU(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { - _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw); -} -// bfloat16_t is handled by x86_128-inl.h. -template -HWY_API void StoreU(Vec512 v, D /* tag */, - float16_t* HWY_RESTRICT p) { -#if HWY_HAVE_FLOAT16 - _mm512_storeu_ph(p, v.raw); -#else - _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw); -#endif // HWY_HAVE_FLOAT16 -} - -template -HWY_API void StoreU(Vec512 v, D /* tag */, float* HWY_RESTRICT p) { - _mm512_storeu_ps(p, v.raw); -} -template -HWY_API void StoreU(Vec512 v, D /* tag */, double* HWY_RESTRICT p) { - _mm512_storeu_pd(p, v.raw); -} - -// ------------------------------ BlendedStore - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - _mm512_mask_storeu_epi8(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT p) { - const RebindToUnsigned du; // for float16_t - _mm512_mask_storeu_epi16(reinterpret_cast(p), m.raw, - BitCast(du, v).raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - _mm512_mask_storeu_epi32(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT p) { - _mm512_mask_storeu_epi64(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(Vec512 v, Mask512 m, D /* tag */, - float* HWY_RESTRICT p) { - _mm512_mask_storeu_ps(p, m.raw, v.raw); -} - -template -HWY_API void BlendedStore(Vec512 v, Mask512 m, D /* tag */, - double* HWY_RESTRICT p) { - _mm512_mask_storeu_pd(p, m.raw, v.raw); -} - -// ------------------------------ Non-temporal stores - -template -HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { - const RebindToUnsigned du; // for float16_t - _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), BitCast(du, v).raw); -} -template -HWY_API void Stream(VFromD v, D /* tag */, float* HWY_RESTRICT aligned) { - _mm512_stream_ps(aligned, v.raw); -} -template -HWY_API void Stream(VFromD v, D /* tag */, double* HWY_RESTRICT aligned) { - _mm512_stream_pd(aligned, v.raw); -} - -// ------------------------------ ScatterOffset - -// Work around warnings in the intrinsic definitions (passing -1 as a mask). 
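The Scatter*/Gather* wrappers that follow take a vector of indices (or byte offsets) whose lane width matches the data type. A strided-copy sketch using the public GatherIndex/ScatterIndex ops (assumed API; these indices are element counts, not bytes):

#include <cstdint>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Gathers src[0], src[stride], src[2 * stride], ... into one vector and
// scatters it back to the same strided positions in dst.
void CopyStrided(const float* HWY_RESTRICT src, float* HWY_RESTRICT dst,
                 int32_t stride) {
  const hn::ScalableTag<float> d;
  const hn::RebindToSigned<decltype(d)> di;  // int32_t index lanes
  const auto idx = hn::Mul(hn::Iota(di, 0), hn::Set(di, stride));
  const auto v = hn::GatherIndex(d, src, idx);
  hn::ScatterIndex(v, d, dst, idx);
}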
-HWY_DIAGNOSTICS(push) -HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> offset) { - _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1); -} - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> offset) { - _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1); -} - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, float* HWY_RESTRICT base, - Vec512 offset) { - _mm512_i32scatter_ps(base, offset.raw, v.raw, 1); -} - -template -HWY_API void ScatterOffset(VFromD v, D /* tag */, double* HWY_RESTRICT base, - Vec512 offset) { - _mm512_i64scatter_pd(base, offset.raw, v.raw, 1); -} - -// ------------------------------ ScatterIndex - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm512_i32scatter_epi32(base, index.raw, v.raw, 4); -} - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm512_i64scatter_epi64(base, index.raw, v.raw, 8); -} - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, float* HWY_RESTRICT base, - Vec512 index) { - _mm512_i32scatter_ps(base, index.raw, v.raw, 4); -} - -template -HWY_API void ScatterIndex(VFromD v, D /* tag */, double* HWY_RESTRICT base, - Vec512 index) { - _mm512_i64scatter_pd(base, index.raw, v.raw, 8); -} - -// ------------------------------ MaskedScatterIndex - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm512_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, 4); -} - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - TFromD* HWY_RESTRICT base, - VFromD> index) { - _mm512_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, 8); -} - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - float* HWY_RESTRICT base, - Vec512 index) { - _mm512_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, 4); -} - -template -HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, - double* HWY_RESTRICT base, - Vec512 index) { - _mm512_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, 8); -} - -// ------------------------------ Gather - -namespace detail { - -template -HWY_INLINE Vec512 NativeGather(const T* HWY_RESTRICT base, - Vec512 index) { - return Vec512{_mm512_i32gather_epi32(index.raw, base, kScale)}; -} - -template -HWY_INLINE Vec512 NativeGather(const T* HWY_RESTRICT base, - Vec512 index) { - return Vec512{_mm512_i64gather_epi64(index.raw, base, kScale)}; -} - -template -HWY_INLINE Vec512 NativeGather(const float* HWY_RESTRICT base, - Vec512 index) { - return Vec512{_mm512_i32gather_ps(index.raw, base, kScale)}; -} - -template -HWY_INLINE Vec512 NativeGather(const double* HWY_RESTRICT base, - Vec512 index) { - return Vec512{_mm512_i64gather_pd(index.raw, base, kScale)}; -} - -template -HWY_INLINE Vec512 NativeMaskedGather(Mask512 m, - const T* HWY_RESTRICT base, - Vec512 index) { - const Full512 d; - return Vec512{ - _mm512_mask_i32gather_epi32(Zero(d).raw, m.raw, index.raw, base, kScale)}; -} - -template -HWY_INLINE Vec512 NativeMaskedGather(Mask512 m, - const T* HWY_RESTRICT base, - Vec512 index) { - const Full512 d; - return Vec512{ - _mm512_mask_i64gather_epi64(Zero(d).raw, m.raw, index.raw, base, kScale)}; -} - -template -HWY_INLINE Vec512 NativeMaskedGather(Mask512 m, - const float* HWY_RESTRICT base, - Vec512 
index) { - const Full512 d; - return Vec512{ - _mm512_mask_i32gather_ps(Zero(d).raw, m.raw, index.raw, base, kScale)}; -} - -template -HWY_INLINE Vec512 NativeMaskedGather(Mask512 m, - const double* HWY_RESTRICT base, - Vec512 index) { - const Full512 d; - return Vec512{ - _mm512_mask_i64gather_pd(Zero(d).raw, m.raw, index.raw, base, kScale)}; -} -} // namespace detail - -template -HWY_API VFromD GatherOffset(D /* tag */, const TFromD* HWY_RESTRICT base, - Vec512 offset) { - static_assert(sizeof(TFromD) == sizeof(TI), "Must match for portability"); - return detail::NativeGather<1>(base, offset); -} -template -HWY_API VFromD GatherIndex(D /* tag */, const TFromD* HWY_RESTRICT base, - Vec512 index) { - static_assert(sizeof(TFromD) == sizeof(TI), "Must match for portability"); - return detail::NativeGather)>(base, index); -} -template -HWY_API VFromD MaskedGatherIndex(MFromD m, D /* tag */, - const TFromD* HWY_RESTRICT base, - Vec512 index) { - static_assert(sizeof(TFromD) == sizeof(TI), "Must match for portability"); - return detail::NativeMaskedGather)>(m, base, index); -} - -HWY_DIAGNOSTICS(pop) - -// ================================================== SWIZZLE - -// ------------------------------ LowerHalf - -template -HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { - return VFromD{_mm512_castsi512_si256(v.raw)}; -} -template -HWY_API VFromD LowerHalf(D /* tag */, Vec512 v) { - return VFromD{_mm512_castsi512_si256(v.raw)}; -} -template -HWY_API VFromD LowerHalf(D /* tag */, Vec512 v) { -#if HWY_HAVE_FLOAT16 - return VFromD{_mm512_castph512_ph256(v.raw)}; -#else - return VFromD{_mm512_castsi512_si256(v.raw)}; -#endif // HWY_HAVE_FLOAT16 -} -template -HWY_API VFromD LowerHalf(D /* tag */, Vec512 v) { - return VFromD{_mm512_castps512_ps256(v.raw)}; -} -template -HWY_API VFromD LowerHalf(D /* tag */, Vec512 v) { - return VFromD{_mm512_castpd512_pd256(v.raw)}; -} - -template -HWY_API Vec256 LowerHalf(Vec512 v) { - const Half> dh; - return LowerHalf(dh, v); -} - -// ------------------------------ UpperHalf - -template -HWY_API VFromD UpperHalf(D d, VFromD> v) { - const RebindToUnsigned du; // for float16_t - const Twice dut; - return BitCast(d, VFromD{ - _mm512_extracti32x8_epi32(BitCast(dut, v).raw, 1)}); -} -template -HWY_API VFromD UpperHalf(D /* tag */, VFromD> v) { - return VFromD{_mm512_extractf32x8_ps(v.raw, 1)}; -} -template -HWY_API VFromD UpperHalf(D /* tag */, VFromD> v) { - return VFromD{_mm512_extractf64x4_pd(v.raw, 1)}; -} - -// ------------------------------ ExtractLane (Store) -template -HWY_API T ExtractLane(const Vec512 v, size_t i) { - const DFromV d; - HWY_DASSERT(i < Lanes(d)); - -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - constexpr size_t kLanesPerBlock = 16 / sizeof(T); - if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) { - return ExtractLane(ResizeBitCast(Full128(), v), i); - } -#endif - - alignas(64) T lanes[Lanes(d)]; - Store(v, d, lanes); - return lanes[i]; -} - -// ------------------------------ ExtractBlock -template * = nullptr> -HWY_API Vec128 ExtractBlock(Vec512 v) { - const DFromV d; - const Half dh; - return ExtractBlock(LowerHalf(dh, v)); -} - -template 1)>* = nullptr> -HWY_API Vec128 ExtractBlock(Vec512 v) { - static_assert(kBlockIdx <= 3, "Invalid block index"); - return Vec128{_mm512_extracti32x4_epi32(v.raw, kBlockIdx)}; -} - -template 1)>* = nullptr> -HWY_API Vec128 ExtractBlock(Vec512 v) { - static_assert(kBlockIdx <= 3, "Invalid block index"); - return Vec128{_mm512_extractf32x4_ps(v.raw, kBlockIdx)}; -} - 
-template 1)>* = nullptr> -HWY_API Vec128 ExtractBlock(Vec512 v) { - static_assert(kBlockIdx <= 3, "Invalid block index"); - return Vec128{_mm512_extractf64x2_pd(v.raw, kBlockIdx)}; -} - -// ------------------------------ InsertLane (Store) -template -HWY_API Vec512 InsertLane(const Vec512 v, size_t i, T t) { - return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); -} - -// ------------------------------ InsertBlock -namespace detail { - -template -HWY_INLINE Vec512 InsertBlock(hwy::SizeTag<0> /* blk_idx_tag */, Vec512 v, - Vec128 blk_to_insert) { - const DFromV d; - const auto insert_mask = FirstN(d, 16 / sizeof(T)); - return IfThenElse(insert_mask, ResizeBitCast(d, blk_to_insert), v); -} - -template -HWY_INLINE Vec512 InsertBlock(hwy::SizeTag /* blk_idx_tag */, - Vec512 v, Vec128 blk_to_insert) { - return Vec512{_mm512_inserti32x4(v.raw, blk_to_insert.raw, - static_cast(kBlockIdx & 3))}; -} - -template * = nullptr> -HWY_INLINE Vec512 InsertBlock(hwy::SizeTag /* blk_idx_tag */, - Vec512 v, - Vec128 blk_to_insert) { - return Vec512{_mm512_insertf32x4(v.raw, blk_to_insert.raw, - static_cast(kBlockIdx & 3))}; -} - -template * = nullptr> -HWY_INLINE Vec512 InsertBlock(hwy::SizeTag /* blk_idx_tag */, - Vec512 v, - Vec128 blk_to_insert) { - return Vec512{_mm512_insertf64x2(v.raw, blk_to_insert.raw, - static_cast(kBlockIdx & 3))}; -} - -} // namespace detail - -template -HWY_API Vec512 InsertBlock(Vec512 v, Vec128 blk_to_insert) { - static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index"); - return detail::InsertBlock(hwy::SizeTag(kBlockIdx)>(), v, - blk_to_insert); -} - -// ------------------------------ GetLane (LowerHalf) -template -HWY_API T GetLane(const Vec512 v) { - return GetLane(LowerHalf(v)); -} - -// ------------------------------ ZeroExtendVector - -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { -#if HWY_HAVE_ZEXT // See definition/comment in x86_256-inl.h. 
- (void)d; - return VFromD{_mm512_zextsi256_si512(lo.raw)}; -#else - return VFromD{_mm512_inserti32x8(Zero(d).raw, lo.raw, 0)}; -#endif -} -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { -#if HWY_HAVE_ZEXT - (void)d; - return VFromD{_mm512_zextph256_ph512(lo.raw)}; -#else - const RebindToUnsigned du; - return BitCast(d, ZeroExtendVector(du, BitCast(du, lo))); -#endif -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { -#if HWY_HAVE_ZEXT - (void)d; - return VFromD{_mm512_zextps256_ps512(lo.raw)}; -#else - return VFromD{_mm512_insertf32x8(Zero(d).raw, lo.raw, 0)}; -#endif -} -template -HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { -#if HWY_HAVE_ZEXT - (void)d; - return VFromD{_mm512_zextpd256_pd512(lo.raw)}; -#else - return VFromD{_mm512_insertf64x4(Zero(d).raw, lo.raw, 0)}; -#endif -} - -// ------------------------------ ZeroExtendResizeBitCast - -namespace detail { - -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, - DTo d_to, DFrom d_from, VFromD v) { - const Repartition du8_from; - const auto vu8 = BitCast(du8_from, v); -#if HWY_HAVE_ZEXT - (void)d_to; - return VFromD{_mm512_zextsi128_si512(vu8.raw)}; -#else - return VFromD{_mm512_inserti32x4(Zero(d_to).raw, vu8.raw, 0)}; -#endif -} - -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, - DTo d_to, DFrom d_from, VFromD v) { - const Repartition df32_from; - const auto vf32 = BitCast(df32_from, v); -#if HWY_HAVE_ZEXT - (void)d_to; - return Vec512{_mm512_zextps128_ps512(vf32.raw)}; -#else - return Vec512{_mm512_insertf32x4(Zero(d_to).raw, vf32.raw, 0)}; -#endif -} - -template -HWY_INLINE Vec512 ZeroExtendResizeBitCast( - hwy::SizeTag<16> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, - DTo d_to, DFrom d_from, VFromD v) { - const Repartition df64_from; - const auto vf64 = BitCast(df64_from, v); -#if HWY_HAVE_ZEXT - (void)d_to; - return Vec512{_mm512_zextpd128_pd512(vf64.raw)}; -#else - return Vec512{_mm512_insertf64x2(Zero(d_to).raw, vf64.raw, 0)}; -#endif -} - -template -HWY_INLINE VFromD ZeroExtendResizeBitCast( - hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<64> /* to_size_tag */, - DTo d_to, DFrom d_from, VFromD v) { - const Twice dt_from; - return ZeroExtendResizeBitCast(hwy::SizeTag<16>(), hwy::SizeTag<64>(), d_to, - dt_from, ZeroExtendVector(dt_from, v)); -} - -} // namespace detail - -// ------------------------------ Combine - -template -HWY_API VFromD Combine(D d, VFromD> hi, VFromD> lo) { - const RebindToUnsigned du; // for float16_t - const Half duh; - const __m512i lo512 = ZeroExtendVector(du, BitCast(duh, lo)).raw; - return VFromD{_mm512_inserti32x8(lo512, BitCast(duh, hi).raw, 1)}; -} -template -HWY_API VFromD Combine(D d, VFromD> hi, VFromD> lo) { - return VFromD{_mm512_insertf32x8(ZeroExtendVector(d, lo).raw, hi.raw, 1)}; -} -template -HWY_API VFromD Combine(D d, VFromD> hi, VFromD> lo) { - return VFromD{_mm512_insertf64x4(ZeroExtendVector(d, lo).raw, hi.raw, 1)}; -} - -// ------------------------------ ShiftLeftBytes -template -HWY_API VFromD ShiftLeftBytes(D /* tag */, const VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); - return VFromD{_mm512_bslli_epi128(v.raw, kBytes)}; -} - -// ------------------------------ ShiftRightBytes -template -HWY_API VFromD ShiftRightBytes(D /* tag */, const VFromD v) { - static_assert(0 <= kBytes && kBytes <= 16, 
"Invalid kBytes"); - return VFromD{_mm512_bsrli_epi128(v.raw, kBytes)}; -} - -// ------------------------------ CombineShiftRightBytes - -template -HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { - const Repartition d8; - return BitCast(d, Vec512{_mm512_alignr_epi8( - BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); -} - -// ------------------------------ Broadcast/splat any lane - -template -HWY_API Vec512 Broadcast(const Vec512 v) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; - const VU vu = BitCast(du, v); // for float16_t - static_assert(0 <= kLane && kLane < 8, "Invalid lane"); - if (kLane < 4) { - const __m512i lo = _mm512_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF); - return BitCast(d, VU{_mm512_unpacklo_epi64(lo, lo)}); - } else { - const __m512i hi = - _mm512_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF); - return BitCast(d, VU{_mm512_unpackhi_epi64(hi, hi)}); - } -} - -template -HWY_API Vec512 Broadcast(const Vec512 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); - return Vec512{_mm512_shuffle_epi32(v.raw, perm)}; -} - -template -HWY_API Vec512 Broadcast(const Vec512 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA; - return Vec512{_mm512_shuffle_epi32(v.raw, perm)}; -} - -template -HWY_API Vec512 Broadcast(const Vec512 v) { - static_assert(0 <= kLane && kLane < 4, "Invalid lane"); - constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane); - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, perm)}; -} - -template -HWY_API Vec512 Broadcast(const Vec512 v) { - static_assert(0 <= kLane && kLane < 2, "Invalid lane"); - constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane); - return Vec512{_mm512_shuffle_pd(v.raw, v.raw, perm)}; -} - -// ------------------------------ BroadcastBlock -template -HWY_API Vec512 BroadcastBlock(Vec512 v) { - static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index"); - return Vec512{_mm512_shuffle_i32x4(v.raw, v.raw, 0x55 * kBlockIdx)}; -} - -template -HWY_API Vec512 BroadcastBlock(Vec512 v) { - static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index"); - return Vec512{_mm512_shuffle_f32x4(v.raw, v.raw, 0x55 * kBlockIdx)}; -} - -template -HWY_API Vec512 BroadcastBlock(Vec512 v) { - static_assert(0 <= kBlockIdx && kBlockIdx <= 3, "Invalid block index"); - return Vec512{_mm512_shuffle_f64x2(v.raw, v.raw, 0x55 * kBlockIdx)}; -} - -// ------------------------------ BroadcastLane - -namespace detail { - -template -HWY_INLINE Vec512 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec512 v) { - return Vec512{_mm512_broadcastb_epi8(ResizeBitCast(Full128(), v).raw)}; -} - -template -HWY_INLINE Vec512 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec512 v) { - return Vec512{_mm512_broadcastw_epi16(ResizeBitCast(Full128(), v).raw)}; -} - -template -HWY_INLINE Vec512 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec512 v) { - return Vec512{_mm512_broadcastd_epi32(ResizeBitCast(Full128(), v).raw)}; -} - -template -HWY_INLINE Vec512 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec512 v) { - return Vec512{_mm512_broadcastq_epi64(ResizeBitCast(Full128(), v).raw)}; -} - -HWY_INLINE Vec512 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec512 v) { - return Vec512{ - _mm512_broadcastss_ps(ResizeBitCast(Full128(), v).raw)}; -} - -HWY_INLINE Vec512 
BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, - Vec512 v) { - return Vec512{ - _mm512_broadcastsd_pd(ResizeBitCast(Full128(), v).raw)}; -} - -template * = nullptr> -HWY_INLINE Vec512 BroadcastLane(hwy::SizeTag /* lane_idx_tag */, - Vec512 v) { - constexpr size_t kLanesPerBlock = 16 / sizeof(T); - constexpr int kBlockIdx = static_cast(kLaneIdx / kLanesPerBlock); - constexpr int kLaneInBlkIdx = - static_cast(kLaneIdx) & (kLanesPerBlock - 1); - return Broadcast(BroadcastBlock(v)); -} - -} // namespace detail - -template -HWY_API Vec512 BroadcastLane(Vec512 v) { - static_assert(0 <= kLaneIdx, "Invalid lane"); - return detail::BroadcastLane(hwy::SizeTag(kLaneIdx)>(), - v); -} - -// ------------------------------ Hard-coded shuffles - -// Notation: let Vec512 have lanes 7,6,5,4,3,2,1,0 (0 is -// least-significant). Shuffle0321 rotates four-lane blocks one lane to the -// right (the previous least-significant lane is now most-significant => -// 47650321). These could also be implemented via CombineShiftRightBytes but -// the shuffle_abcd notation is more convenient. - -// Swap 32-bit halves in 64-bit halves. -template -HWY_API Vec512 Shuffle2301(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)}; -} -HWY_API Vec512 Shuffle2301(const Vec512 v) { - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)}; -} - -namespace detail { - -template -HWY_API Vec512 ShuffleTwo2301(const Vec512 a, const Vec512 b) { - const DFromV d; - const RebindToFloat df; - return BitCast( - d, Vec512{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, - _MM_PERM_CDAB)}); -} -template -HWY_API Vec512 ShuffleTwo1230(const Vec512 a, const Vec512 b) { - const DFromV d; - const RebindToFloat df; - return BitCast( - d, Vec512{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, - _MM_PERM_BCDA)}); -} -template -HWY_API Vec512 ShuffleTwo3012(const Vec512 a, const Vec512 b) { - const DFromV d; - const RebindToFloat df; - return BitCast( - d, Vec512{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, - _MM_PERM_DABC)}); -} - -} // namespace detail - -// Swap 64-bit halves -HWY_API Vec512 Shuffle1032(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; -} -HWY_API Vec512 Shuffle1032(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; -} -HWY_API Vec512 Shuffle1032(const Vec512 v) { - // Shorter encoding than _mm512_permute_ps. - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)}; -} -HWY_API Vec512 Shuffle01(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; -} -HWY_API Vec512 Shuffle01(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)}; -} -HWY_API Vec512 Shuffle01(const Vec512 v) { - // Shorter encoding than _mm512_permute_pd. 
- return Vec512{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)}; -} - -// Rotate right 32 bits -HWY_API Vec512 Shuffle0321(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)}; -} -HWY_API Vec512 Shuffle0321(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)}; -} -HWY_API Vec512 Shuffle0321(const Vec512 v) { - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)}; -} -// Rotate left 32 bits -HWY_API Vec512 Shuffle2103(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)}; -} -HWY_API Vec512 Shuffle2103(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)}; -} -HWY_API Vec512 Shuffle2103(const Vec512 v) { - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)}; -} - -// Reverse -HWY_API Vec512 Shuffle0123(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)}; -} -HWY_API Vec512 Shuffle0123(const Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)}; -} -HWY_API Vec512 Shuffle0123(const Vec512 v) { - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)}; -} - -// ------------------------------ TableLookupLanes - -// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. -template -struct Indices512 { - __m512i raw; -}; - -template , typename TI> -HWY_API Indices512 IndicesFromVec(D /* tag */, Vec512 vec) { - static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); -#if HWY_IS_DEBUG_BUILD - const DFromV di; - const RebindToUnsigned du; - using TU = MakeUnsigned; - const auto vec_u = BitCast(du, vec); - HWY_DASSERT( - AllTrue(du, Lt(vec_u, Set(du, static_cast(128 / sizeof(T)))))); -#endif - return Indices512{vec.raw}; -} - -template -HWY_API Indices512> SetTableIndices(D d, const TI* idx) { - const Rebind di; - return IndicesFromVec(d, LoadU(di, idx)); -} - -template -HWY_API Vec512 TableLookupLanes(Vec512 v, Indices512 idx) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_permutexvar_epi8(idx.raw, v.raw)}; -#else - const DFromV d; - const Repartition du16; - const Vec512 idx_vec{idx.raw}; - - const auto bd_sel_mask = - MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); - const auto cd_sel_mask = - MaskFromVec(BitCast(d, ShiftLeft<2>(BitCast(du16, idx_vec)))); - - const Vec512 v_a{_mm512_shuffle_i32x4(v.raw, v.raw, 0x00)}; - const Vec512 v_b{_mm512_shuffle_i32x4(v.raw, v.raw, 0x55)}; - const Vec512 v_c{_mm512_shuffle_i32x4(v.raw, v.raw, 0xAA)}; - const Vec512 v_d{_mm512_shuffle_i32x4(v.raw, v.raw, 0xFF)}; - - const auto shuf_a = TableLookupBytes(v_a, idx_vec); - const auto shuf_c = TableLookupBytes(v_c, idx_vec); - const Vec512 shuf_ab{_mm512_mask_shuffle_epi8(shuf_a.raw, bd_sel_mask.raw, - v_b.raw, idx_vec.raw)}; - const Vec512 shuf_cd{_mm512_mask_shuffle_epi8(shuf_c.raw, bd_sel_mask.raw, - v_d.raw, idx_vec.raw)}; - return IfThenElse(cd_sel_mask, shuf_cd, shuf_ab); -#endif -} - -template -HWY_API Vec512 TableLookupLanes(Vec512 v, Indices512 idx) { - return Vec512{_mm512_permutexvar_epi16(idx.raw, v.raw)}; -} -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 TableLookupLanes(Vec512 v, - Indices512 idx) { - return Vec512{_mm512_permutexvar_ph(idx.raw, v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API Vec512 TableLookupLanes(Vec512 v, Indices512 idx) { - return Vec512{_mm512_permutexvar_epi32(idx.raw, v.raw)}; -} - -template -HWY_API Vec512 TableLookupLanes(Vec512 v, Indices512 idx) { - return Vec512{_mm512_permutexvar_epi64(idx.raw, v.raw)}; -} - -HWY_API Vec512 TableLookupLanes(Vec512 
v, Indices512 idx) { - return Vec512{_mm512_permutexvar_ps(idx.raw, v.raw)}; -} - -HWY_API Vec512 TableLookupLanes(Vec512 v, - Indices512 idx) { - return Vec512{_mm512_permutexvar_pd(idx.raw, v.raw)}; -} - -template -HWY_API Vec512 TwoTablesLookupLanes(Vec512 a, Vec512 b, - Indices512 idx) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_permutex2var_epi8(a.raw, idx.raw, b.raw)}; -#else - const DFromV d; - const auto b_sel_mask = - MaskFromVec(BitCast(d, ShiftLeft<1>(Vec512{idx.raw}))); - return IfThenElse(b_sel_mask, TableLookupLanes(b, idx), - TableLookupLanes(a, idx)); -#endif -} - -template -HWY_API Vec512 TwoTablesLookupLanes(Vec512 a, Vec512 b, - Indices512 idx) { - return Vec512{_mm512_permutex2var_epi16(a.raw, idx.raw, b.raw)}; -} - -template -HWY_API Vec512 TwoTablesLookupLanes(Vec512 a, Vec512 b, - Indices512 idx) { - return Vec512{_mm512_permutex2var_epi32(a.raw, idx.raw, b.raw)}; -} - -#if HWY_HAVE_FLOAT16 -HWY_API Vec512 TwoTablesLookupLanes(Vec512 a, - Vec512 b, - Indices512 idx) { - return Vec512{_mm512_permutex2var_ph(a.raw, idx.raw, b.raw)}; -} -#endif // HWY_HAVE_FLOAT16 -HWY_API Vec512 TwoTablesLookupLanes(Vec512 a, Vec512 b, - Indices512 idx) { - return Vec512{_mm512_permutex2var_ps(a.raw, idx.raw, b.raw)}; -} - -template -HWY_API Vec512 TwoTablesLookupLanes(Vec512 a, Vec512 b, - Indices512 idx) { - return Vec512{_mm512_permutex2var_epi64(a.raw, idx.raw, b.raw)}; -} - -HWY_API Vec512 TwoTablesLookupLanes(Vec512 a, Vec512 b, - Indices512 idx) { - return Vec512{_mm512_permutex2var_pd(a.raw, idx.raw, b.raw)}; -} - -// ------------------------------ Reverse - -template -HWY_API VFromD Reverse(D d, const VFromD v) { -#if HWY_TARGET <= HWY_AVX3_DL - const RebindToSigned di; - alignas(64) static constexpr int8_t kReverse[64] = { - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, - 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - const Vec512 idx = Load(di, kReverse); - return BitCast( - d, Vec512{_mm512_permutexvar_epi8(idx.raw, BitCast(di, v).raw)}); -#else - const RepartitionToWide d16; - return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v)))); -#endif -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { - const RebindToSigned di; - alignas(64) static constexpr int16_t kReverse[32] = { - 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - const Vec512 idx = Load(di, kReverse); - return BitCast(d, Vec512{ - _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { - alignas(64) static constexpr int32_t kReverse[16] = { - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; - return TableLookupLanes(v, SetTableIndices(d, kReverse)); -} - -template -HWY_API VFromD Reverse(D d, const VFromD v) { - alignas(64) static constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0}; - return TableLookupLanes(v, SetTableIndices(d, kReverse)); -} - -// ------------------------------ Reverse2 (in x86_128) - -// ------------------------------ Reverse4 - -template -HWY_API VFromD Reverse4(D d, const VFromD v) { - const RebindToSigned di; - alignas(64) static constexpr int16_t kReverse4[32] = { - 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, - 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; - const Vec512 idx = Load(di, kReverse4); - return BitCast(d, Vec512{ - 
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); -} - -// 32 bit Reverse4 defined in x86_128. - -template -HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { - return VFromD{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; -} -template -HWY_API VFromD Reverse4(D /* tag */, VFromD v) { - return VFromD{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; -} - -// ------------------------------ Reverse8 - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - const RebindToSigned di; - alignas(64) static constexpr int16_t kReverse8[32] = { - 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, - 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}; - const Vec512 idx = Load(di, kReverse8); - return BitCast(d, Vec512{ - _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); -} - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - const RebindToSigned di; - alignas(64) static constexpr int32_t kReverse8[16] = { - 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; - const Vec512 idx = Load(di, kReverse8); - return BitCast(d, Vec512{ - _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)}); -} - -template -HWY_API VFromD Reverse8(D d, const VFromD v) { - return Reverse(d, v); -} - -// ------------------------------ ReverseBits - -#if HWY_TARGET <= HWY_AVX3_DL -template , 64)> -HWY_API V ReverseBits(V v) { - const Full512 du64; - const auto affine_matrix = Set(du64, 0x8040201008040201u); - return V{_mm512_gf2p8affine_epi64_epi8(v.raw, affine_matrix.raw, 0)}; -} -#endif // HWY_TARGET <= HWY_AVX3_DL - -// ------------------------------ InterleaveLower - -template -HWY_API Vec512 InterleaveLower(Vec512 a, Vec512 b) { - return Vec512{_mm512_unpacklo_epi8(a.raw, b.raw)}; -} -template -HWY_API Vec512 InterleaveLower(Vec512 a, Vec512 b) { - const DFromV d; - const RebindToUnsigned du; - using VU = VFromD; // for float16_t - return BitCast( - d, VU{_mm512_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); -} -template -HWY_API Vec512 InterleaveLower(Vec512 a, Vec512 b) { - return Vec512{_mm512_unpacklo_epi32(a.raw, b.raw)}; -} -template -HWY_API Vec512 InterleaveLower(Vec512 a, Vec512 b) { - return Vec512{_mm512_unpacklo_epi64(a.raw, b.raw)}; -} -HWY_API Vec512 InterleaveLower(Vec512 a, Vec512 b) { - return Vec512{_mm512_unpacklo_ps(a.raw, b.raw)}; -} -HWY_API Vec512 InterleaveLower(Vec512 a, Vec512 b) { - return Vec512{_mm512_unpacklo_pd(a.raw, b.raw)}; -} - -// ------------------------------ InterleaveUpper - -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm512_unpackhi_epi8(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { - const RebindToUnsigned du; - using VU = VFromD; // for float16_t - return BitCast( - d, VU{_mm512_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm512_unpackhi_epi32(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm512_unpackhi_epi64(a.raw, b.raw)}; -} - -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm512_unpackhi_ps(a.raw, b.raw)}; -} -template -HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { - return VFromD{_mm512_unpackhi_pd(a.raw, b.raw)}; -} - -// ------------------------------ Concat* halves - -// hiH,hiL loH,loL |-> hiL,loL (= lower halves) -template -HWY_API VFromD ConcatLowerLower(D /* tag */, 
VFromD hi, VFromD lo) { - return VFromD{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)}; -} -template -HWY_API VFromD ConcatLowerLower(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)}; -} -template -HWY_API Vec512 ConcatLowerLower(D /* tag */, Vec512 hi, - Vec512 lo) { - return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)}; -} - -// hiH,hiL loH,loL |-> hiH,loH (= upper halves) -template -HWY_API VFromD ConcatUpperUpper(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)}; -} -template -HWY_API VFromD ConcatUpperUpper(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)}; -} -template -HWY_API Vec512 ConcatUpperUpper(D /* tag */, Vec512 hi, - Vec512 lo) { - return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)}; -} - -// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) -template -HWY_API VFromD ConcatLowerUpper(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; -} -template -HWY_API VFromD ConcatLowerUpper(D /* tag */, VFromD hi, VFromD lo) { - return VFromD{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; -} -template -HWY_API Vec512 ConcatLowerUpper(D /* tag */, Vec512 hi, - Vec512 lo) { - return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)}; -} - -// hiH,hiL loH,loL |-> hiH,loL (= outer halves) -template -HWY_API VFromD ConcatUpperLower(D /* tag */, VFromD hi, VFromD lo) { - // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks - // are efficiently loaded from 32-bit regs. - const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF); - return VFromD{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)}; -} -template -HWY_API VFromD ConcatUpperLower(D /* tag */, VFromD hi, VFromD lo) { - const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF); - return VFromD{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)}; -} -template -HWY_API Vec512 ConcatUpperLower(D /* tag */, Vec512 hi, - Vec512 lo) { - const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F); - return Vec512{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)}; -} - -// ------------------------------ ConcatOdd - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3_DL - alignas(64) static constexpr uint8_t kIdx[64] = { - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, - 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, - 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, - 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99, 101, 103, - 105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi8( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RepartitionToWide dw; - // Right-shift 8 bits per u16 so we can pack. - const Vec512 uH = ShiftRight<8>(BitCast(dw, hi)); - const Vec512 uL = ShiftRight<8>(BitCast(dw, lo)); - const Vec512 u8{_mm512_packus_epi16(uL.raw, uH.raw)}; - // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes. 
- const Full512 du64; - alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx))); -#endif -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint16_t kIdx[32] = { - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, - 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi16( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint32_t kIdx[16] = { - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi32( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint32_t kIdx[16] = { - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; - return VFromD{_mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi64( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -} - -template -HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15}; - return VFromD{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; -} - -// ------------------------------ ConcatEven - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; -#if HWY_TARGET <= HWY_AVX3_DL - alignas(64) static constexpr uint8_t kIdx[64] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, - 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, - 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, - 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, - 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi8( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -#else - const RepartitionToWide dw; - // Isolate lower 8 bits per u16 so we can pack. - const Vec512 mask = Set(dw, 0x00FF); - const Vec512 uH = And(BitCast(dw, hi), mask); - const Vec512 uL = And(BitCast(dw, lo), mask); - const Vec512 u8{_mm512_packus_epi16(uL.raw, uH.raw)}; - // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes. 
- const Full512 du64; - alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx))); -#endif -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint16_t kIdx[32] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi16( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint32_t kIdx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi32( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint32_t kIdx[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; - return VFromD{_mm512_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)}; -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; - return BitCast( - d, Vec512{_mm512_permutex2var_epi64( - BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)}); -} - -template -HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { - const RebindToUnsigned du; - alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14}; - return VFromD{_mm512_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)}; -} - -// ------------------------------ DupEven (InterleaveLower) - -template -HWY_API Vec512 DupEven(Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)}; -} -HWY_API Vec512 DupEven(Vec512 v) { - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)}; -} - -template -HWY_API Vec512 DupEven(const Vec512 v) { - const DFromV d; - return InterleaveLower(d, v, v); -} - -// ------------------------------ DupOdd (InterleaveUpper) - -template -HWY_API Vec512 DupOdd(Vec512 v) { - return Vec512{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)}; -} -HWY_API Vec512 DupOdd(Vec512 v) { - return Vec512{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)}; -} - -template -HWY_API Vec512 DupOdd(const Vec512 v) { - const DFromV d; - return InterleaveUpper(d, v, v); -} - -// ------------------------------ OddEven (IfThenElse) - -template -HWY_API Vec512 OddEven(const Vec512 a, const Vec512 b) { - constexpr size_t s = sizeof(T); - constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 
48 : 56; - return IfThenElse(Mask512{0x5555555555555555ull >> shift}, b, a); -} - -// ------------------------------ OddEvenBlocks - -template -HWY_API Vec512 OddEvenBlocks(Vec512 odd, Vec512 even) { - return Vec512{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)}; -} - -HWY_API Vec512 OddEvenBlocks(Vec512 odd, Vec512 even) { - return Vec512{ - _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)}; -} - -HWY_API Vec512 OddEvenBlocks(Vec512 odd, Vec512 even) { - return Vec512{ - _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)}; -} - -// ------------------------------ SwapAdjacentBlocks - -template -HWY_API Vec512 SwapAdjacentBlocks(Vec512 v) { - return Vec512{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)}; -} - -HWY_API Vec512 SwapAdjacentBlocks(Vec512 v) { - return Vec512{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)}; -} - -HWY_API Vec512 SwapAdjacentBlocks(Vec512 v) { - return Vec512{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)}; -} - -// ------------------------------ ReverseBlocks - -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return VFromD{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)}; -} -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return VFromD{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)}; -} -template -HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { - return VFromD{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)}; -} - -// ------------------------------ TableLookupBytes (ZeroExtendVector) - -// Both full -template -HWY_API Vec512 TableLookupBytes(Vec512 bytes, Vec512 indices) { - return Vec512{_mm512_shuffle_epi8(bytes.raw, indices.raw)}; -} - -// Partial index vector -template -HWY_API Vec128 TableLookupBytes(Vec512 bytes, Vec128 from) { - const Full512 d512; - const Half d256; - const Half d128; - // First expand to full 128, then 256, then 512. - const Vec128 from_full{from.raw}; - const auto from_512 = - ZeroExtendVector(d512, ZeroExtendVector(d256, from_full)); - const auto tbl_full = TableLookupBytes(bytes, from_512); - // Shrink to 256, then 128, then partial. - return Vec128{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw}; -} -template -HWY_API Vec256 TableLookupBytes(Vec512 bytes, Vec256 from) { - const DFromV dih; - const Twice di; - const auto from_512 = ZeroExtendVector(di, from); - return LowerHalf(dih, TableLookupBytes(bytes, from_512)); -} - -// Partial table vector -template -HWY_API Vec512 TableLookupBytes(Vec128 bytes, Vec512 from) { - const DFromV d512; - const Half d256; - const Half d128; - // First expand to full 128, then 256, then 512. - const Vec128 bytes_full{bytes.raw}; - const auto bytes_512 = - ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full)); - return TableLookupBytes(bytes_512, from); -} -template -HWY_API Vec512 TableLookupBytes(Vec256 bytes, Vec512 from) { - const Full512 d; - return TableLookupBytes(ZeroExtendVector(d, bytes), from); -} - -// Partial both are handled by x86_128/256. 
- -// ------------------------------ I8/U8 Broadcast (TableLookupBytes) - -template -HWY_API Vec512 Broadcast(const Vec512 v) { - static_assert(0 <= kLane && kLane < 16, "Invalid lane"); - return TableLookupBytes(v, Set(Full512(), static_cast(kLane))); -} - -// ------------------------------ Per4LaneBlockShuffle - -namespace detail { - -template -HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, - const uint32_t x2, - const uint32_t x1, - const uint32_t x0) { - return BitCast(d, Vec512{_mm512_set_epi32( - static_cast(x3), static_cast(x2), - static_cast(x1), static_cast(x0), - static_cast(x3), static_cast(x2), - static_cast(x1), static_cast(x0), - static_cast(x3), static_cast(x2), - static_cast(x1), static_cast(x0), - static_cast(x3), static_cast(x2), - static_cast(x1), static_cast(x0))}); -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<4> /*lane_size_tag*/, - hwy::SizeTag<64> /*vect_size_tag*/, V v) { - return V{ - _mm512_shuffle_epi32(v.raw, static_cast<_MM_PERM_ENUM>(kIdx3210 & 0xFF))}; -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<4> /*lane_size_tag*/, - hwy::SizeTag<64> /*vect_size_tag*/, V v) { - return V{_mm512_shuffle_ps(v.raw, v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<8> /*lane_size_tag*/, - hwy::SizeTag<64> /*vect_size_tag*/, V v) { - return V{_mm512_permutex_epi64(v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -template )> -HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, - hwy::SizeTag<8> /*lane_size_tag*/, - hwy::SizeTag<64> /*vect_size_tag*/, V v) { - return V{_mm512_permutex_pd(v.raw, static_cast(kIdx3210 & 0xFF))}; -} - -} // namespace detail - -// ------------------------------ SlideUpLanes - -namespace detail { - -template -HWY_INLINE V CombineShiftRightI32Lanes(V hi, V lo) { - const DFromV d; - const Repartition du32; - return BitCast(d, - Vec512{_mm512_alignr_epi32( - BitCast(du32, hi).raw, BitCast(du32, lo).raw, kI32Lanes)}); -} - -template -HWY_INLINE V CombineShiftRightI64Lanes(V hi, V lo) { - const DFromV d; - const Repartition du64; - return BitCast(d, - Vec512{_mm512_alignr_epi64( - BitCast(du64, hi).raw, BitCast(du64, lo).raw, kI64Lanes)}); -} - -template -HWY_INLINE V SlideUpI32Lanes(V v) { - static_assert(0 <= kI32Lanes && kI32Lanes <= 15, - "kI32Lanes must be between 0 and 15"); - const DFromV d; - return CombineShiftRightI32Lanes<16 - kI32Lanes>(v, Zero(d)); -} - -template -HWY_INLINE V SlideUpI64Lanes(V v) { - static_assert(0 <= kI64Lanes && kI64Lanes <= 7, - "kI64Lanes must be between 0 and 7"); - const DFromV d; - return CombineShiftRightI64Lanes<8 - kI64Lanes>(v, Zero(d)); -} - -template -HWY_INLINE VFromD TableLookupSlideUpLanes(D d, VFromD v, size_t amt) { - const Repartition du8; - -#if HWY_TARGET <= HWY_AVX3_DL - const auto byte_idx = Iota(du8, static_cast(size_t{0} - amt)); - return TwoTablesLookupLanes(v, Zero(d), Indices512>{byte_idx.raw}); -#else - const Repartition du16; - const Repartition du64; - const auto byte_idx = Iota(du8, static_cast(size_t{0} - (amt & 15))); - const auto blk_u64_idx = - Iota(du64, static_cast(uint64_t{0} - ((amt >> 4) << 1))); - - const VFromD even_blocks{ - _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; - const VFromD odd_blocks{ - _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(3, 1, 1, 3))}; - const auto odd_sel_mask = - MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, 
byte_idx)))); - const auto even_blk_lookup_result = - BitCast(d, TableLookupBytes(even_blocks, byte_idx)); - const VFromD blockwise_slide_up_result{ - _mm512_mask_shuffle_epi8(even_blk_lookup_result.raw, odd_sel_mask.raw, - odd_blocks.raw, byte_idx.raw)}; - return BitCast(d, TwoTablesLookupLanes( - BitCast(du64, blockwise_slide_up_result), Zero(du64), - Indices512{blk_u64_idx.raw})); -#endif -} - -} // namespace detail - -template -HWY_API VFromD SlideUpBlocks(D d, VFromD v) { - static_assert(0 <= kBlocks && kBlocks <= 3, - "kBlocks must be between 0 and 3"); - switch (kBlocks) { - case 0: - return v; - case 1: - return detail::SlideUpI64Lanes<2>(v); - case 2: - return ConcatLowerLower(d, v, Zero(d)); - case 3: - return detail::SlideUpI64Lanes<6>(v); - } - - return v; -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return detail::SlideUpI32Lanes<1>(v); - case 2: - return detail::SlideUpI64Lanes<1>(v); - case 3: - return detail::SlideUpI32Lanes<3>(v); - case 4: - return detail::SlideUpI64Lanes<2>(v); - case 5: - return detail::SlideUpI32Lanes<5>(v); - case 6: - return detail::SlideUpI64Lanes<3>(v); - case 7: - return detail::SlideUpI32Lanes<7>(v); - case 8: - return ConcatLowerLower(d, v, Zero(d)); - case 9: - return detail::SlideUpI32Lanes<9>(v); - case 10: - return detail::SlideUpI64Lanes<5>(v); - case 11: - return detail::SlideUpI32Lanes<11>(v); - case 12: - return detail::SlideUpI64Lanes<6>(v); - case 13: - return detail::SlideUpI32Lanes<13>(v); - case 14: - return detail::SlideUpI64Lanes<7>(v); - case 15: - return detail::SlideUpI32Lanes<15>(v); - } - } -#endif - - return detail::TableLookupSlideUpLanes(d, v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - switch (amt) { - case 0: - return v; - case 1: - return detail::SlideUpI64Lanes<1>(v); - case 2: - return detail::SlideUpI64Lanes<2>(v); - case 3: - return detail::SlideUpI64Lanes<3>(v); - case 4: - return ConcatLowerLower(d, v, Zero(d)); - case 5: - return detail::SlideUpI64Lanes<5>(v); - case 6: - return detail::SlideUpI64Lanes<6>(v); - case 7: - return detail::SlideUpI64Lanes<7>(v); - } - } -#endif - - return detail::TableLookupSlideUpLanes(d, v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - if ((amt & 3) == 0) { - const Repartition du32; - return BitCast(d, SlideUpLanes(du32, BitCast(du32, v), amt >> 2)); - } else if ((amt & 1) == 0) { - const Repartition du16; - return BitCast( - d, detail::TableLookupSlideUpLanes(du16, BitCast(du16, v), amt >> 1)); - } -#if HWY_TARGET > HWY_AVX3_DL - else if (amt <= 63) { // NOLINT(readability/braces) - const Repartition du64; - const size_t blk_u64_slideup_amt = (amt >> 4) << 1; - const auto vu64 = BitCast(du64, v); - const auto v_hi = - BitCast(d, SlideUpLanes(du64, vu64, blk_u64_slideup_amt)); - const auto v_lo = - (blk_u64_slideup_amt <= 4) - ? 
BitCast(d, SlideUpLanes(du64, vu64, blk_u64_slideup_amt + 2)) - : Zero(d); - switch (amt & 15) { - case 1: - return CombineShiftRightBytes<15>(d, v_hi, v_lo); - case 3: - return CombineShiftRightBytes<13>(d, v_hi, v_lo); - case 5: - return CombineShiftRightBytes<11>(d, v_hi, v_lo); - case 7: - return CombineShiftRightBytes<9>(d, v_hi, v_lo); - case 9: - return CombineShiftRightBytes<7>(d, v_hi, v_lo); - case 11: - return CombineShiftRightBytes<5>(d, v_hi, v_lo); - case 13: - return CombineShiftRightBytes<3>(d, v_hi, v_lo); - case 15: - return CombineShiftRightBytes<1>(d, v_hi, v_lo); - } - } -#endif // HWY_TARGET > HWY_AVX3_DL - } -#endif - - return detail::TableLookupSlideUpLanes(d, v, amt); -} - -template -HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt) && (amt & 1) == 0) { - const Repartition du32; - return BitCast(d, SlideUpLanes(du32, BitCast(du32, v), amt >> 1)); - } -#endif - - return detail::TableLookupSlideUpLanes(d, v, amt); -} - -// ------------------------------ Slide1Up - -template -HWY_API VFromD Slide1Up(D d, VFromD v) { -#if HWY_TARGET <= HWY_AVX3_DL - return detail::TableLookupSlideUpLanes(d, v, 1); -#else - const auto v_lo = detail::SlideUpI64Lanes<2>(v); - return CombineShiftRightBytes<15>(d, v, v_lo); -#endif -} - -template -HWY_API VFromD Slide1Up(D d, VFromD v) { - return detail::TableLookupSlideUpLanes(d, v, 1); -} - -template -HWY_API VFromD Slide1Up(D /*d*/, VFromD v) { - return detail::SlideUpI32Lanes<1>(v); -} - -template -HWY_API VFromD Slide1Up(D /*d*/, VFromD v) { - return detail::SlideUpI64Lanes<1>(v); -} - -// ------------------------------ SlideDownLanes - -namespace detail { - -template -HWY_INLINE V SlideDownI32Lanes(V v) { - static_assert(0 <= kI32Lanes && kI32Lanes <= 15, - "kI32Lanes must be between 0 and 15"); - const DFromV d; - return CombineShiftRightI32Lanes(Zero(d), v); -} - -template -HWY_INLINE V SlideDownI64Lanes(V v) { - static_assert(0 <= kI64Lanes && kI64Lanes <= 7, - "kI64Lanes must be between 0 and 7"); - const DFromV d; - return CombineShiftRightI64Lanes(Zero(d), v); -} - -template -HWY_INLINE VFromD TableLookupSlideDownLanes(D d, VFromD v, size_t amt) { - const Repartition du8; - -#if HWY_TARGET <= HWY_AVX3_DL - auto byte_idx = Iota(du8, static_cast(amt)); - return TwoTablesLookupLanes(v, Zero(d), Indices512>{byte_idx.raw}); -#else - const Repartition du16; - const Repartition du64; - const auto byte_idx = Iota(du8, static_cast(amt & 15)); - const auto blk_u64_idx = Iota(du64, static_cast(((amt >> 4) << 1))); - - const VFromD even_blocks{ - _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(0, 2, 2, 0))}; - const VFromD odd_blocks{ - _mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; - const auto odd_sel_mask = - MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, byte_idx)))); - const VFromD even_blk_lookup_result{ - _mm512_maskz_shuffle_epi8(static_cast<__mmask64>(0x0000FFFFFFFFFFFFULL), - even_blocks.raw, byte_idx.raw)}; - const VFromD blockwise_slide_up_result{ - _mm512_mask_shuffle_epi8(even_blk_lookup_result.raw, odd_sel_mask.raw, - odd_blocks.raw, byte_idx.raw)}; - return BitCast(d, TwoTablesLookupLanes( - BitCast(du64, blockwise_slide_up_result), Zero(du64), - Indices512{blk_u64_idx.raw})); -#endif -} - -} // namespace detail - -template -HWY_API VFromD SlideDownBlocks(D d, VFromD v) { - static_assert(0 <= kBlocks && kBlocks <= 3, - "kBlocks must be between 0 and 3"); - const Half dh; - switch (kBlocks) { - case 0: - 
return v; - case 1: - return detail::SlideDownI64Lanes<2>(v); - case 2: - return ZeroExtendVector(d, UpperHalf(dh, v)); - case 3: - return detail::SlideDownI64Lanes<6>(v); - } - - return v; -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - const Half dh; - switch (amt) { - case 1: - return detail::SlideDownI32Lanes<1>(v); - case 2: - return detail::SlideDownI64Lanes<1>(v); - case 3: - return detail::SlideDownI32Lanes<3>(v); - case 4: - return detail::SlideDownI64Lanes<2>(v); - case 5: - return detail::SlideDownI32Lanes<5>(v); - case 6: - return detail::SlideDownI64Lanes<3>(v); - case 7: - return detail::SlideDownI32Lanes<7>(v); - case 8: - return ZeroExtendVector(d, UpperHalf(dh, v)); - case 9: - return detail::SlideDownI32Lanes<9>(v); - case 10: - return detail::SlideDownI64Lanes<5>(v); - case 11: - return detail::SlideDownI32Lanes<11>(v); - case 12: - return detail::SlideDownI64Lanes<6>(v); - case 13: - return detail::SlideDownI32Lanes<13>(v); - case 14: - return detail::SlideDownI64Lanes<7>(v); - case 15: - return detail::SlideDownI32Lanes<15>(v); - } - } -#endif - - return detail::TableLookupSlideDownLanes(d, v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - const Half dh; - switch (amt) { - case 0: - return v; - case 1: - return detail::SlideDownI64Lanes<1>(v); - case 2: - return detail::SlideDownI64Lanes<2>(v); - case 3: - return detail::SlideDownI64Lanes<3>(v); - case 4: - return ZeroExtendVector(d, UpperHalf(dh, v)); - case 5: - return detail::SlideDownI64Lanes<5>(v); - case 6: - return detail::SlideDownI64Lanes<6>(v); - case 7: - return detail::SlideDownI64Lanes<7>(v); - } - } -#endif - - return detail::TableLookupSlideDownLanes(d, v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt)) { - if ((amt & 3) == 0) { - const Repartition du32; - return BitCast(d, SlideDownLanes(du32, BitCast(du32, v), amt >> 2)); - } else if ((amt & 1) == 0) { - const Repartition du16; - return BitCast(d, detail::TableLookupSlideDownLanes( - du16, BitCast(du16, v), amt >> 1)); - } -#if HWY_TARGET > HWY_AVX3_DL - else if (amt <= 63) { // NOLINT(readability/braces) - const Repartition du64; - const size_t blk_u64_slidedown_amt = (amt >> 4) << 1; - const auto vu64 = BitCast(du64, v); - const auto v_lo = - BitCast(d, SlideDownLanes(du64, vu64, blk_u64_slidedown_amt)); - const auto v_hi = - (blk_u64_slidedown_amt <= 4) - ? 
BitCast(d, - SlideDownLanes(du64, vu64, blk_u64_slidedown_amt + 2)) - : Zero(d); - switch (amt & 15) { - case 1: - return CombineShiftRightBytes<1>(d, v_hi, v_lo); - case 3: - return CombineShiftRightBytes<3>(d, v_hi, v_lo); - case 5: - return CombineShiftRightBytes<5>(d, v_hi, v_lo); - case 7: - return CombineShiftRightBytes<7>(d, v_hi, v_lo); - case 9: - return CombineShiftRightBytes<9>(d, v_hi, v_lo); - case 11: - return CombineShiftRightBytes<11>(d, v_hi, v_lo); - case 13: - return CombineShiftRightBytes<13>(d, v_hi, v_lo); - case 15: - return CombineShiftRightBytes<15>(d, v_hi, v_lo); - } - } -#endif - } -#endif - - return detail::TableLookupSlideDownLanes(d, v, amt); -} - -template -HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { -#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang - if (__builtin_constant_p(amt) && (amt & 1) == 0) { - const Repartition du32; - return BitCast(d, SlideDownLanes(du32, BitCast(du32, v), amt >> 1)); - } -#endif - - return detail::TableLookupSlideDownLanes(d, v, amt); -} - -// ------------------------------ Slide1Down - -template -HWY_API VFromD Slide1Down(D d, VFromD v) { -#if HWY_TARGET <= HWY_AVX3_DL - return detail::TableLookupSlideDownLanes(d, v, 1); -#else - const auto v_hi = detail::SlideDownI64Lanes<2>(v); - return CombineShiftRightBytes<1>(d, v_hi, v); -#endif -} - -template -HWY_API VFromD Slide1Down(D d, VFromD v) { - return detail::TableLookupSlideDownLanes(d, v, 1); -} - -template -HWY_API VFromD Slide1Down(D /*d*/, VFromD v) { - return detail::SlideDownI32Lanes<1>(v); -} - -template -HWY_API VFromD Slide1Down(D /*d*/, VFromD v) { - return detail::SlideDownI64Lanes<1>(v); -} - -// ================================================== CONVERT - -// ------------------------------ Promotions (part w/ narrow lanes -> full) - -// Unsigned: zero-extend. -// Note: these have 3 cycle latency; if inputs are already split across the -// 128 bit blocks (in their upper/lower halves), then Zip* would be faster. -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtepu8_epi16(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm512_cvtepu8_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtepu16_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtepu32_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm512_cvtepu16_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec64 v) { - return VFromD{_mm512_cvtepu8_epi64(v.raw)}; -} - -// Signed: replicate sign bit. -// Note: these have 3 cycle latency; if inputs are already split across the -// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by -// signed shift would be faster. 
-template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtepi8_epi16(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm512_cvtepi8_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtepi16_epi32(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtepi32_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { - return VFromD{_mm512_cvtepi16_epi64(v.raw)}; -} -template -HWY_API VFromD PromoteTo(D /* tag */, Vec64 v) { - return VFromD{_mm512_cvtepi8_epi64(v.raw)}; -} - -// Float -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { -#if HWY_HAVE_FLOAT16 - const RebindToUnsigned> du16; - return VFromD{_mm512_cvtph_ps(BitCast(du16, v).raw)}; -#else - return VFromD{_mm512_cvtph_ps(v.raw)}; -#endif // HWY_HAVE_FLOAT16 -} - -template -HWY_API VFromD PromoteTo(D df32, Vec256 v) { - const Rebind du16; - const RebindToSigned di32; - return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); -} - -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtps_pd(v.raw)}; -} - -template -HWY_API VFromD PromoteTo(D /* tag */, Vec256 v) { - return VFromD{_mm512_cvtepi32_pd(v.raw)}; -} - -// ------------------------------ Demotions (full -> part w/ narrow lanes) - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const Full512 du64; - const Vec512 u16{_mm512_packus_epi32(v.raw, v.raw)}; - - // Compress even u64 lanes into 256 bit. - alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; - const auto idx64 = Load(du64, kLanes); - const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)}; - return LowerHalf(even); -} - -template -HWY_API VFromD DemoteTo(D dn, Vec512 v) { - const DFromV d; - const RebindToSigned di; - return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const Full512 du64; - const Vec512 i16{_mm512_packs_epi32(v.raw, v.raw)}; - - // Compress even u64 lanes into 256 bit. - alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; - const auto idx64 = Load(du64, kLanes); - const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)}; - return LowerHalf(even); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const Full512 du32; - const Vec512 i16{_mm512_packs_epi32(v.raw, v.raw)}; - const Vec512 u8{_mm512_packus_epi16(i16.raw, i16.raw)}; - - alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12}; - const auto idx32 = LoadDup128(du32, kLanes); - const Vec512 fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)}; - return LowerHalf(LowerHalf(fixed)); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtusepi32_epi8(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const Full512 du64; - const Vec512 u8{_mm512_packus_epi16(v.raw, v.raw)}; - - // Compress even u64 lanes into 256 bit. 
- alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; - const auto idx64 = Load(du64, kLanes); - const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)}; - return LowerHalf(even); -} - -template -HWY_API VFromD DemoteTo(D dn, Vec512 v) { - const DFromV d; - const RebindToSigned di; - return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFu)))); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const Full512 du32; - const Vec512 i16{_mm512_packs_epi32(v.raw, v.raw)}; - const Vec512 i8{_mm512_packs_epi16(i16.raw, i16.raw)}; - - alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12, - 0, 4, 8, 12, 0, 4, 8, 12}; - const auto idx32 = LoadDup128(du32, kLanes); - const Vec512 fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)}; - return LowerHalf(LowerHalf(fixed)); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const Full512 du64; - const Vec512 u8{_mm512_packs_epi16(v.raw, v.raw)}; - - // Compress even u64 lanes into 256 bit. - alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6}; - const auto idx64 = Load(du64, kLanes); - const Vec512 even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)}; - return LowerHalf(even); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtsepi64_epi32(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtsepi64_epi16(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtsepi64_epi8(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm512_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm512_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const auto neg_mask = MaskFromVec(v); -#if HWY_COMPILER_HAS_MASK_INTRINSICS - const __mmask8 non_neg_mask = _knot_mask8(neg_mask.raw); -#else - const __mmask8 non_neg_mask = static_cast<__mmask8>(~neg_mask.raw); -#endif - return VFromD{_mm512_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtusepi64_epi32(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtusepi64_epi16(v.raw)}; -} -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtusepi64_epi8(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - // Work around warnings in the intrinsic definitions (passing -1 as a mask). - HWY_DIAGNOSTICS(push) - HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") - return VFromD{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}; - HWY_DIAGNOSTICS(pop) -} - -template -HWY_API VFromD DemoteTo(D dbf16, Vec512 v) { - // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16. 
- const Rebind di32; - const Rebind du32; // for logical shift right - const Rebind du16; - const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); - return BitCast(dbf16, DemoteTo(du16, bits_in_32)); -} - -template -HWY_API VFromD ReorderDemote2To(D dbf16, Vec512 a, Vec512 b) { - // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16. - const RebindToUnsigned du16; - const Repartition du32; - const Vec512 b_in_even = ShiftRight<16>(BitCast(du32, b)); - return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); -} - -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec512 a, - Vec512 b) { - return VFromD{_mm512_packs_epi32(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec512 a, - Vec512 b) { - return VFromD{_mm512_packus_epi32(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec512 a, - Vec512 b) { - const DFromV du32; - const RebindToSigned di32; - const auto max_i32 = Set(du32, 0x7FFFFFFFu); - - return ReorderDemote2To(dn, BitCast(di32, Min(a, max_i32)), - BitCast(di32, Min(b, max_i32))); -} - -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec512 a, - Vec512 b) { - return VFromD{_mm512_packs_epi16(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D /* tag */, Vec512 a, - Vec512 b) { - return VFromD{_mm512_packus_epi16(a.raw, b.raw)}; -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec512 a, - Vec512 b) { - const DFromV du16; - const RebindToSigned di16; - const auto max_i16 = Set(du16, 0x7FFFu); - - return ReorderDemote2To(dn, BitCast(di16, Min(a, max_i16)), - BitCast(di16, Min(b, max_i16))); -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec512 a, Vec512 b) { - const Half dnh; - return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a)); -} - -template -HWY_API VFromD ReorderDemote2To(D dn, Vec512 a, - Vec512 b) { - const Half dnh; - return Combine(dn, DemoteTo(dnh, b), DemoteTo(dnh, a)); -} - -template ), - HWY_IF_V_SIZE_D(D, 64), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2), - HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4))> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - const Full512 du64; - alignas(64) static constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - return BitCast(d, TableLookupLanes(BitCast(du64, ReorderDemote2To(d, a, b)), - SetTableIndices(du64, kIdx))); -} - -template ), - HWY_IF_V_SIZE_GT_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), - HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), - HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2), - HWY_IF_T_SIZE_V(V, 8)> -HWY_API VFromD OrderedDemote2To(D d, V a, V b) { - return ReorderDemote2To(d, a, b); -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtpd_ps(v.raw)}; -} - -template -HWY_API VFromD DemoteTo(D /* tag */, Vec512 v) { - const Full512 d64; - const auto clamped = detail::ClampF64ToI32Max(d64, v); - return VFromD{_mm512_cvttpd_epi32(clamped.raw)}; -} - -// For already range-limited input [0, 255]. -HWY_API Vec128 U8FromU32(const Vec512 v) { - const DFromV d32; - // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the - // lowest 4 bytes. - alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u, - ~0u}; - const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32)); - // Gather the lowest 4 bytes of 4 128-bit blocks. 
- alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12}; - const Vec512 bytes{ - _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)}; - return LowerHalf(LowerHalf(bytes)); -} - -// ------------------------------ Truncations - -template -HWY_API VFromD TruncateTo(D d, const Vec512 v) { -#if HWY_TARGET <= HWY_AVX3_DL - (void)d; - const Full512 d8; - alignas(16) static constexpr uint8_t k8From64[16] = { - 0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56}; - const Vec512 bytes{ - _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)}; - return LowerHalf(LowerHalf(LowerHalf(bytes))); -#else - const Full512 d32; - alignas(64) static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14, - 0, 2, 4, 6, 8, 10, 12, 14}; - const Vec512 even{ - _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)}; - return TruncateTo(d, LowerHalf(even)); -#endif -} - -template -HWY_API VFromD TruncateTo(D /* tag */, const Vec512 v) { - const Full512 d16; - alignas(16) static constexpr uint16_t k16From64[8] = {0, 4, 8, 12, - 16, 20, 24, 28}; - const Vec512 bytes{ - _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)}; - return LowerHalf(LowerHalf(bytes)); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, const Vec512 v) { - const Full512 d32; - alignas(64) static constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14, - 0, 2, 4, 6, 8, 10, 12, 14}; - const Vec512 even{ - _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)}; - return LowerHalf(even); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, const Vec512 v) { -#if HWY_TARGET <= HWY_AVX3_DL - const Full512 d8; - alignas(16) static constexpr uint8_t k8From32[16] = { - 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}; - const Vec512 bytes{ - _mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)}; -#else - const Full512 d32; - // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the - // lowest 4 bytes. - alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u, - ~0u}; - const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32)); - // Gather the lowest 4 bytes of 4 128-bit blocks. 
- alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12}; - const Vec512 bytes{ - _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)}; -#endif - return LowerHalf(LowerHalf(bytes)); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, const Vec512 v) { - const Full512 d16; - alignas(64) static constexpr uint16_t k16From32[32] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}; - const Vec512 bytes{ - _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)}; - return LowerHalf(bytes); -} - -template -HWY_API VFromD TruncateTo(D /* tag */, const Vec512 v) { -#if HWY_TARGET <= HWY_AVX3_DL - const Full512 d8; - alignas(64) static constexpr uint8_t k8From16[64] = { - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}; - const Vec512 bytes{ - _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)}; -#else - const Full512 d32; - alignas(16) static constexpr uint32_t k16From32[4] = { - 0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u}; - const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32)); - alignas(64) static constexpr uint32_t kIndex32[16] = { - 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13}; - const Vec512 bytes{ - _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)}; -#endif - return LowerHalf(bytes); -} - -// ------------------------------ Convert integer <=> floating point - -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD ConvertTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtepu16_ph(v.raw)}; -} -template -HWY_API VFromD ConvertTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtepi16_ph(v.raw)}; -} -#endif // HWY_HAVE_FLOAT16 - -template -HWY_API VFromD ConvertTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtepi32_ps(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D /* tag */, Vec512 v) { - return VFromD{_mm512_cvtepi64_pd(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D /* tag*/, Vec512 v) { - return VFromD{_mm512_cvtepu32_ps(v.raw)}; -} - -template -HWY_API VFromD ConvertTo(D /* tag*/, Vec512 v) { - return VFromD{_mm512_cvtepu64_pd(v.raw)}; -} - -// Truncates (rounds toward zero). 
-#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD ConvertTo(D d, Vec512 v) { - return detail::FixConversionOverflow(d, v, - VFromD{_mm512_cvttph_epi16(v.raw)}); -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API VFromD ConvertTo(D d, Vec512 v) { - return detail::FixConversionOverflow(d, v, - VFromD{_mm512_cvttps_epi32(v.raw)}); -} -template -HWY_API VFromD ConvertTo(D di, Vec512 v) { - return detail::FixConversionOverflow(di, v, - VFromD{_mm512_cvttpd_epi64(v.raw)}); -} - -HWY_API Vec512 NearestInt(const Vec512 v) { - const Full512 di; - return detail::FixConversionOverflow( - di, v, Vec512{_mm512_cvtps_epi32(v.raw)}); -} - -// ================================================== CRYPTO - -#if !defined(HWY_DISABLE_PCLMUL_AES) - -HWY_API Vec512 AESRound(Vec512 state, - Vec512 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_aesenc_epi128(state.raw, round_key.raw)}; -#else - const DFromV d; - const Half d2; - return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESRound(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -HWY_API Vec512 AESLastRound(Vec512 state, - Vec512 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_aesenclast_epi128(state.raw, round_key.raw)}; -#else - const DFromV d; - const Half d2; - return Combine(d, - AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESLastRound(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -HWY_API Vec512 AESRoundInv(Vec512 state, - Vec512 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_aesdec_epi128(state.raw, round_key.raw)}; -#else - const Full512 d; - const Half d2; - return Combine(d, AESRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESRoundInv(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -HWY_API Vec512 AESLastRoundInv(Vec512 state, - Vec512 round_key) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_aesdeclast_epi128(state.raw, round_key.raw)}; -#else - const Full512 d; - const Half d2; - return Combine( - d, AESLastRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), - AESLastRoundInv(LowerHalf(state), LowerHalf(round_key))); -#endif -} - -template -HWY_API Vec512 AESKeyGenAssist(Vec512 v) { - const Full512 d; -#if HWY_TARGET <= HWY_AVX3_DL - alignas(16) static constexpr uint8_t kRconXorMask[16] = { - 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; - alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { - 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; - const Repartition du32; - const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); - const auto sub_word_result = AESLastRound(w13, LoadDup128(d, kRconXorMask)); - return TableLookupBytes(sub_word_result, LoadDup128(d, kRotWordShuffle)); -#else - const Half d2; - return Combine(d, AESKeyGenAssist(UpperHalf(d2, v)), - AESKeyGenAssist(LowerHalf(v))); -#endif -} - -HWY_API Vec512 CLMulLower(Vec512 va, Vec512 vb) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)}; -#else - alignas(64) uint64_t a[8]; - alignas(64) uint64_t b[8]; - const DFromV d; - const Half> d128; - Store(va, d, a); - Store(vb, d, b); - for (size_t i = 0; i < 8; i += 2) { - const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i)); - Store(mul, d128, a + i); - } - return Load(d, a); -#endif -} - -HWY_API Vec512 CLMulUpper(Vec512 va, Vec512 vb) { -#if HWY_TARGET <= HWY_AVX3_DL - return Vec512{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)}; -#else - alignas(64) uint64_t a[8]; - alignas(64) uint64_t b[8]; - const 
DFromV d; - const Half> d128; - Store(va, d, a); - Store(vb, d, b); - for (size_t i = 0; i < 8; i += 2) { - const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i)); - Store(mul, d128, a + i); - } - return Load(d, a); -#endif -} - -#endif // HWY_DISABLE_PCLMUL_AES - -// ================================================== MISC - -// ------------------------------ I32/I64 SaturatedAdd (MaskFromVec) - -HWY_API Vec512 SaturatedAdd(Vec512 a, Vec512 b) { - const DFromV d; - const auto sum = a + b; - const auto overflow_mask = MaskFromVec( - Vec512{_mm512_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); - const auto i32_max = Set(d, LimitsMax()); - const Vec512 overflow_result{_mm512_mask_ternarylogic_epi32( - i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, sum); -} - -HWY_API Vec512 SaturatedAdd(Vec512 a, Vec512 b) { - const DFromV d; - const auto sum = a + b; - const auto overflow_mask = MaskFromVec( - Vec512{_mm512_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); - const auto i64_max = Set(d, LimitsMax()); - const Vec512 overflow_result{_mm512_mask_ternarylogic_epi64( - i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, sum); -} - -// ------------------------------ I32/I64 SaturatedSub (MaskFromVec) - -HWY_API Vec512 SaturatedSub(Vec512 a, Vec512 b) { - const DFromV d; - const auto diff = a - b; - const auto overflow_mask = MaskFromVec( - Vec512{_mm512_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); - const auto i32_max = Set(d, LimitsMax()); - const Vec512 overflow_result{_mm512_mask_ternarylogic_epi32( - i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, diff); -} - -HWY_API Vec512 SaturatedSub(Vec512 a, Vec512 b) { - const DFromV d; - const auto diff = a - b; - const auto overflow_mask = MaskFromVec( - Vec512{_mm512_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); - const auto i64_max = Set(d, LimitsMax()); - const Vec512 overflow_result{_mm512_mask_ternarylogic_epi64( - i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; - return IfThenElse(overflow_mask, overflow_result, diff); -} - -// ------------------------------ Mask testing - -// Beware: the suffix indicates the number of mask bits, not lane size! 
- -namespace detail { - -template -HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestz_mask64_u8(mask.raw, mask.raw); -#else - return mask.raw == 0; -#endif -} -template -HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestz_mask32_u8(mask.raw, mask.raw); -#else - return mask.raw == 0; -#endif -} -template -HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestz_mask16_u8(mask.raw, mask.raw); -#else - return mask.raw == 0; -#endif -} -template -HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestz_mask8_u8(mask.raw, mask.raw); -#else - return mask.raw == 0; -#endif -} - -} // namespace detail - -template -HWY_API bool AllFalse(D /* tag */, const MFromD mask) { - return detail::AllFalse(hwy::SizeTag)>(), mask); -} - -namespace detail { - -template -HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestc_mask64_u8(mask.raw, mask.raw); -#else - return mask.raw == 0xFFFFFFFFFFFFFFFFull; -#endif -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestc_mask32_u8(mask.raw, mask.raw); -#else - return mask.raw == 0xFFFFFFFFull; -#endif -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestc_mask16_u8(mask.raw, mask.raw); -#else - return mask.raw == 0xFFFFull; -#endif -} -template -HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512 mask) { -#if HWY_COMPILER_HAS_MASK_INTRINSICS - return _kortestc_mask8_u8(mask.raw, mask.raw); -#else - return mask.raw == 0xFFull; -#endif -} - -} // namespace detail - -template -HWY_API bool AllTrue(D /* tag */, const MFromD mask) { - return detail::AllTrue(hwy::SizeTag)>(), mask); -} - -// `p` points to at least 8 readable bytes, not all of which need be valid. -template -HWY_API MFromD LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) { - MFromD mask; - CopyBytes<8 / sizeof(TFromD)>(bits, &mask.raw); - // N >= 8 (= 512 / 64), so no need to mask invalid bits. - return mask; -} - -// `p` points to at least 8 writable bytes. -template -HWY_API size_t StoreMaskBits(D /* tag */, MFromD mask, uint8_t* bits) { - const size_t kNumBytes = 8 / sizeof(TFromD); - CopyBytes(&mask.raw, bits); - // N >= 8 (= 512 / 64), so no need to mask invalid bits. - return kNumBytes; -} - -template -HWY_API size_t CountTrue(D /* tag */, const MFromD mask) { - return PopCount(static_cast(mask.raw)); -} - -template -HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - return Num0BitsBelowLS1Bit_Nonzero32(mask.raw); -} - -template -HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { - return Num0BitsBelowLS1Bit_Nonzero64(mask.raw); -} - -template -HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { - return mask.raw ? static_cast(FindKnownFirstTrue(d, mask)) - : intptr_t{-1}; -} - -template -HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask.raw); -} - -template -HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { - return 63 - Num0BitsAboveMS1Bit_Nonzero64(mask.raw); -} - -template -HWY_API intptr_t FindLastTrue(D d, MFromD mask) { - return mask.raw ? 
static_cast(FindKnownLastTrue(d, mask)) - : intptr_t{-1}; -} - -// ------------------------------ Compress - -// Always implement 8-bit here even if we lack VBMI2 because we can do better -// than generic_ops (8 at a time) via the native 32-bit compress (16 at a time). -#ifdef HWY_NATIVE_COMPRESS8 -#undef HWY_NATIVE_COMPRESS8 -#else -#define HWY_NATIVE_COMPRESS8 -#endif - -namespace detail { - -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 -template -HWY_INLINE Vec128 NativeCompress(const Vec128 v, - const Mask128 mask) { - return Vec128{_mm_maskz_compress_epi8(mask.raw, v.raw)}; -} -HWY_INLINE Vec256 NativeCompress(const Vec256 v, - const Mask256 mask) { - return Vec256{_mm256_maskz_compress_epi8(mask.raw, v.raw)}; -} -HWY_INLINE Vec512 NativeCompress(const Vec512 v, - const Mask512 mask) { - return Vec512{_mm512_maskz_compress_epi8(mask.raw, v.raw)}; -} - -template -HWY_INLINE Vec128 NativeCompress(const Vec128 v, - const Mask128 mask) { - return Vec128{_mm_maskz_compress_epi16(mask.raw, v.raw)}; -} -HWY_INLINE Vec256 NativeCompress(const Vec256 v, - const Mask256 mask) { - return Vec256{_mm256_maskz_compress_epi16(mask.raw, v.raw)}; -} -HWY_INLINE Vec512 NativeCompress(const Vec512 v, - const Mask512 mask) { - return Vec512{_mm512_maskz_compress_epi16(mask.raw, v.raw)}; -} - -// Slow on Zen4, do not even define these to prevent accidental usage. -#if HWY_TARGET != HWY_AVX3_ZEN4 - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint8_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint8_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint8_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi8(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint16_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint16_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint16_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, v.raw); -} - -#endif // HWY_TARGET != HWY_AVX3_ZEN4 - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi8(mask.raw, v.raw)}; -} - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi16(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint8_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi8(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint16_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi16(mask.raw, unaligned)}; -} - -#endif // HWY_TARGET <= HWY_AVX3_DL - -template -HWY_INLINE Vec128 NativeCompress(Vec128 v, - Mask128 mask) { - return Vec128{_mm_maskz_compress_epi32(mask.raw, v.raw)}; -} -HWY_INLINE Vec256 NativeCompress(Vec256 v, - Mask256 mask) { - return Vec256{_mm256_maskz_compress_epi32(mask.raw, v.raw)}; -} -HWY_INLINE Vec512 NativeCompress(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_compress_epi32(mask.raw, v.raw)}; -} -// We use table-based compress for 64-bit 
lanes, see CompressIsPartition. - -// Slow on Zen4, do not even define these to prevent accidental usage. -#if HWY_TARGET != HWY_AVX3_ZEN4 - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint32_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint32_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint32_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - uint64_t* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - uint64_t* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - uint64_t* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, Mask128 mask, - float* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - float* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - float* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw); -} - -template -HWY_INLINE void NativeCompressStore(Vec128 v, - Mask128 mask, - double* HWY_RESTRICT unaligned) { - _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec256 v, Mask256 mask, - double* HWY_RESTRICT unaligned) { - _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); -} -HWY_INLINE void NativeCompressStore(Vec512 v, Mask512 mask, - double* HWY_RESTRICT unaligned) { - _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw); -} - -#endif // HWY_TARGET != HWY_AVX3_ZEN4 - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi32(mask.raw, v.raw)}; -} - -HWY_INLINE Vec512 NativeExpand(Vec512 v, - Mask512 mask) { - return Vec512{_mm512_maskz_expand_epi64(mask.raw, v.raw)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint32_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi32(mask.raw, unaligned)}; -} - -template -HWY_INLINE VFromD NativeLoadExpand(Mask512 mask, D /* d */, - const uint64_t* HWY_RESTRICT unaligned) { - return VFromD{_mm512_maskz_expandloadu_epi64(mask.raw, unaligned)}; -} - -// For u8x16 and <= u16x16 we can avoid store+load for Compress because there is -// only a single compressed vector (u32x16). Other EmuCompress are implemented -// after the EmuCompressStore they build upon. -template -HWY_INLINE Vec128 EmuCompress(Vec128 v, - Mask128 mask) { - const DFromV d; - const Rebind d32; - const VFromD v0 = PromoteTo(d32, v); - - const uint64_t mask_bits{mask.raw}; - // Mask type is __mmask16 if v is full 128, else __mmask8. 
- using M32 = MFromD; - const M32 m0{static_cast(mask_bits)}; - return TruncateTo(d, Compress(v0, m0)); -} - -template -HWY_INLINE Vec128 EmuCompress(Vec128 v, - Mask128 mask) { - const DFromV d; - const Rebind di32; - const RebindToUnsigned du32; - const MFromD mask32{static_cast<__mmask8>(mask.raw)}; - // DemoteTo is 2 ops, but likely lower latency than TruncateTo on SKX. - // Only i32 -> u16 is supported, whereas NativeCompress expects u32. - const VFromD v32 = BitCast(du32, PromoteTo(di32, v)); - return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); -} - -HWY_INLINE Vec256 EmuCompress(Vec256 v, - Mask256 mask) { - const DFromV d; - const Rebind di32; - const RebindToUnsigned du32; - const Mask512 mask32{static_cast<__mmask16>(mask.raw)}; - const Vec512 v32 = BitCast(du32, PromoteTo(di32, v)); - return DemoteTo(d, BitCast(di32, NativeCompress(v32, mask32))); -} - -// See above - small-vector EmuCompressStore are implemented via EmuCompress. -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - StoreU(EmuCompress(v, mask), d, unaligned); -} - -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - StoreU(EmuCompress(v, mask), d, unaligned); -} - -// Main emulation logic for wider vector, starting with EmuCompressStore because -// it is most convenient to merge pieces using memory (concatenating vectors at -// byte offsets is difficult). -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits{mask.raw}; - const Half dh; - const Rebind d32; - const Vec512 v0 = PromoteTo(d32, LowerHalf(v)); - const Vec512 v1 = PromoteTo(d32, UpperHalf(dh, v)); - const Mask512 m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)}; - const Mask512 m1{static_cast<__mmask16>(mask_bits >> 16)}; - const Vec128 c0 = TruncateTo(dh, NativeCompress(v0, m0)); - const Vec128 c1 = TruncateTo(dh, NativeCompress(v1, m1)); - uint8_t* HWY_RESTRICT pos = unaligned; - StoreU(c0, dh, pos); - StoreU(c1, dh, pos + CountTrue(d32, m0)); -} - -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const uint64_t mask_bits{mask.raw}; - const Half> dq; - const Rebind d32; - alignas(64) uint8_t lanes[64]; - Store(v, d, lanes); - const Vec512 v0 = PromoteTo(d32, LowerHalf(LowerHalf(v))); - const Vec512 v1 = PromoteTo(d32, Load(dq, lanes + 16)); - const Vec512 v2 = PromoteTo(d32, Load(dq, lanes + 32)); - const Vec512 v3 = PromoteTo(d32, Load(dq, lanes + 48)); - const Mask512 m0{static_cast<__mmask16>(mask_bits & 0xFFFFu)}; - const Mask512 m1{ - static_cast((mask_bits >> 16) & 0xFFFFu)}; - const Mask512 m2{ - static_cast((mask_bits >> 32) & 0xFFFFu)}; - const Mask512 m3{static_cast<__mmask16>(mask_bits >> 48)}; - const Vec128 c0 = TruncateTo(dq, NativeCompress(v0, m0)); - const Vec128 c1 = TruncateTo(dq, NativeCompress(v1, m1)); - const Vec128 c2 = TruncateTo(dq, NativeCompress(v2, m2)); - const Vec128 c3 = TruncateTo(dq, NativeCompress(v3, m3)); - uint8_t* HWY_RESTRICT pos = unaligned; - StoreU(c0, dq, pos); - pos += CountTrue(d32, m0); - StoreU(c1, dq, pos); - pos += CountTrue(d32, m1); - StoreU(c2, dq, pos); - pos += CountTrue(d32, m2); - StoreU(c3, dq, pos); -} - -template -HWY_INLINE void EmuCompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { - const Repartition di32; - const RebindToUnsigned du32; - const Half dh; - const Vec512 promoted0 = - BitCast(du32, 
PromoteTo(di32, LowerHalf(dh, v))); - const Vec512 promoted1 = - BitCast(du32, PromoteTo(di32, UpperHalf(dh, v))); - - const uint64_t mask_bits{mask.raw}; - const uint64_t maskL = mask_bits & 0xFFFF; - const uint64_t maskH = mask_bits >> 16; - const Mask512 mask0{static_cast<__mmask16>(maskL)}; - const Mask512 mask1{static_cast<__mmask16>(maskH)}; - const Vec512 compressed0 = NativeCompress(promoted0, mask0); - const Vec512 compressed1 = NativeCompress(promoted1, mask1); - - const Vec256 demoted0 = DemoteTo(dh, BitCast(di32, compressed0)); - const Vec256 demoted1 = DemoteTo(dh, BitCast(di32, compressed1)); - - // Store 256-bit halves - StoreU(demoted0, dh, unaligned); - StoreU(demoted1, dh, unaligned + PopCount(maskL)); -} - -// Finally, the remaining EmuCompress for wide vectors, using EmuCompressStore. -template // 1 or 2 bytes -HWY_INLINE Vec512 EmuCompress(Vec512 v, Mask512 mask) { - const DFromV d; - alignas(64) T buf[2 * Lanes(d)]; - EmuCompressStore(v, mask, d, buf); - return Load(d, buf); -} - -HWY_INLINE Vec256 EmuCompress(Vec256 v, - const Mask256 mask) { - const DFromV d; - alignas(32) uint8_t buf[2 * 32 / sizeof(uint8_t)]; - EmuCompressStore(v, mask, d, buf); - return Load(d, buf); -} - -} // namespace detail - -template -HWY_API V Compress(V v, const M mask) { - const DFromV d; - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); -#else - return BitCast(d, detail::EmuCompress(BitCast(du, v), mu)); -#endif -} - -template -HWY_API V Compress(V v, const M mask) { - const DFromV d; - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - return BitCast(d, detail::NativeCompress(BitCast(du, v), mu)); -} - -template -HWY_API Vec512 Compress(Vec512 v, Mask512 mask) { - // See CompressIsPartition. u64 is faster than u32. - alignas(16) static constexpr uint64_t packed_array[256] = { - // From PrintCompress32x8Tables, without the FirstN extension (there is - // no benefit to including them because 64-bit CompressStore is anyway - // masked, but also no harm because TableLookupLanes ignores the MSB). 
- 0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120, - 0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310, - 0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140, - 0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210, - 0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320, - 0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510, - 0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530, - 0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210, - 0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420, - 0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310, - 0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160, - 0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210, - 0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320, - 0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410, - 0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430, - 0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210, - 0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520, - 0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310, - 0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540, - 0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210, - 0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320, - 0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710, - 0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730, - 0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210, - 0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420, - 0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310, - 0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750, - 0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210, - 0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320, - 0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410, - 0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430, - 0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210, - 0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620, - 0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310, - 0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640, - 0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210, - 0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320, - 0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510, - 0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530, - 0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210, - 0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420, - 0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310, - 0x10765432, 0x17654320, 0x07654321, 0x76543210}; - - // For lane i, shift the i-th 4-bit index down to bits [0, 3) - - // _mm512_permutexvar_epi64 will ignore the upper bits. 
- const DFromV d; - const RebindToUnsigned du64; - const auto packed = Set(du64, packed_array[mask.raw]); - alignas(64) static constexpr uint64_t shifts[8] = {0, 4, 8, 12, - 16, 20, 24, 28}; - const auto indices = Indices512{(packed >> Load(du64, shifts)).raw}; - return TableLookupLanes(v, indices); -} - -// ------------------------------ Expand - -template -HWY_API Vec512 Expand(Vec512 v, const Mask512 mask) { - const Full512 d; -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); -#else - // LUTs are infeasible for 2^64 possible masks, so splice together two - // half-vector Expand. - const Full256 dh; - constexpr size_t N = Lanes(d); - // We have to shift the input by a variable number of u8. Shuffling requires - // VBMI2, in which case we would already have NativeExpand. We instead - // load at an offset, which may incur a store to load forwarding stall. - alignas(64) T lanes[N]; - Store(v, d, lanes); - using Bits = typename Mask256::Raw; - const Mask256 maskL{ - static_cast(mask.raw & Bits{(1ULL << (N / 2)) - 1})}; - const Mask256 maskH{static_cast(mask.raw >> (N / 2))}; - const size_t countL = CountTrue(dh, maskL); - const Vec256 expandL = Expand(LowerHalf(v), maskL); - const Vec256 expandH = Expand(LoadU(dh, lanes + countL), maskH); - return Combine(d, expandH, expandL); -#endif -} - -template -HWY_API Vec512 Expand(Vec512 v, const Mask512 mask) { - const Full512 d; - const RebindToUnsigned du; - const Vec512 vu = BitCast(du, v); -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - return BitCast(d, detail::NativeExpand(vu, RebindMask(du, mask))); -#else // AVX3 - // LUTs are infeasible for 2^32 possible masks, so splice together two - // half-vector Expand. - const Full256 dh; - constexpr size_t N = Lanes(d); - using Bits = typename Mask256::Raw; - const Mask256 maskL{ - static_cast(mask.raw & Bits{(1ULL << (N / 2)) - 1})}; - const Mask256 maskH{static_cast(mask.raw >> (N / 2))}; - // In AVX3 we can permutevar, which avoids a potential store to load - // forwarding stall vs. reloading the input. - alignas(64) uint16_t iota[64] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, - 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - const Vec512 indices = LoadU(du, iota + CountTrue(dh, maskL)); - const Vec512 shifted{_mm512_permutexvar_epi16(indices.raw, vu.raw)}; - const Vec256 expandL = Expand(LowerHalf(v), maskL); - const Vec256 expandH = Expand(LowerHalf(BitCast(d, shifted)), maskH); - return Combine(d, expandH, expandL); -#endif // AVX3 -} - -template -HWY_API V Expand(V v, const M mask) { - const DFromV d; - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); -} - -// For smaller vectors, it is likely more efficient to promote to 32-bit. -// This works for u8x16, u16x8, u16x16 (can be promoted to u32x16), but is -// unnecessary if HWY_AVX3_DL, which provides native instructions. 
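Both the VBMI2 path and the spliced half-vector path above implement the same lane semantics: each true mask lane takes the next successive lane of the source, and false lanes are zeroed (as with a maskz expand). A scalar reference in plain C++ (illustrative only, not part of the library):

#include <array>
#include <cstdint>
#include <cstdio>

template <typename T, size_t N>
std::array<T, N> ScalarExpand(const std::array<T, N>& v, const std::array<bool, N>& m) {
  std::array<T, N> out{};
  size_t src = 0;
  for (size_t i = 0; i < N; ++i) {
    out[i] = m[i] ? v[src++] : T{0};  // true lanes consume successive source lanes
  }
  return out;
}

int main() {
  const std::array<uint16_t, 8> v = {1, 2, 3, 4, 5, 6, 7, 8};
  const std::array<bool, 8> m = {true, false, true, true, false, false, true, false};
  for (uint16_t x : ScalarExpand(v, m)) printf("%u ", x);  // 1 0 2 3 0 0 4 0
  printf("\n");
}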
-#if HWY_TARGET > HWY_AVX3_DL // no VBMI2 - -template , 16)> -HWY_API V Expand(V v, M mask) { - const DFromV d; - const RebindToUnsigned du; - const Rebind du32; - const VFromD vu = BitCast(du, v); - using M32 = MFromD; - const M32 m32{static_cast(mask.raw)}; - return BitCast(d, TruncateTo(du, Expand(PromoteTo(du32, vu), m32))); -} - -#endif // HWY_TARGET > HWY_AVX3_DL - -// ------------------------------ LoadExpand - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - const RebindToUnsigned du; - using TU = TFromD; - const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); -#else - return Expand(LoadU(d, unaligned), mask); -#endif -} - -template -HWY_API VFromD LoadExpand(MFromD mask, D d, - const TFromD* HWY_RESTRICT unaligned) { - const RebindToUnsigned du; - using TU = TFromD; - const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - const MFromD mu = RebindMask(du, mask); - return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); -} - -// ------------------------------ CompressNot - -template -HWY_API V CompressNot(V v, const M mask) { - return Compress(v, Not(mask)); -} - -template -HWY_API Vec512 CompressNot(Vec512 v, Mask512 mask) { - // See CompressIsPartition. u64 is faster than u32. - alignas(16) static constexpr uint64_t packed_array[256] = { - // From PrintCompressNot32x8Tables, without the FirstN extension (there is - // no benefit to including them because 64-bit CompressStore is anyway - // masked, but also no harm because TableLookupLanes ignores the MSB). - 0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431, - 0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542, - 0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321, - 0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653, - 0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651, - 0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432, - 0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421, - 0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764, - 0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631, - 0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762, - 0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321, - 0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543, - 0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541, - 0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532, - 0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521, - 0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075, - 0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431, - 0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742, - 0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321, - 0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073, - 0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071, - 0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432, - 0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421, - 0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654, - 0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531, - 0x74216530, 
0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652, - 0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321, - 0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643, - 0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641, - 0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632, - 0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621, - 0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106, - 0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431, - 0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542, - 0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321, - 0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053, - 0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051, - 0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432, - 0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421, - 0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104, - 0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031, - 0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102, - 0x76543210, 0x76543201, 0x76543210, 0x76543210}; - - // For lane i, shift the i-th 4-bit index down to bits [0, 3) - - // _mm512_permutexvar_epi64 will ignore the upper bits. - const DFromV d; - const RebindToUnsigned du64; - const auto packed = Set(du64, packed_array[mask.raw]); - alignas(64) static constexpr uint64_t shifts[8] = {0, 4, 8, 12, - 16, 20, 24, 28}; - const auto indices = Indices512{(packed >> Load(du64, shifts)).raw}; - return TableLookupLanes(v, indices); -} - -// uint64_t lanes. Only implement for 256 and 512-bit vectors because this is a -// no-op for 128-bit. -template , 16)> -HWY_API V CompressBlocksNot(V v, M mask) { - return CompressNot(v, mask); -} - -// ------------------------------ CompressBits -template -HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { - return Compress(v, LoadMaskBits(DFromV(), bits)); -} - -// ------------------------------ CompressStore - -// Generic for all vector lengths. - -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET == HWY_AVX3_ZEN4 - StoreU(Compress(v, mask), d, unaligned); -#else - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - auto pu = reinterpret_cast * HWY_RESTRICT>(unaligned); - -#if HWY_TARGET <= HWY_AVX3_DL // VBMI2 - detail::NativeCompressStore(BitCast(du, v), mu, pu); -#else - detail::EmuCompressStore(BitCast(du, v), mu, du, pu); -#endif -#endif // HWY_TARGET != HWY_AVX3_ZEN4 - const size_t count = CountTrue(d, mask); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET == HWY_AVX3_ZEN4 - StoreU(Compress(v, mask), d, unaligned); -#else - const RebindToUnsigned du; - const auto mu = RebindMask(du, mask); - using TU = TFromD; - TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); - detail::NativeCompressStore(BitCast(du, v), mu, pu); -#endif // HWY_TARGET != HWY_AVX3_ZEN4 - const size_t count = CountTrue(d, mask); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -// Additional overloads to avoid casting to uint32_t (delay?). 
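The CompressStore overloads above (and CompressBlendedStore below) share one contract: the selected lanes land contiguously at the destination, in order, and the number of lanes written is returned; the blended variant additionally avoids touching memory past that count. A scalar reference, plain C++ and illustrative only:

#include <cstddef>
#include <cstdint>
#include <cstdio>

template <typename T, size_t N>
size_t ScalarCompressStore(const T (&v)[N], const bool (&m)[N], T* out) {
  size_t count = 0;
  for (size_t i = 0; i < N; ++i) {
    if (m[i]) out[count++] = v[i];  // selected lanes written contiguously, in order
  }
  return count;
}

int main() {
  const uint32_t v[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  const bool m[8] = {true, false, false, true, true, false, false, true};
  uint32_t out[8] = {0};
  const size_t count = ScalarCompressStore(v, m, out);
  printf("count=%zu:", count);
  for (size_t i = 0; i < count; ++i) printf(" %u", out[i]);  // 10 13 14 17
  printf("\n");
}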
-template -HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, - TFromD* HWY_RESTRICT unaligned) { -#if HWY_TARGET == HWY_AVX3_ZEN4 - StoreU(Compress(v, mask), d, unaligned); -#else - (void)d; - detail::NativeCompressStore(v, mask, unaligned); -#endif // HWY_TARGET != HWY_AVX3_ZEN4 - const size_t count = PopCount(uint64_t{mask.raw}); - detail::MaybeUnpoison(unaligned, count); - return count; -} - -// ------------------------------ CompressBlendedStore -template -HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, - TFromD* HWY_RESTRICT unaligned) { - // Native CompressStore already does the blending at no extra cost (latency - // 11, rthroughput 2 - same as compress plus store). - if (HWY_TARGET == HWY_AVX3_DL || - (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD) > 2)) { - return CompressStore(v, m, d, unaligned); - } else { - const size_t count = CountTrue(d, m); - BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned); - detail::MaybeUnpoison(unaligned, count); - return count; - } -} - -// ------------------------------ CompressBitsStore -// Generic for all vector lengths. -template -HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, - D d, TFromD* HWY_RESTRICT unaligned) { - return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); -} - -// ------------------------------ LoadInterleaved4 - -// Actually implemented in generic_ops, we just overload LoadTransposedBlocks4. -namespace detail { - -// Type-safe wrapper. -template <_MM_PERM_ENUM kPerm, typename T> -Vec512 Shuffle128(const Vec512 lo, const Vec512 hi) { - return Vec512{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)}; -} -template <_MM_PERM_ENUM kPerm> -Vec512 Shuffle128(const Vec512 lo, const Vec512 hi) { - return Vec512{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)}; -} -template <_MM_PERM_ENUM kPerm> -Vec512 Shuffle128(const Vec512 lo, const Vec512 hi) { - return Vec512{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)}; -} - -// Input (128-bit blocks): -// 3 2 1 0 (<- first block in unaligned) -// 7 6 5 4 -// b a 9 8 -// Output: -// 9 6 3 0 (LSB of A) -// a 7 4 1 -// b 8 5 2 -template -HWY_API void LoadTransposedBlocks3(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& A, VFromD& B, VFromD& C) { - constexpr size_t N = Lanes(d); - const VFromD v3210 = LoadU(d, unaligned + 0 * N); - const VFromD v7654 = LoadU(d, unaligned + 1 * N); - const VFromD vba98 = LoadU(d, unaligned + 2 * N); - - const VFromD v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654); - const VFromD va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98); - - A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976); - B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976); - C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98); -} - -// Input (128-bit blocks): -// 3 2 1 0 (<- first block in unaligned) -// 7 6 5 4 -// b a 9 8 -// f e d c -// Output: -// c 8 4 0 (LSB of A) -// d 9 5 1 -// e a 6 2 -// f b 7 3 -template -HWY_API void LoadTransposedBlocks4(D d, const TFromD* HWY_RESTRICT unaligned, - VFromD& vA, VFromD& vB, VFromD& vC, - VFromD& vD) { - constexpr size_t N = Lanes(d); - const VFromD v3210 = LoadU(d, unaligned + 0 * N); - const VFromD v7654 = LoadU(d, unaligned + 1 * N); - const VFromD vba98 = LoadU(d, unaligned + 2 * N); - const VFromD vfedc = LoadU(d, unaligned + 3 * N); - - const VFromD v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654); - const VFromD vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc); - const VFromD v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654); - const VFromD vfeba = 
detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc); - vA = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98); - vB = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98); - vC = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba); - vD = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba); -} - -} // namespace detail - -// ------------------------------ StoreInterleaved2 - -// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. - -namespace detail { - -// Input (128-bit blocks): -// 6 4 2 0 (LSB of i) -// 7 5 3 1 -// Output: -// 3 2 1 0 -// 7 6 5 4 -template -HWY_API void StoreTransposedBlocks2(const VFromD i, const VFromD j, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t N = Lanes(d); - const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j); - const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j); - const auto j1_i1_j0_i0 = - detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0); - const auto j3_i3_j2_i2 = - detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2); - StoreU(j1_i1_j0_i0, d, unaligned + 0 * N); - StoreU(j3_i3_j2_i2, d, unaligned + 1 * N); -} - -// Input (128-bit blocks): -// 9 6 3 0 (LSB of i) -// a 7 4 1 -// b 8 5 2 -// Output: -// 3 2 1 0 -// 7 6 5 4 -// b a 9 8 -template -HWY_API void StoreTransposedBlocks3(const VFromD i, const VFromD j, - const VFromD k, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t N = Lanes(d); - const VFromD j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j); - const VFromD i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i); - const VFromD j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j); - - const VFromD out0 = // i1 k0 j0 i0 - detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0); - const VFromD out1 = // j2 i2 k1 j1 - detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0); - const VFromD out2 = // k3 j3 i3 k2 - detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1); - - StoreU(out0, d, unaligned + 0 * N); - StoreU(out1, d, unaligned + 1 * N); - StoreU(out2, d, unaligned + 2 * N); -} - -// Input (128-bit blocks): -// c 8 4 0 (LSB of i) -// d 9 5 1 -// e a 6 2 -// f b 7 3 -// Output: -// 3 2 1 0 -// 7 6 5 4 -// b a 9 8 -// f e d c -template -HWY_API void StoreTransposedBlocks4(const VFromD i, const VFromD j, - const VFromD k, const VFromD l, D d, - TFromD* HWY_RESTRICT unaligned) { - constexpr size_t N = Lanes(d); - const VFromD j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j); - const VFromD l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l); - const VFromD j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j); - const VFromD l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l); - const VFromD out0 = - detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0); - const VFromD out1 = - detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0); - const VFromD out2 = - detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2); - const VFromD out3 = - detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2); - StoreU(out0, d, unaligned + 0 * N); - StoreU(out1, d, unaligned + 1 * N); - StoreU(out2, d, unaligned + 2 * N); - StoreU(out3, d, unaligned + 3 * N); -} - -} // namespace detail - -// ------------------------------ Additional mask logical operations - -template -HWY_API Mask512 SetAtOrAfterFirst(Mask512 mask) { - return Mask512{ - static_cast::Raw>(0u - detail::AVX3Blsi(mask.raw))}; -} -template -HWY_API Mask512 SetBeforeFirst(Mask512 mask) { - return Mask512{ - static_cast::Raw>(detail::AVX3Blsi(mask.raw) - 1u)}; -} -template -HWY_API Mask512 
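The Shuffle128-based load/store routines above are 4x4 transposes of 128-bit blocks, as the block diagrams in their comments show. A plain-C++ sketch of just the index mapping (block IDs only, no SIMD; purely illustrative):

#include <cstdio>

int main() {
  // Four loaded vectors, each holding four 128-bit blocks (LSB block first).
  const int in[4][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
  int out[4][4];  // out[j] gathers block j of every input vector: {j, j+4, j+8, j+12}
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) out[j][i] = in[i][j];
  }
  for (int j = 0; j < 4; ++j) {
    printf("out%d:", j);
    for (int i = 0; i < 4; ++i) printf(" %x", out[j][i]);  // out0: 0 4 8 c, etc.
    printf("\n");
  }
}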
SetAtOrBeforeFirst(Mask512 mask) { - return Mask512{ - static_cast::Raw>(detail::AVX3Blsmsk(mask.raw))}; -} -template -HWY_API Mask512 SetOnlyFirst(Mask512 mask) { - return Mask512{ - static_cast::Raw>(detail::AVX3Blsi(mask.raw))}; -} - -// ------------------------------ Shl (LoadDup128) - -HWY_API Vec512 operator<<(Vec512 v, Vec512 bits) { - return Vec512{_mm512_sllv_epi16(v.raw, bits.raw)}; -} - -// 8-bit: may use the << overload for uint16_t. -HWY_API Vec512 operator<<(Vec512 v, Vec512 bits) { - const DFromV d; -#if HWY_TARGET <= HWY_AVX3_DL - // kMask[i] = 0xFF >> i - alignas(16) static constexpr uint8_t kMasks[16] = { - 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; - // kShl[i] = 1 << i - alignas(16) static constexpr uint8_t kShl[16] = {0x01, 0x02, 0x04, 0x08, - 0x10, 0x20, 0x40, 0x80}; - v = And(v, TableLookupBytes(LoadDup128(d, kMasks), bits)); - const VFromD mul = TableLookupBytes(LoadDup128(d, kShl), bits); - return VFromD{_mm512_gf2p8mul_epi8(v.raw, mul.raw)}; -#else - const Repartition dw; - using VW = VFromD; - const VW even_mask = Set(dw, 0x00FF); - const VW odd_mask = Set(dw, 0xFF00); - const VW vw = BitCast(dw, v); - const VW bits16 = BitCast(dw, bits); - // Shift even lanes in-place - const VW evens = vw << And(bits16, even_mask); - const VW odds = And(vw, odd_mask) << ShiftRight<8>(bits16); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -#endif -} - -HWY_API Vec512 operator<<(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_sllv_epi32(v.raw, bits.raw)}; -} - -HWY_API Vec512 operator<<(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_sllv_epi64(v.raw, bits.raw)}; -} - -// Signed left shift is the same as unsigned. -template -HWY_API Vec512 operator<<(const Vec512 v, const Vec512 bits) { - const DFromV di; - const RebindToUnsigned du; - return BitCast(di, BitCast(du, v) << BitCast(du, bits)); -} - -// ------------------------------ Shr (IfVecThenElse) - -HWY_API Vec512 operator>>(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_srlv_epi16(v.raw, bits.raw)}; -} - -// 8-bit uses 16-bit shifts. -HWY_API Vec512 operator>>(Vec512 v, Vec512 bits) { - const DFromV d; - const RepartitionToWide dw; - using VW = VFromD; - const VW mask = Set(dw, 0x00FF); - const VW vw = BitCast(dw, v); - const VW bits16 = BitCast(dw, bits); - const VW evens = And(vw, mask) >> And(bits16, mask); - // Shift odd lanes in-place - const VW odds = vw >> ShiftRight<8>(bits16); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -} - -HWY_API Vec512 operator>>(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_srlv_epi32(v.raw, bits.raw)}; -} - -HWY_API Vec512 operator>>(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_srlv_epi64(v.raw, bits.raw)}; -} - -HWY_API Vec512 operator>>(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_srav_epi16(v.raw, bits.raw)}; -} - -// 8-bit uses 16-bit shifts. 
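The SetAtOrAfterFirst/SetBeforeFirst/SetAtOrBeforeFirst/SetOnlyFirst helpers above rely on AVX3Blsi/AVX3Blsmsk, i.e. the classic x & -x and x ^ (x - 1) bit tricks. A standalone sketch of the four identities on a plain 32-bit mask (illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t bits = 0b0010'1000u;          // lowest set bit is bit 3
  const uint32_t blsi = bits & (0u - bits);    // isolate lowest set bit
  const uint32_t blsmsk = bits ^ (bits - 1u);  // bits up to and including the lowest set bit
  printf("SetOnlyFirst       = %#010x\n", blsi);       // 0x00000008
  printf("SetBeforeFirst     = %#010x\n", blsi - 1u);  // 0x00000007
  printf("SetAtOrBeforeFirst = %#010x\n", blsmsk);     // 0x0000000f
  printf("SetAtOrAfterFirst  = %#010x\n", 0u - blsi);  // 0xfffffff8
}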
-HWY_API Vec512 operator>>(Vec512 v, Vec512 bits) { - const DFromV d; - const RepartitionToWide dw; - const RebindToUnsigned dw_u; - using VW = VFromD; - const VW mask = Set(dw, 0x00FF); - const VW vw = BitCast(dw, v); - const VW bits16 = BitCast(dw, bits); - const VW evens = ShiftRight<8>(ShiftLeft<8>(vw)) >> And(bits16, mask); - // Shift odd lanes in-place - const VW odds = vw >> BitCast(dw, ShiftRight<8>(BitCast(dw_u, bits16))); - return OddEven(BitCast(d, odds), BitCast(d, evens)); -} - -HWY_API Vec512 operator>>(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_srav_epi32(v.raw, bits.raw)}; -} - -HWY_API Vec512 operator>>(const Vec512 v, - const Vec512 bits) { - return Vec512{_mm512_srav_epi64(v.raw, bits.raw)}; -} - -// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower) - -HWY_INLINE Vec512 MulEven(const Vec512 a, - const Vec512 b) { - const DFromV du64; - const RepartitionToNarrow du32; - const auto maskL = Set(du64, 0xFFFFFFFFULL); - const auto a32 = BitCast(du32, a); - const auto b32 = BitCast(du32, b); - // Inputs for MulEven: we only need the lower 32 bits - const auto aH = Shuffle2301(a32); - const auto bH = Shuffle2301(b32); - - // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need - // the even (lower 64 bits of every 128-bit block) results. See - // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat - const auto aLbL = MulEven(a32, b32); - const auto w3 = aLbL & maskL; - - const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); - const auto w2 = t2 & maskL; - const auto w1 = ShiftRight<32>(t2); - - const auto t = MulEven(a32, bH) + w2; - const auto k = ShiftRight<32>(t); - - const auto mulH = MulEven(aH, bH) + w1 + k; - const auto mulL = ShiftLeft<32>(t) + w3; - return InterleaveLower(mulL, mulH); -} - -HWY_INLINE Vec512 MulOdd(const Vec512 a, - const Vec512 b) { - const DFromV du64; - const RepartitionToNarrow du32; - const auto maskL = Set(du64, 0xFFFFFFFFULL); - const auto a32 = BitCast(du32, a); - const auto b32 = BitCast(du32, b); - // Inputs for MulEven: we only need bits [95:64] (= upper half of input) - const auto aH = Shuffle2301(a32); - const auto bH = Shuffle2301(b32); - - // Same as above, but we're using the odd results (upper 64 bits per block). 
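The unsigned 8-bit shifts above reuse the 16-bit shift hardware: the even byte is masked and shifted within the low half of each 16-bit lane, the odd byte is shifted in place, and OddEven recombines the two. The same trick for a single 16-bit lane in plain C++ (illustrative only):

#include <cstdint>
#include <cstdio>

// Shifts the two uint8_t packed in `lane` right by the per-byte amounts in `amounts`.
static uint16_t ShrTwoBytes(uint16_t lane, uint16_t amounts) {
  const uint16_t even = uint16_t((lane & 0x00FF) >> (amounts & 0x00FF));
  const uint16_t odd = uint16_t(uint16_t(lane >> (amounts >> 8)) & 0xFF00);
  return uint16_t(odd | even);  // OddEven: odd byte from `odd`, even byte from `even`
}

int main() {
  // Odd byte 0xF0 shifted by 4 -> 0x0F; even byte 0x80 shifted by 1 -> 0x40.
  const uint16_t lane = 0xF080, amounts = 0x0401;
  printf("%#06x\n", ShrTwoBytes(lane, amounts));  // 0x0f40
}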
- const auto aLbL = MulEven(a32, b32); - const auto w3 = aLbL & maskL; - - const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); - const auto w2 = t2 & maskL; - const auto w1 = ShiftRight<32>(t2); - - const auto t = MulEven(a32, bH) + w2; - const auto k = ShiftRight<32>(t); - - const auto mulH = MulEven(aH, bH) + w1 + k; - const auto mulL = ShiftLeft<32>(t) + w3; - return InterleaveUpper(du64, mulL, mulH); -} - -// ------------------------------ WidenMulPairwiseAdd -template -HWY_API VFromD WidenMulPairwiseAdd(D /*d32*/, Vec512 a, - Vec512 b) { - return VFromD{_mm512_madd_epi16(a.raw, b.raw)}; -} - -// ------------------------------ SatWidenMulPairwiseAdd - -template -HWY_API VFromD SatWidenMulPairwiseAdd( - DI16 /* tag */, VFromD> a, - VFromD> b) { - return VFromD{_mm512_maddubs_epi16(a.raw, b.raw)}; -} - -// ------------------------------ ReorderWidenMulAccumulate -template -HWY_API VFromD ReorderWidenMulAccumulate(D d, Vec512 a, - Vec512 b, - const VFromD sum0, - VFromD& /*sum1*/) { - (void)d; -#if HWY_TARGET <= HWY_AVX3_DL - return VFromD{_mm512_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; -#else - return sum0 + WidenMulPairwiseAdd(d, a, b); -#endif -} - -HWY_API Vec512 RearrangeToOddPlusEven(const Vec512 sum0, - Vec512 /*sum1*/) { - return sum0; // invariant already holds -} - -HWY_API Vec512 RearrangeToOddPlusEven(const Vec512 sum0, - Vec512 /*sum1*/) { - return sum0; // invariant already holds -} - -// ------------------------------ SumOfMulQuadAccumulate - -#if HWY_TARGET <= HWY_AVX3_DL - -template -HWY_API VFromD SumOfMulQuadAccumulate( - DI32 /*di32*/, VFromD> a_u, - VFromD> b_i, VFromD sum) { - return VFromD{_mm512_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; -} - -#endif - -// ------------------------------ Reductions - -template -HWY_API TFromD ReduceSum(D, VFromD v) { - return _mm512_reduce_add_epi32(v.raw); -} -template -HWY_API TFromD ReduceSum(D, VFromD v) { - return _mm512_reduce_add_epi64(v.raw); -} -template -HWY_API TFromD ReduceSum(D, VFromD v) { - return static_cast(_mm512_reduce_add_epi32(v.raw)); -} -template -HWY_API TFromD ReduceSum(D, VFromD v) { - return static_cast(_mm512_reduce_add_epi64(v.raw)); -} -#if HWY_HAVE_FLOAT16 -template -HWY_API TFromD ReduceSum(D, VFromD v) { - return _mm512_reduce_add_ph(v.raw); -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API TFromD ReduceSum(D, VFromD v) { - return _mm512_reduce_add_ps(v.raw); -} -template -HWY_API TFromD ReduceSum(D, VFromD v) { - return _mm512_reduce_add_pd(v.raw); -} -template -HWY_API TFromD ReduceSum(D d, VFromD v) { - const RepartitionToWide d32; - const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); - const auto odd = ShiftRight<16>(BitCast(d32, v)); - const auto sum = ReduceSum(d32, even + odd); - return static_cast(sum); -} -template -HWY_API TFromD ReduceSum(D d, VFromD v) { - const RepartitionToWide d32; - // Sign-extend - const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); - const auto odd = ShiftRight<16>(BitCast(d32, v)); - const auto sum = ReduceSum(d32, even + odd); - return static_cast(sum); -} - -// Returns the sum in each lane. -template -HWY_API VFromD SumOfLanes(D d, VFromD v) { - return Set(d, ReduceSum(d, v)); -} - -// Returns the minimum in each lane. 
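MulEven/MulOdd above apply Knuth's double-word multiplication (the muldwu routine from Hacker's Delight) to build a 64x64-bit product from 32x32-bit partial products. The identical recurrence one size down, 32x32 -> 64 from 16-bit halves, verified against the native product (plain C++ sketch, not Highway code):

#include <cstdint>
#include <cstdio>

static uint64_t MulWide(uint32_t a, uint32_t b) {
  const uint32_t aL = a & 0xFFFF, aH = a >> 16;
  const uint32_t bL = b & 0xFFFF, bH = b >> 16;

  const uint32_t aLbL = aL * bL;
  const uint32_t w3 = aLbL & 0xFFFF;

  const uint32_t t2 = aH * bL + (aLbL >> 16);
  const uint32_t w2 = t2 & 0xFFFF;
  const uint32_t w1 = t2 >> 16;

  const uint32_t t = aL * bH + w2;
  const uint32_t k = t >> 16;

  const uint32_t hi = aH * bH + w1 + k;  // upper half of the product
  const uint32_t lo = (t << 16) + w3;    // lower half of the product
  return (uint64_t(hi) << 32) | lo;
}

int main() {
  const uint32_t a = 0xDEADBEEFu, b = 0xCAFEBABEu;
  printf("%d\n", MulWide(a, b) == uint64_t(a) * b);  // 1
}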
-template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_min_epi32(v.raw)); -} -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_min_epi64(v.raw)); -} -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_min_epu32(v.raw)); -} -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_min_epu64(v.raw)); -} -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_min_ph(v.raw)); -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_min_ps(v.raw)); -} -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_min_pd(v.raw)); -} -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - const RepartitionToWide d32; - const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); - const auto odd = ShiftRight<16>(BitCast(d32, v)); - const auto min = MinOfLanes(d32, Min(even, odd)); - // Also broadcast into odd lanes. - return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); -} -template -HWY_API VFromD MinOfLanes(D d, VFromD v) { - const RepartitionToWide d32; - // Sign-extend - const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); - const auto odd = ShiftRight<16>(BitCast(d32, v)); - const auto min = MinOfLanes(d32, Min(even, odd)); - // Also broadcast into odd lanes. - return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); -} - -// Returns the maximum in each lane. -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_max_epi32(v.raw)); -} -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_max_epi64(v.raw)); -} -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_max_epu32(v.raw)); -} -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_max_epu64(v.raw)); -} -#if HWY_HAVE_FLOAT16 -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_max_ph(v.raw)); -} -#endif // HWY_HAVE_FLOAT16 -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_max_ps(v.raw)); -} -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - return Set(d, _mm512_reduce_max_pd(v.raw)); -} -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - const RepartitionToWide d32; - const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); - const auto odd = ShiftRight<16>(BitCast(d32, v)); - const auto min = MaxOfLanes(d32, Max(even, odd)); - // Also broadcast into odd lanes. - return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); -} -template -HWY_API VFromD MaxOfLanes(D d, VFromD v) { - const RepartitionToWide d32; - // Sign-extend - const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v))); - const auto odd = ShiftRight<16>(BitCast(d32, v)); - const auto min = MaxOfLanes(d32, Max(even, odd)); - // Also broadcast into odd lanes. 
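The u16/i16 reductions above and below reinterpret each pair of 16-bit lanes as one 32-bit lane, split it into its low ("even") and high ("odd") halves, and reduce in 32 bits, so intermediate sums cannot overflow before the final cast. Scalar sketch of the same splitting in plain C++ (illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  const uint16_t lanes[8] = {60000, 1, 2, 3, 4, 5, 60000, 7};
  uint32_t sum = 0;
  for (int i = 0; i < 8; i += 2) {
    // One 32-bit "lane" holding lanes[i] (low half) and lanes[i + 1] (high half).
    const uint32_t pair = uint32_t(lanes[i]) | (uint32_t(lanes[i + 1]) << 16);
    const uint32_t even = pair & 0xFFFF;  // And(v, Set(0xFFFF))
    const uint32_t odd = pair >> 16;      // ShiftRight<16>
    sum += even + odd;
  }
  printf("%u\n", sum);  // 120022, with no 16-bit overflow along the way
}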
- return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min)); -} - -// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex - -template ), HWY_IF_V_SIZE_D(DFromV, 64)> -HWY_API V LeadingZeroCount(V v) { - return V{_mm512_lzcnt_epi32(v.raw)}; -} - -template ), HWY_IF_V_SIZE_D(DFromV, 64)> -HWY_API V LeadingZeroCount(V v) { - return V{_mm512_lzcnt_epi64(v.raw)}; -} - -namespace detail { - -template , 16)> -HWY_INLINE V Lzcnt32ForU8OrU16(V v) { - const DFromV d; - const Rebind di32; - const Rebind du32; - - const auto v_lz_count = LeadingZeroCount(PromoteTo(du32, v)); - return DemoteTo(d, BitCast(di32, v_lz_count)); -} - -template , 32)> -HWY_INLINE VFromD>> Lzcnt32ForU8OrU16AsU16(V v) { - const DFromV d; - const Half dh; - const Rebind di32; - const Rebind du32; - const Rebind du16; - - const auto lo_v_lz_count = - LeadingZeroCount(PromoteTo(du32, LowerHalf(dh, v))); - const auto hi_v_lz_count = - LeadingZeroCount(PromoteTo(du32, UpperHalf(dh, v))); - return OrderedDemote2To(du16, BitCast(di32, lo_v_lz_count), - BitCast(di32, hi_v_lz_count)); -} - -HWY_INLINE Vec256 Lzcnt32ForU8OrU16(Vec256 v) { - const DFromV d; - const Rebind di16; - return DemoteTo(d, BitCast(di16, Lzcnt32ForU8OrU16AsU16(v))); -} - -HWY_INLINE Vec512 Lzcnt32ForU8OrU16(Vec512 v) { - const DFromV d; - const Half dh; - const Rebind di16; - - const auto lo_half = LowerHalf(dh, v); - const auto hi_half = UpperHalf(dh, v); - - const auto lo_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(lo_half)); - const auto hi_v_lz_count = BitCast(di16, Lzcnt32ForU8OrU16AsU16(hi_half)); - return OrderedDemote2To(d, lo_v_lz_count, hi_v_lz_count); -} - -HWY_INLINE Vec512 Lzcnt32ForU8OrU16(Vec512 v) { - return Lzcnt32ForU8OrU16AsU16(v); -} - -} // namespace detail - -template -HWY_API V LeadingZeroCount(V v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - - constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; - const auto v_lzcnt32 = detail::Lzcnt32ForU8OrU16(BitCast(du, v)); - return BitCast(d, Min(v_lzcnt32 - Set(du, TU{32 - kNumOfBitsInT}), - Set(du, TU{kNumOfBitsInT}))); -} - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - const RebindToUnsigned du; - using TU = TFromD; - return BitCast(d, - Set(du, TU{31}) - detail::Lzcnt32ForU8OrU16(BitCast(du, v))); -} - -template -HWY_API V HighestSetBitIndex(V v) { - const DFromV d; - using T = TFromD; - return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); -} - -template -HWY_API V TrailingZeroCount(V v) { - const DFromV d; - const RebindToSigned di; - using T = TFromD; - - const auto vi = BitCast(di, v); - const auto lowest_bit = BitCast(d, And(vi, Neg(vi))); - constexpr T kNumOfBitsInT{sizeof(T) * 8}; - const auto bit_idx = HighestSetBitIndex(lowest_bit); - return IfThenElse(MaskFromVec(bit_idx), Set(d, kNumOfBitsInT), bit_idx); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); - -// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h - -// the warning seems to be issued at the call site of intrinsics, i.e. our code. 
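TrailingZeroCount above isolates the lowest set bit with And(vi, Neg(vi)), takes its HighestSetBitIndex, and returns the bit width for zero inputs. A scalar equivalent for one 32-bit lane (plain C++, illustrative only):

#include <cstdint>
#include <cstdio>

static uint32_t TrailingZeroCount32(uint32_t v) {
  if (v == 0) return 32;                 // all bits zero -> number of bits in the lane
  const uint32_t lowest = v & (0u - v);  // And(vi, Neg(vi)): a power of two
  uint32_t idx = 0;                      // HighestSetBitIndex of that power of two
  while ((lowest >> idx) > 1u) ++idx;
  return idx;
}

int main() {
  printf("%u %u %u\n", TrailingZeroCount32(0x50u), TrailingZeroCount32(1u),
         TrailingZeroCount32(0u));  // 4 0 32
}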
-HWY_DIAGNOSTICS(pop) diff --git a/deps/highway/include/hwy/per_target.h b/deps/highway/include/hwy/per_target.h deleted file mode 100644 index 52c316ec..00000000 --- a/deps/highway/include/hwy/per_target.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2022 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_PER_TARGET_H_ -#define HIGHWAY_HWY_PER_TARGET_H_ - -#include - -#include "hwy/highway_export.h" - -// Functions to query the capabilities of the target that will be called by -// HWY_DYNAMIC_DISPATCH, which is not necessarily the current target. - -namespace hwy { - -// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag())`. -// -// Do not cache the result, which may change after calling DisableTargets, or -// if software requests a different vector size (e.g. when entering/exiting SME -// streaming mode). Instead call this right before the code that depends on the -// result, without any DisableTargets or SME transition in-between. Note that -// this involves an indirect call, so prefer not to call this frequently nor -// unnecessarily. -HWY_DLLEXPORT size_t VectorBytes(); - -// Returns whether 16/64-bit floats are a supported lane type. -HWY_DLLEXPORT bool HaveFloat16(); -HWY_DLLEXPORT bool HaveFloat64(); - -} // namespace hwy - -#endif // HIGHWAY_HWY_PER_TARGET_H_ diff --git a/deps/highway/include/hwy/print-inl.h b/deps/highway/include/hwy/print-inl.h deleted file mode 100644 index 46881a29..00000000 --- a/deps/highway/include/hwy/print-inl.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2022 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Print() function - -#include "hwy/highway.h" -#include "hwy/print.h" - -// Per-target include guard -#if defined(HIGHWAY_HWY_PRINT_INL_H_) == defined(HWY_TARGET_TOGGLE) -#ifdef HIGHWAY_HWY_PRINT_INL_H_ -#undef HIGHWAY_HWY_PRINT_INL_H_ -#else -#define HIGHWAY_HWY_PRINT_INL_H_ -#endif - -#if HWY_TARGET == HWY_RVV -#include "hwy/aligned_allocator.h" -#endif - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { - -// Prints lanes around `lane`, in memory order. 
-template > -HWY_API void Print(const D d, const char* caption, V v, size_t lane_u = 0, - size_t max_lanes = 7) { - const size_t N = Lanes(d); - using T = TFromD; -#if HWY_TARGET == HWY_RVV - auto storage = AllocateAligned(N); - T* HWY_RESTRICT lanes = storage.get(); -#else - // This works around an SVE compile error on GCC 11 and 12. Calling - // AllocateAligned here would seem to require it be marked with HWY_ATTR. - HWY_ALIGN T lanes[MaxLanes(d)]; -#endif - Store(v, d, lanes); - - const auto info = hwy::detail::MakeTypeInfo(); - hwy::detail::PrintArray(info, caption, lanes, N, lane_u, max_lanes); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); - -#endif // per-target include guard diff --git a/deps/highway/include/hwy/print.h b/deps/highway/include/hwy/print.h deleted file mode 100644 index e61631e6..00000000 --- a/deps/highway/include/hwy/print.h +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2022 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HWY_PRINT_H_ -#define HWY_PRINT_H_ - -// Helpers for printing vector lanes. - -#include -#include - -#include "hwy/base.h" -#include "hwy/highway_export.h" - -namespace hwy { - -namespace detail { - -// For implementing value comparisons etc. as type-erased functions to reduce -// template bloat. -struct TypeInfo { - size_t sizeof_t; - bool is_float; - bool is_signed; - bool is_bf16; -}; - -template -HWY_INLINE TypeInfo MakeTypeInfo() { - TypeInfo info; - info.sizeof_t = sizeof(T); - info.is_float = IsFloat(); - info.is_signed = IsSigned(); - info.is_bf16 = IsSame(); - return info; -} - -HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100); -HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, - char* string100); - -HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption, - const void* array_void, size_t N, - size_t lane_u = 0, size_t max_lanes = 7); - -} // namespace detail - -template -HWY_NOINLINE void PrintValue(T value) { - char str[100]; - detail::ToString(hwy::detail::MakeTypeInfo(), &value, str); - fprintf(stderr, "%s,", str); -} - -template -HWY_NOINLINE void PrintArray(const T* value, size_t count) { - detail::PrintArray(hwy::detail::MakeTypeInfo(), "", value, count, 0, - count); -} - -} // namespace hwy - -#endif // HWY_PRINT_H_ diff --git a/deps/highway/include/hwy/robust_statistics.h b/deps/highway/include/hwy/robust_statistics.h deleted file mode 100644 index 1cf3e5d2..00000000 --- a/deps/highway/include/hwy/robust_statistics.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_ROBUST_STATISTICS_H_ -#define HIGHWAY_HWY_ROBUST_STATISTICS_H_ - -#include // std::sort, std::find_if -#include -#include // std::pair -#include - -#include "hwy/base.h" - -namespace hwy { -namespace robust_statistics { - -// Sorts integral values in ascending order (e.g. for Mode). About 3x faster -// than std::sort for input distributions with very few unique values. -template -void CountingSort(T* values, size_t num_values) { - // Unique values and their frequency (similar to flat_map). - using Unique = std::pair; - std::vector unique; - for (size_t i = 0; i < num_values; ++i) { - const T value = values[i]; - const auto pos = - std::find_if(unique.begin(), unique.end(), - [value](const Unique u) { return u.first == value; }); - if (pos == unique.end()) { - unique.push_back(std::make_pair(value, 1)); - } else { - ++pos->second; - } - } - - // Sort in ascending order of value (pair.first). - std::sort(unique.begin(), unique.end()); - - // Write that many copies of each unique value to the array. - T* HWY_RESTRICT p = values; - for (const auto& value_count : unique) { - std::fill(p, p + value_count.second, value_count.first); - p += value_count.second; - } - HWY_ASSERT(p == values + num_values); -} - -// @return i in [idx_begin, idx_begin + half_count) that minimizes -// sorted[i + half_count] - sorted[i]. -template -size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin, - const size_t half_count) { - T min_range = std::numeric_limits::max(); - size_t min_idx = 0; - - for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) { - HWY_ASSERT(sorted[idx] <= sorted[idx + half_count]); - const T range = sorted[idx + half_count] - sorted[idx]; - if (range < min_range) { - min_range = range; - min_idx = idx; - } - } - - return min_idx; -} - -// Returns an estimate of the mode by calling MinRange on successively -// halved intervals. "sorted" must be in ascending order. This is the -// Half Sample Mode estimator proposed by Bickel in "On a fast, robust -// estimator of the mode", with complexity O(N log N). The mode is less -// affected by outliers in highly-skewed distributions than the median. -// The averaging operation below assumes "T" is an unsigned integer type. -template -T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) { - size_t idx_begin = 0; - size_t half_count = num_values / 2; - while (half_count > 1) { - idx_begin = MinRange(sorted, idx_begin, half_count); - half_count >>= 1; - } - - const T x = sorted[idx_begin + 0]; - if (half_count == 0) { - return x; - } - HWY_ASSERT(half_count == 1); - const T average = (x + sorted[idx_begin + 1] + 1) / 2; - return average; -} - -// Returns the mode. Side effect: sorts "values". -template -T Mode(T* values, const size_t num_values) { - CountingSort(values, num_values); - return ModeOfSorted(values, num_values); -} - -template -T Mode(T (&values)[N]) { - return Mode(&values[0], N); -} - -// Returns the median value. Side effect: sorts "values". 
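A worked example of the Half Sample Mode estimator above: repeatedly keep the densest half-width window of the sorted data (MinRange), then average the final pair (ModeOfSorted). Standalone plain-C++ sketch using std::sort instead of CountingSort; the sample values are made up for illustration:

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> v = {5, 4, 5, 900, 5, 6, 5, 3};  // cluster at 5 plus one outlier
  std::sort(v.begin(), v.end());                         // 3 4 5 5 5 5 6 900
  size_t begin = 0, half = v.size() / 2;
  while (half > 1) {
    size_t best = begin;  // MinRange: start of the tightest window of width `half`
    for (size_t i = begin; i < begin + half; ++i) {
      if (v[i + half] - v[i] < v[best + half] - v[best]) best = i;
    }
    begin = best;
    half >>= 1;
  }
  // Average of the two remaining values, as in ModeOfSorted.
  printf("mode ~= %u\n", (v[begin] + v[begin + 1] + 1) / 2);  // 5; the 900 has no effect
}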
-template -T Median(T* values, const size_t num_values) { - HWY_ASSERT(num_values != 0); - std::sort(values, values + num_values); - const size_t half = num_values / 2; - // Odd count: return middle - if (num_values % 2) { - return values[half]; - } - // Even count: return average of middle two. - return (values[half] + values[half - 1] + 1) / 2; -} - -// Returns a robust measure of variability. -template -T MedianAbsoluteDeviation(const T* values, const size_t num_values, - const T median) { - HWY_ASSERT(num_values != 0); - std::vector abs_deviations; - abs_deviations.reserve(num_values); - for (size_t i = 0; i < num_values; ++i) { - const int64_t abs = std::abs(static_cast(values[i]) - - static_cast(median)); - abs_deviations.push_back(static_cast(abs)); - } - return Median(abs_deviations.data(), num_values); -} - -} // namespace robust_statistics -} // namespace hwy - -#endif // HIGHWAY_HWY_ROBUST_STATISTICS_H_ diff --git a/deps/highway/include/hwy/targets.h b/deps/highway/include/hwy/targets.h deleted file mode 100644 index 693e2e80..00000000 --- a/deps/highway/include/hwy/targets.h +++ /dev/null @@ -1,338 +0,0 @@ -// Copyright 2020 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_TARGETS_H_ -#define HIGHWAY_HWY_TARGETS_H_ - -// Allows opting out of C++ standard library usage, which is not available in -// some Compiler Explorer environments. -#ifndef HWY_NO_LIBCXX -#include -#endif - -// For SIMD module implementations and their callers. Defines which targets to -// generate and call. - -#include "hwy/base.h" -#include "hwy/detect_targets.h" -#include "hwy/highway_export.h" - -#if !HWY_ARCH_RVV && !defined(HWY_NO_LIBCXX) -#include -#endif - -namespace hwy { - -// Returns bitfield of enabled targets that are supported on this CPU; there is -// always at least one such target, hence the return value is never 0. The -// targets returned may change after calling DisableTargets. This function is -// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding -// calls to it if there is only a single target enabled. -HWY_DLLEXPORT int64_t SupportedTargets(); - -// Evaluates to a function call, or literal if there is a single target. -#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 -#define HWY_SUPPORTED_TARGETS HWY_TARGETS -#else -#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets() -#endif - -// Subsequent SupportedTargets will not return targets whose bit(s) are set in -// `disabled_targets`. Exception: if SupportedTargets would return 0, it will -// instead return HWY_STATIC_TARGET (there must always be one target to call). -// -// This function is useful for disabling targets known to be buggy, or if the -// best available target is undesirable (perhaps due to throttling or memory -// bandwidth limitations). Use SetSupportedTargetsForTest instead of this -// function for iteratively enabling specific targets for testing. 
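The Median above rounds the average of the two middle elements for even counts, and MedianAbsoluteDeviation is the median of the absolute deviations from that median. A small worked example in plain C++ (not calling hwy::robust_statistics; the numbers are illustrative):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t Median(std::vector<uint64_t> v) {
  std::sort(v.begin(), v.end());
  const size_t half = v.size() / 2;
  if (v.size() % 2) return v[half];
  return (v[half] + v[half - 1] + 1) / 2;  // rounded average of the middle two
}

int main() {
  const std::vector<uint64_t> samples = {10, 12, 11, 95, 11, 13, 12};
  const uint64_t median = Median(samples);  // sorted: 10 11 11 12 12 13 95 -> 12
  std::vector<uint64_t> dev;
  for (uint64_t s : samples) {
    const int64_t d = int64_t(s) - int64_t(median);
    dev.push_back(uint64_t(d < 0 ? -d : d));
  }
  printf("median=%llu mad=%llu\n", (unsigned long long)median,
         (unsigned long long)Median(dev));  // deviations 2 0 1 83 1 1 0 -> MAD 1
}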
-HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets); - -// Subsequent SupportedTargets will return the given set of targets, except -// those disabled via DisableTargets. Call with a mask of 0 to disable the mock -// and return to the normal SupportedTargets behavior. Used to run tests for -// all targets. -HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets); - -#ifndef HWY_NO_LIBCXX - -// Return the list of targets in HWY_TARGETS supported by the CPU as a list of -// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list -// is affected by the current SetSupportedTargetsForTest() mock if any. -HWY_INLINE std::vector SupportedAndGeneratedTargets() { - std::vector ret; - for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0; - targets = targets & (targets - 1)) { - int64_t current_target = targets & ~(targets - 1); - ret.push_back(current_target); - } - return ret; -} - -#endif // HWY_NO_LIBCXX - -static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) { - switch (target) { -#if HWY_ARCH_X86 - case HWY_SSE2: - return "SSE2"; - case HWY_SSSE3: - return "SSSE3"; - case HWY_SSE4: - return "SSE4"; - case HWY_AVX2: - return "AVX2"; - case HWY_AVX3: - return "AVX3"; - case HWY_AVX3_DL: - return "AVX3_DL"; - case HWY_AVX3_ZEN4: - return "AVX3_ZEN4"; - case HWY_AVX3_SPR: - return "AVX3_SPR"; -#endif - -#if HWY_ARCH_ARM - case HWY_SVE2_128: - return "SVE2_128"; - case HWY_SVE_256: - return "SVE_256"; - case HWY_SVE2: - return "SVE2"; - case HWY_SVE: - return "SVE"; - case HWY_NEON: - return "NEON"; - case HWY_NEON_WITHOUT_AES: - return "NEON_WITHOUT_AES"; -#endif - -#if HWY_ARCH_PPC - case HWY_PPC8: - return "PPC8"; - case HWY_PPC9: - return "PPC9"; - case HWY_PPC10: - return "PPC10"; -#endif - -#if HWY_ARCH_WASM - case HWY_WASM: - return "WASM"; - case HWY_WASM_EMU256: - return "WASM_EMU256"; -#endif - -#if HWY_ARCH_RVV - case HWY_RVV: - return "RVV"; -#endif - - case HWY_EMU128: - return "EMU128"; - case HWY_SCALAR: - return "SCALAR"; - - default: - return "Unknown"; // must satisfy gtest IsValidParamName() - } -} - -// The maximum number of dynamic targets on any architecture is defined by -// HWY_MAX_DYNAMIC_TARGETS and depends on the arch. - -// For the ChosenTarget mask and index we use a different bit arrangement than -// in the HWY_TARGETS mask. Only the targets involved in the current -// architecture are used in this mask, and therefore only the least significant -// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least -// significant bit is set when the mask is not initialized, the next -// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the -// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to -// that position and the next more significant bit is used for HWY_SCALAR (if -// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to -// define equivalent values for HWY_TARGETS in this representation. -// This mask representation allows to use ctz() on this mask and obtain a small -// number that's used as an index of the table for dynamic dispatch. In this -// way the first entry is used when the mask is uninitialized, the following -// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for -// scalar. - -// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format. 
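SupportedAndGeneratedTargets above walks a bitfield one target at a time: x & ~(x - 1) isolates the lowest set bit and x & (x - 1) clears it. Standalone sketch of that loop shape in plain C++ (the bit values here are stand-ins, not real HWY_* target constants):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t kA = 1 << 2, kB = 1 << 5, kC = 1 << 9;  // stand-ins for target bits
  std::vector<int64_t> individual;
  for (int64_t targets = kA | kB | kC; targets != 0; targets &= (targets - 1)) {
    individual.push_back(targets & ~(targets - 1));  // lowest remaining bit
  }
  for (int64_t t : individual) printf("%#llx ", (unsigned long long)t);  // 0x4 0x20 0x200
  printf("\n");
}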
-#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1)) - -// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the -// current architecture. -#define HWY_CHOSEN_TARGET_SHIFT(X) \ - ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \ - ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \ - << 1) - -// The HWY_TARGETS mask in the ChosenTarget mask format. -#define HWY_CHOSEN_TARGET_MASK_TARGETS \ - (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL) - -#if HWY_ARCH_X86 -// Maximum number of dynamic targets, changing this value is an ABI incompatible -// change -#define HWY_MAX_DYNAMIC_TARGETS 15 -#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86 -// These must match the order in which the HWY_TARGETS are defined -// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 - -// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly -// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry -// corresponds to the best target. Don't include a "," at the end of the list. -#define HWY_CHOOSE_TARGET_LIST(func_name) \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \ - HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \ - HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \ - HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \ - nullptr, /* AVX */ \ - HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \ - HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \ - nullptr, /* reserved - SSE3? */ \ - HWY_CHOOSE_SSE2(func_name) /* SSE2 */ - -#elif HWY_ARCH_ARM -// See HWY_ARCH_X86 above for details. -#define HWY_MAX_DYNAMIC_TARGETS 15 -#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM -#define HWY_CHOOSE_TARGET_LIST(func_name) \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \ - HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \ - HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \ - HWY_CHOOSE_SVE(func_name), /* SVE */ \ - HWY_CHOOSE_NEON(func_name), /* NEON */ \ - HWY_CHOOSE_NEON_WITHOUT_AES(func_name) /* NEON without AES */ - -#elif HWY_ARCH_RVV -// See HWY_ARCH_X86 above for details. -#define HWY_MAX_DYNAMIC_TARGETS 9 -#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV -#define HWY_CHOOSE_TARGET_LIST(func_name) \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_RVV(func_name), /* RVV */ \ - nullptr /* reserved */ - -#elif HWY_ARCH_PPC -// See HWY_ARCH_X86 above for details. -#define HWY_MAX_DYNAMIC_TARGETS 9 -#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC -#define HWY_CHOOSE_TARGET_LIST(func_name) \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_PPC10(func_name), /* PPC10 */ \ - HWY_CHOOSE_PPC9(func_name), /* PPC9 */ \ - HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \ - nullptr, /* reserved (VSX or AltiVec) */ \ - nullptr /* reserved (VSX or AltiVec) */ - -#elif HWY_ARCH_WASM -// See HWY_ARCH_X86 above for details. 
-#define HWY_MAX_DYNAMIC_TARGETS 9 -#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM -#define HWY_CHOOSE_TARGET_LIST(func_name) \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - nullptr, /* reserved */ \ - HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \ - HWY_CHOOSE_WASM(func_name), /* WASM */ \ - nullptr /* reserved */ - -#else -// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though -// still creating single-entry tables in HWY_EXPORT to ensure portability. -#define HWY_MAX_DYNAMIC_TARGETS 1 -#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR -#endif - -// Bitfield of supported and enabled targets. The format differs from that of -// HWY_TARGETS; the lowest bit governs the first function pointer (which is -// special in that it calls FunctionCache, then Update, then dispatches to the -// actual implementation) in the tables created by HWY_EXPORT. Monostate (see -// GetChosenTarget), thread-safe except on RVV. -struct ChosenTarget { - public: - // Reset bits according to `targets` (typically the return value of - // SupportedTargets()). Postcondition: IsInitialized() == true. - void Update(int64_t targets) { - // These are `targets` shifted downwards, see above. Also include SCALAR - // (corresponds to the last entry in the function table) as fallback. - StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR); - } - - // Reset to the uninitialized state, so that FunctionCache will call Update - // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false. - void DeInit() { StoreMask(1); } - - // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH - // function was called, which we check in tests. - bool IsInitialized() const { return LoadMask() != 1; } - - // Return the index in the dynamic dispatch table to be used by the current - // CPU. Note that this method must be in the header file so it uses the value - // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that - // calls it, which may be different from others. This means we only enable - // those targets that were actually compiled in this module. - size_t HWY_INLINE GetIndex() const { - return hwy::Num0BitsBelowLS1Bit_Nonzero64( - static_cast(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS)); - } - - private: - // TODO(janwas): remove RVV once is available -#if HWY_ARCH_RVV || defined(HWY_NO_LIBCXX) - int64_t LoadMask() const { return mask_; } - void StoreMask(int64_t mask) { mask_ = mask; } - - int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0. -#else - int64_t LoadMask() const { return mask_.load(); } - void StoreMask(int64_t mask) { mask_.store(mask); } - - std::atomic mask_{1}; // Initialized to 1 so GetIndex() returns 0. -#endif // HWY_ARCH_RVV -}; - -// For internal use (e.g. by FunctionCache and DisableTargets). -HWY_DLLEXPORT ChosenTarget& GetChosenTarget(); - -} // namespace hwy - -#endif // HIGHWAY_HWY_TARGETS_H_ diff --git a/deps/highway/include/hwy/timer-inl.h b/deps/highway/include/hwy/timer-inl.h deleted file mode 100644 index c286b0a8..00000000 --- a/deps/highway/include/hwy/timer-inl.h +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// High-resolution and high-precision timer - -// Per-target include guard -#if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE) -#ifdef HIGHWAY_HWY_TIMER_INL_H_ -#undef HIGHWAY_HWY_TIMER_INL_H_ -#else -#define HIGHWAY_HWY_TIMER_INL_H_ -#endif - -#include "hwy/highway.h" -#include "hwy/timer.h" - -#if defined(_WIN32) || defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif // NOMINMAX -#include -#endif - -#if defined(__APPLE__) -#include -#include -#endif - -#if defined(__HAIKU__) -#include -#endif - -#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) -#include // NOLINT __ppc_get_timebase_freq -#endif - -#if HWY_ARCH_X86 && HWY_COMPILER_MSVC -#include -#endif - -#include // clock_gettime - -HWY_BEFORE_NAMESPACE(); -namespace hwy { -namespace HWY_NAMESPACE { -namespace timer { - -// Ticks := platform-specific timer values (CPU cycles on x86). Must be -// unsigned to guarantee wraparound on overflow. -using Ticks = uint64_t; - -// Start/Stop return absolute timestamps and must be placed immediately before -// and after the region to measure. We provide separate Start/Stop functions -// because they use different fences. -// -// Background: RDTSC is not 'serializing'; earlier instructions may complete -// after it, and/or later instructions may complete before it. 'Fences' ensure -// regions' elapsed times are independent of such reordering. The only -// documented unprivileged serializing instruction is CPUID, which acts as a -// full fence (no reordering across it in either direction). Unfortunately -// the latency of CPUID varies wildly (perhaps made worse by not initializing -// its EAX input). Because it cannot reliably be deducted from the region's -// elapsed time, it must not be included in the region to measure (i.e. -// between the two RDTSC). -// -// The newer RDTSCP is sometimes described as serializing, but it actually -// only serves as a half-fence with release semantics. Although all -// instructions in the region will complete before the final timestamp is -// captured, subsequent instructions may leak into the region and increase the -// elapsed time. Inserting another fence after the final RDTSCP would prevent -// such reordering without affecting the measured region. -// -// Fortunately, such a fence exists. The LFENCE instruction is only documented -// to delay later loads until earlier loads are visible. However, Intel's -// reference manual says it acts as a full fence (waiting until all earlier -// instructions have completed, and delaying later instructions until it -// completes). AMD assigns the same behavior to MFENCE. -// -// We need a fence before the initial RDTSC to prevent earlier instructions -// from leaking into the region, and arguably another after RDTSC to avoid -// region instructions from completing before the timestamp is recorded. -// When surrounded by fences, the additional RDTSCP half-fence provides no -// benefit, so the initial timestamp can be recorded via RDTSC, which has -// lower overhead than RDTSCP because it does not read TSC_AUX. 
In summary, -// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. -// -// Using Start+Start leads to higher variance and overhead than Stop+Stop. -// However, Stop+Stop includes an LFENCE in the region measurements, which -// adds a delay dependent on earlier loads. The combination of Start+Stop -// is faster than Start+Start and more consistent than Stop+Stop because -// the first LFENCE already delayed subsequent loads before the measured -// region. This combination seems not to have been considered in prior work: -// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c -// -// Note: performance counters can measure 'exact' instructions-retired or -// (unhalted) cycle counts. The RDPMC instruction is not serializing and also -// requires fences. Unfortunately, it is not accessible on all OSes and we -// prefer to avoid kernel-mode drivers. Performance counters are also affected -// by several under/over-count errata, so we use the TSC instead. - -// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, -// divide by InvariantTicksPerSecond. -inline Ticks Start() { - Ticks t; -#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) - asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); -#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC - // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. - asm volatile("mrs %0, cntvct_el0" : "=r"(t)); -#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); - t = __rdtsc(); - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); -#elif HWY_ARCH_X86_64 - asm volatile( - "lfence\n\t" - "rdtsc\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rdx = TSC >> 32. - // "cc" = flags modified by SHL. - : "rdx", "memory", "cc"); -#elif HWY_ARCH_RVV - asm volatile("rdtime %0" : "=r"(t)); -#elif defined(_WIN32) || defined(_WIN64) - LARGE_INTEGER counter; - (void)QueryPerformanceCounter(&counter); - t = counter.QuadPart; -#elif defined(__APPLE__) - t = mach_absolute_time(); -#elif defined(__HAIKU__) - t = system_time_nsecs(); // since boot -#else // POSIX - timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec); -#endif - return t; -} - -// WARNING: on x86, caller must check HasRDTSCP before using this! -inline Ticks Stop() { - uint64_t t; -#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) - asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); -#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC - // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU. - asm volatile("mrs %0, cntvct_el0" : "=r"(t)); -#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC - _ReadWriteBarrier(); - unsigned aux; - t = __rdtscp(&aux); - _ReadWriteBarrier(); - _mm_lfence(); - _ReadWriteBarrier(); -#elif HWY_ARCH_X86_64 - // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). - asm volatile( - "rdtscp\n\t" - "shl $32, %%rdx\n\t" - "or %%rdx, %0\n\t" - "lfence" - : "=a"(t) - : - // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. - // "cc" = flags modified by SHL.
- : "rcx", "rdx", "memory", "cc"); -#else - t = Start(); -#endif - return t; -} - -} // namespace timer - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace hwy -HWY_AFTER_NAMESPACE(); - -#endif // per-target include guard diff --git a/deps/highway/include/hwy/timer.h b/deps/highway/include/hwy/timer.h deleted file mode 100644 index 0ca46e24..00000000 --- a/deps/highway/include/hwy/timer.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HIGHWAY_HWY_TIMER_H_ -#define HIGHWAY_HWY_TIMER_H_ - -// Platform-specific timer functions. Provides Now() and functions for -// interpreting and converting the timer-inl.h Ticks. - -#include <stdint.h> - -#include "hwy/highway_export.h" - -namespace hwy { -namespace platform { - -// Returns current timestamp [in seconds] relative to an unspecified origin. -// Features: monotonic (no negative elapsed time), steady (unaffected by system -// time changes), high-resolution (on the order of microseconds). -// Uses InvariantTicksPerSecond and the baseline version of timer::Start(). -HWY_DLLEXPORT double Now(); - -// Functions for use with timer-inl.h: - -// Returns whether it is safe to call timer::Stop without executing an illegal -// instruction; if false, fills cpu100 (a pointer to a 100 character buffer) -// with the CPU brand string or an empty string if unknown. -HWY_DLLEXPORT bool HaveTimerStop(char* cpu100); - -// Returns tick rate, useful for converting timer::Ticks to seconds. Invariant -// means the tick counter frequency is independent of CPU throttling or sleep. -// This call may be expensive, callers should cache the result. -HWY_DLLEXPORT double InvariantTicksPerSecond(); - -// Returns ticks elapsed in back to back timer calls, i.e. a function of the -// timer resolution (minimum measurable difference) and overhead. -// This call is expensive, callers should cache the result. -HWY_DLLEXPORT uint64_t TimerResolution(); - -} // namespace platform -} // namespace hwy - -#endif // HIGHWAY_HWY_TIMER_H_ diff --git a/deps/highway/lib/cmake/hwy/hwy-config-release.cmake b/deps/highway/lib/cmake/hwy/hwy-config-release.cmake deleted file mode 100644 index 490d1014..00000000 --- a/deps/highway/lib/cmake/hwy/hwy-config-release.cmake +++ /dev/null @@ -1,19 +0,0 @@ -#---------------------------------------------------------------- -# Generated CMake target import file for configuration "Release". -#---------------------------------------------------------------- - -# Commands may need to know the format version.
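The hwy/timer.h interface deleted above is usable on its own. A minimal sketch of how a caller might combine Now(), HaveTimerStop() and the conversion helpers; the measured loop is an arbitrary placeholder, not part of the Highway sources.

    #include <cstdio>

    #include "hwy/timer.h"

    int main() {
      char cpu[100];
      // If this returns false, timer::Stop() (RDTSCP) must not be used; Now()
      // remains safe because it is built on the baseline timer::Start().
      if (!hwy::platform::HaveTimerStop(cpu)) {
        std::printf("RDTSCP unavailable (CPU: '%s')\n", cpu);
      }

      const double ticks_per_sec = hwy::platform::InvariantTicksPerSecond();
      const double t0 = hwy::platform::Now();
      double sum = 0.0;  // arbitrary workload to measure
      for (int i = 0; i < 1000000; ++i) sum += i * 1e-9;
      const double elapsed = hwy::platform::Now() - t0;

      std::printf("sum=%.3f elapsed=%.6fs (%.0f ticks/s, resolution ~%llu ticks)\n",
                  sum, elapsed, ticks_per_sec,
                  static_cast<unsigned long long>(hwy::platform::TimerResolution()));
      return 0;
    }

Both InvariantTicksPerSecond() and TimerResolution() are documented above as expensive, so a real caller would cache their results rather than query them per measurement.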
-set(CMAKE_IMPORT_FILE_VERSION 1) - -# Import target "hwy::hwy" for configuration "Release" -set_property(TARGET hwy::hwy APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) -set_target_properties(hwy::hwy PROPERTIES - IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" - IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libhwy.a" - ) - -list(APPEND _cmake_import_check_targets hwy::hwy ) -list(APPEND _cmake_import_check_files_for_hwy::hwy "${_IMPORT_PREFIX}/lib/libhwy.a" ) - -# Commands beyond this point should not need to know the version. -set(CMAKE_IMPORT_FILE_VERSION) diff --git a/deps/highway/lib/cmake/hwy/hwy-config-version.cmake b/deps/highway/lib/cmake/hwy/hwy-config-version.cmake deleted file mode 100644 index 1555f92e..00000000 --- a/deps/highway/lib/cmake/hwy/hwy-config-version.cmake +++ /dev/null @@ -1,70 +0,0 @@ -# This is a basic version file for the Config-mode of find_package(). -# It is used by write_basic_package_version_file() as input file for configure_file() -# to create a version-file which can be installed along a config.cmake file. -# -# The created file sets PACKAGE_VERSION_EXACT if the current version string and -# the requested version string are exactly the same and it sets -# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version, -# but only if the requested major version is the same as the current one. -# The variable CVF_VERSION must be set before calling configure_file(). - - -set(PACKAGE_VERSION "1.0.6") - -if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION) - set(PACKAGE_VERSION_COMPATIBLE FALSE) -else() - - if("1.0.6" MATCHES "^([0-9]+)\\.") - set(CVF_VERSION_MAJOR "${CMAKE_MATCH_1}") - if(NOT CVF_VERSION_MAJOR VERSION_EQUAL 0) - string(REGEX REPLACE "^0+" "" CVF_VERSION_MAJOR "${CVF_VERSION_MAJOR}") - endif() - else() - set(CVF_VERSION_MAJOR "1.0.6") - endif() - - if(PACKAGE_FIND_VERSION_RANGE) - # both endpoints of the range must have the expected major version - math (EXPR CVF_VERSION_MAJOR_NEXT "${CVF_VERSION_MAJOR} + 1") - if (NOT PACKAGE_FIND_VERSION_MIN_MAJOR STREQUAL CVF_VERSION_MAJOR - OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND NOT PACKAGE_FIND_VERSION_MAX_MAJOR STREQUAL CVF_VERSION_MAJOR) - OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND NOT PACKAGE_FIND_VERSION_MAX VERSION_LESS_EQUAL CVF_VERSION_MAJOR_NEXT))) - set(PACKAGE_VERSION_COMPATIBLE FALSE) - elseif(PACKAGE_FIND_VERSION_MIN_MAJOR STREQUAL CVF_VERSION_MAJOR - AND ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS_EQUAL PACKAGE_FIND_VERSION_MAX) - OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MAX))) - set(PACKAGE_VERSION_COMPATIBLE TRUE) - else() - set(PACKAGE_VERSION_COMPATIBLE FALSE) - endif() - else() - if(PACKAGE_FIND_VERSION_MAJOR STREQUAL CVF_VERSION_MAJOR) - set(PACKAGE_VERSION_COMPATIBLE TRUE) - else() - set(PACKAGE_VERSION_COMPATIBLE FALSE) - endif() - - if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION) - set(PACKAGE_VERSION_EXACT TRUE) - endif() - endif() -endif() - - -# if the installed project requested no architecture check, don't perform the check -if("FALSE") - return() -endif() - -# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: -if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "8" STREQUAL "") - return() -endif() - -# check that the installed version has the same 32/64bit-ness as the one which is currently searching: -if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "8") - math(EXPR installedBits "8 * 8") - 
set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") - set(PACKAGE_VERSION_UNSUITABLE TRUE) -endif() diff --git a/deps/highway/lib/cmake/hwy/hwy-config.cmake b/deps/highway/lib/cmake/hwy/hwy-config.cmake deleted file mode 100644 index cd098632..00000000 --- a/deps/highway/lib/cmake/hwy/hwy-config.cmake +++ /dev/null @@ -1,104 +0,0 @@ -# Generated by CMake - -if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8) - message(FATAL_ERROR "CMake >= 2.8.0 required") -endif() -if(CMAKE_VERSION VERSION_LESS "2.8.3") - message(FATAL_ERROR "CMake >= 2.8.3 required") -endif() -cmake_policy(PUSH) -cmake_policy(VERSION 2.8.3...3.23) -#---------------------------------------------------------------- -# Generated CMake target import file. -#---------------------------------------------------------------- - -# Commands may need to know the format version. -set(CMAKE_IMPORT_FILE_VERSION 1) - -# Protect against multiple inclusion, which would fail when already imported targets are added once more. -set(_cmake_targets_defined "") -set(_cmake_targets_not_defined "") -set(_cmake_expected_targets "") -foreach(_cmake_expected_target IN ITEMS hwy::hwy) - list(APPEND _cmake_expected_targets "${_cmake_expected_target}") - if(TARGET "${_cmake_expected_target}") - list(APPEND _cmake_targets_defined "${_cmake_expected_target}") - else() - list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}") - endif() -endforeach() -unset(_cmake_expected_target) -if(_cmake_targets_defined STREQUAL _cmake_expected_targets) - unset(_cmake_targets_defined) - unset(_cmake_targets_not_defined) - unset(_cmake_expected_targets) - unset(CMAKE_IMPORT_FILE_VERSION) - cmake_policy(POP) - return() -endif() -if(NOT _cmake_targets_defined STREQUAL "") - string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}") - string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}") - message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n") -endif() -unset(_cmake_targets_defined) -unset(_cmake_targets_not_defined) -unset(_cmake_expected_targets) - - -# Compute the installation prefix relative to this file. -get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) -get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) -get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) -get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH) -if(_IMPORT_PREFIX STREQUAL "/") - set(_IMPORT_PREFIX "") -endif() - -# Create imported target hwy::hwy -add_library(hwy::hwy STATIC IMPORTED) - -set_target_properties(hwy::hwy PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "TOOLCHAIN_MISS_SYS_AUXV_H;TOOLCHAIN_MISS_ASM_HWCAP_H;HWY_STATIC_DEFINE" - INTERFACE_COMPILE_FEATURES "cxx_std_11" - INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include" -) - -# Load information for each installed configuration. -file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/hwy-config-*.cmake") -foreach(_cmake_config_file IN LISTS _cmake_config_files) - include("${_cmake_config_file}") -endforeach() -unset(_cmake_config_file) -unset(_cmake_config_files) - -# Cleanup temporary variables. 
-set(_IMPORT_PREFIX) - -# Loop over all imported files and verify that they actually exist -foreach(_cmake_target IN LISTS _cmake_import_check_targets) - foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}") - if(NOT EXISTS "${_cmake_file}") - message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file - \"${_cmake_file}\" -but this file does not exist. Possible reasons include: -* The file was deleted, renamed, or moved to another location. -* An install or uninstall procedure did not complete successfully. -* The installation package was faulty and contained - \"${CMAKE_CURRENT_LIST_FILE}\" -but not all the files it references. -") - endif() - endforeach() - unset(_cmake_file) - unset("_cmake_import_check_files_for_${_cmake_target}") -endforeach() -unset(_cmake_target) -unset(_cmake_import_check_targets) - -# This file does not depend on other imported targets which have -# been exported from the same project but in a separate export set. - -# Commands beyond this point should not need to know the version. -set(CMAKE_IMPORT_FILE_VERSION) -cmake_policy(POP) diff --git a/deps/highway/lib/libhwy.a b/deps/highway/lib/libhwy.a deleted file mode 100644 index ef03477b..00000000 Binary files a/deps/highway/lib/libhwy.a and /dev/null differ diff --git a/deps/highway/lib/pkgconfig/libhwy.pc b/deps/highway/lib/pkgconfig/libhwy.pc deleted file mode 100644 index bf14ef07..00000000 --- a/deps/highway/lib/pkgconfig/libhwy.pc +++ /dev/null @@ -1,10 +0,0 @@ -prefix= -exec_prefix=${prefix} -libdir=${exec_prefix}/lib -includedir=${prefix}/include - -Name: libhwy -Description: Efficient and performance-portable SIMD wrapper -Version: 1.0.6 -Libs: -L${libdir} -lhwy -Cflags: -I${includedir} -DHWY_STATIC_DEFINE
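For reference, after this removal a consumer would obtain compile and link flags from an externally installed copy, either via the hwy::hwy CMake target above or via "pkg-config --cflags --libs libhwy", and then use the headers in the usual static-dispatch way. A minimal sketch using Highway's documented public API; nothing here is specific to this diff, and the lane arithmetic is an arbitrary example.

    #include <cstdio>

    #include "hwy/highway.h"

    namespace hn = hwy::HWY_NAMESPACE;  // best target enabled at compile time

    int main() {
      const hn::ScalableTag<float> d;  // descriptor for the full vector width
      auto v = hn::Set(d, 2.0f);       // broadcast 2.0f to all lanes
      v = hn::Add(v, v);               // 4.0f in every lane
      std::printf("lanes=%zu first=%g\n", hn::Lanes(d), hn::GetLane(v));
      return 0;
    }

Note that both the imported target (INTERFACE_COMPILE_DEFINITIONS) and the .pc Cflags define HWY_STATIC_DEFINE, which appears to be there to disable DLL import/export annotations so the static libhwy.a above links cleanly.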