deps: remove highway files

revert 97f7b194d3
This commit is contained in:
alexey.lysiuk 2023-08-26 09:42:00 +03:00
parent 8b7cf20552
commit 1b141dcfec
35 changed files with 0 additions and 69716 deletions

View File

@ -1,211 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
// Memory allocator with support for alignment and offsets.
#include <memory>
#include <utility>
#include "hwy/base.h"
namespace hwy {
// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
// requires a literal. This matches typical L1 cache line sizes, which prevents
// false sharing.
#define HWY_ALIGNMENT 64
// Pointers to functions equivalent to malloc/free with an opaque void* passed
// to them.
using AllocPtr = void* (*)(void* opaque, size_t bytes);
using FreePtr = void (*)(void* opaque, void* memory);
// Returns null or a pointer to at least `payload_size` (which can be zero)
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
// memory or malloc() if it is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
AllocPtr alloc_ptr, void* opaque_ptr);
// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
FreePtr free_ptr, void* opaque_ptr);
// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the
// std::default_delete but for aligned objects. For a similar deleter equivalent
// to free() for aligned memory see AlignedFreer().
class AlignedDeleter {
public:
AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
template <typename T>
void operator()(T* aligned_pointer) const {
return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
TypedArrayDeleter<T>);
}
private:
template <typename T>
static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
size_t elems = size_in_bytes / sizeof(T);
for (size_t i = 0; i < elems; i++) {
// Explicitly call the destructor on each element.
(static_cast<T*>(ptr) + i)->~T();
}
}
// Function prototype that calls the destructor for each element in a typed
// array. TypeArrayDeleter<T> would match this prototype.
using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
FreePtr free_ptr,
void* opaque_ptr,
ArrayDeleter deleter);
FreePtr free_;
void* opaque_ptr_;
};
// Unique pointer to T with custom aligned deleter. This can be a single
// element U or an array of element if T is a U[]. The custom aligned deleter
// will call the destructor on U or each element of a U[] in the array case.
template <typename T>
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
// Aligned memory equivalent of make_unique<T> using the custom allocators
// alloc/free with the passed `opaque` pointer. This function calls the
// constructor with the passed Args... and calls the destructor of the object
// when the AlignedUniquePtr is destroyed.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
void* opaque, Args&&... args) {
T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
AlignedDeleter(free, opaque));
}
// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
// functions.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
T* ptr = static_cast<T*>(AllocateAlignedBytes(
sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
AlignedDeleter());
}
// Helpers for array allocators (avoids overflow)
namespace detail {
// Returns x such that 1u << x == n (if n is a power of two).
static inline constexpr size_t ShiftCount(size_t n) {
return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
}
template <typename T>
T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
constexpr size_t size = sizeof(T);
constexpr bool is_pow2 = (size & (size - 1)) == 0;
constexpr size_t bits = ShiftCount(size);
static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
const size_t bytes = is_pow2 ? items << bits : items * size;
const size_t check = is_pow2 ? bytes >> bits : bytes / size;
if (check != items) {
return nullptr; // overflowed
}
return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
}
} // namespace detail
// Aligned memory equivalent of make_unique<T[]> for array types using the
// custom allocators alloc/free. This function calls the constructor with the
// passed Args... on every created item. The destructor of each element will be
// called when the AlignedUniquePtr is destroyed.
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
if (ptr != nullptr) {
for (size_t i = 0; i < items; i++) {
new (ptr + i) T(std::forward<Args>(args)...);
}
}
return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
}
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
}
// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
// but for aligned memory.
class AlignedFreer {
public:
// Pass address of this to ctor to skip deleting externally-owned memory.
static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
template <typename T>
void operator()(T* aligned_pointer) const {
// TODO(deymo): assert that we are using a POD type T.
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
}
private:
FreePtr free_;
void* opaque_ptr_;
};
// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
// data use AlignedUniquePtr.
template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
// Allocate an aligned and uninitialized array of POD values as a unique_ptr.
// Upon destruction of the unique_ptr the aligned array will be freed.
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
FreePtr free, void* opaque) {
return AlignedFreeUniquePtr<T[]>(
detail::AllocateAlignedItems<T>(items, alloc, opaque),
AlignedFreer(free, opaque));
}
// Same as previous AllocateAligned(), using default allocate/free functions.
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
}
} // namespace hwy
#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_

File diff suppressed because it is too large Load Diff

View File

@ -1,108 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
#define HIGHWAY_HWY_CACHE_CONTROL_H_
#include "hwy/base.h"
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
// https://github.com/gperftools/gperftools/issues/946).
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
#undef HWY_DISABLE_CACHE_CONTROL
#define HWY_DISABLE_CACHE_CONTROL
#endif
// intrin.h is sufficient on MSVC and already included by base.h.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#include <emmintrin.h> // SSE2
#include <xmmintrin.h> // _mm_prefetch
#endif
namespace hwy {
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
#define HWY_STREAM_MULTIPLE 16
// The following functions may also require an attribute.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#else
#define HWY_ATTR_CACHE
#endif
// Windows.h #defines this, which causes infinite recursion. Temporarily
// undefine to avoid conflict with our function.
// TODO(janwas): remove when this function is removed.
#pragma push_macro("LoadFence")
#undef LoadFence
// Delays subsequent loads until prior loads are visible. Beware of potentially
// differing behavior across architectures and vendors: on Intel but not
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
// complete).
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_lfence();
#endif
}
// TODO(janwas): remove when this function is removed. (See above.)
#pragma pop_macro("LoadFence")
// Ensures values written by previous `Stream` calls are visible on the current
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
// outputs are to be consumed by other core(s), the producer must publish
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_sfence();
#endif
}
// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
#elif HWY_COMPILER_GCC // includes clang
// Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
// desirable, so use the default 3 (keep in caches).
__builtin_prefetch(p, /*write=*/0, /*hint=*/3);
#else
(void)p;
#endif
}
// Invalidates and flushes the cache line containing "p", if possible.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_clflush(p);
#else
(void)p;
#endif
}
// When called inside a spin-loop, may reduce power consumption.
HWY_INLINE HWY_ATTR_CACHE void Pause() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_pause();
#endif
}
} // namespace hwy
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_

View File

@ -1,281 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
// Detects compiler and arch from predefined macros. Zero dependencies for
// inclusion by foreach_target.h.
// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined __CLANGD__) || \
(defined GROK_ELLIPSIS_BUILD)
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif
//------------------------------------------------------------------------------
// Compiler
// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif
#if defined(_MSC_VER) && defined(__clang__)
#define HWY_COMPILER_CLANGCL _MSC_VER
#else
#define HWY_COMPILER_CLANGCL 0
#endif
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif
#ifdef __INTEL_LLVM_COMPILER
#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
#else
#define HWY_COMPILER_ICX 0
#endif
// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
// compiler extensions (eg. Clang, Intel...)
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
// Clang or clang-cl, not GCC.
#ifdef __clang__
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Originally based on
// https://github.com/simd-everywhere/simde/blob/47d6e603de9d04ee05cdfbc57cf282a02be1bf2a/simde/simde-detect-clang.h#L59.
// Please send updates below to them as well, thanks!
#if defined(__apple_build_version__) || __clang_major__ >= 999
#if __has_attribute(nouwtable) // no new warnings in 16.0
#define HWY_COMPILER_CLANG 1600
#elif __has_warning("-Warray-parameter")
#define HWY_COMPILER_CLANG 1500
#elif __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
#define HWY_COMPILER_CLANG 1300
#elif __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
// based on Clang 7, but does not support the warning we test.
// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
// https://trac.macports.org/wiki/XcodeVersionInfo.
#elif __has_warning("-Wc++98-compat-extra-semi") || \
(defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif // __has_warning chain
#define HWY_COMPILER3_CLANG (HWY_COMPILER_CLANG * 100)
#else // use normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#define HWY_COMPILER3_CLANG \
(__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
#endif
#else // Not clang
#define HWY_COMPILER_CLANG 0
#define HWY_COMPILER3_CLANG 0
#endif
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && !HWY_COMPILER_ICC
#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
#else
#define HWY_COMPILER_GCC_ACTUAL 0
#endif
// More than one may be nonzero, but we want at least one.
#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
#error "Unsupported compiler"
#endif
// We should only detect one of these (only clang/clangcl overlap)
#if 1 < \
(!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
!!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
#error "Detected multiple compilers"
#endif
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif
#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif
#ifdef __has_cpp_attribute
#define HWY_HAS_CPP_ATTRIBUTE(name) __has_cpp_attribute(name)
#else
#define HWY_HAS_CPP_ATTRIBUTE(name) 0
#endif
#ifdef __has_feature
#define HWY_HAS_FEATURE(name) __has_feature(name)
#else
#define HWY_HAS_FEATURE(name) 0
#endif
//------------------------------------------------------------------------------
// Architecture
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif
#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif
#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif
#if defined(__powerpc64__) || defined(_M_PPC) || defined(__powerpc__)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif
// aarch32 is currently not supported; please raise an issue if you want it.
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif
#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif
#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif
// Any *supported* version of Arm, i.e. 7 or later
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
// Older than Armv7 (e.g. armel aka Armv5) => we do not support SIMD.
#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
#define HWY_ARCH_ARM_OLD 1
#else
#define HWY_ARCH_ARM_OLD 0
#endif
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif
#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
#if defined(_WIN32) || defined(_WIN64)
#define HWY_OS_WIN 1
#else
#define HWY_OS_WIN 0
#endif
#if defined(linux) || defined(__linux__)
#define HWY_OS_LINUX 1
#else
#define HWY_OS_LINUX 0
#endif
//------------------------------------------------------------------------------
// Endianness
#if HWY_COMPILER_MSVC
#if HWY_ARCH_PPC && defined(_XBOX_VER) && _XBOX_VER >= 200
// XBox 360 is big-endian
#define HWY_IS_LITTLE_ENDIAN 0
#define HWY_IS_BIG_ENDIAN 1
#else
// All other targets supported by MSVC are little-endian
#define HWY_IS_LITTLE_ENDIAN 1
#define HWY_IS_BIG_ENDIAN 0
#endif // HWY_ARCH_PPC && defined(_XBOX_VER) && _XBOX_VER >= 200
#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define HWY_IS_LITTLE_ENDIAN 1
#define HWY_IS_BIG_ENDIAN 0
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define HWY_IS_LITTLE_ENDIAN 0
#define HWY_IS_BIG_ENDIAN 1
#else
#error "Unable to detect endianness or unsupported byte order"
#endif
#if (HWY_IS_LITTLE_ENDIAN + HWY_IS_BIG_ENDIAN) != 1
#error "Must only detect one byte order"
#endif
#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_

View File

@ -1,644 +0,0 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
#define HIGHWAY_HWY_DETECT_TARGETS_H_
// Defines targets and chooses which to enable.
#include "hwy/detect_compiler_arch.h"
//------------------------------------------------------------------------------
// Optional configuration
// See g3doc/quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3
// Uncomment to definitely avoid generating those target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA
// Uncomment to enable these on MSVC even if the predefined macros are not set.
// #define HWY_WANT_SSE2 1
// #define HWY_WANT_SSSE3 1
// #define HWY_WANT_SSE4 1
//------------------------------------------------------------------------------
// Targets
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
// --------------------------- x86: 15 targets (+ one fallback)
// Bits 0..3 reserved (4 targets)
#define HWY_AVX3_SPR (1LL << 4)
// Bit 5 reserved (likely AVX10.2 with 256-bit vectors)
// Currently HWY_AVX3_DL plus a special case for CompressStore (10x as fast).
// We may later also use VPCONFLICT.
#define HWY_AVX3_ZEN4 (1LL << 6) // see HWY_WANT_AVX3_ZEN4 below
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
// VAES, BITALG, GFNI). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is
// only in Tiger Lake?
#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 (1LL << 8) // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL
#define HWY_AVX2 (1LL << 9) // HWY_SSE4 plus BMI2 + F16 + FMA
// Bit 10: reserved
#define HWY_SSE4 (1LL << 11) // SSE4.2 plus AES + CLMUL
#define HWY_SSSE3 (1LL << 12) // S-SSE3
// Bit 13: reserved for SSE3
#define HWY_SSE2 (1LL << 14)
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 14
// --------------------------- Arm: 15 targets (+ one fallback)
// Bits 15..23 reserved (9 targets)
#define HWY_SVE2_128 (1LL << 24) // specialized target (e.g. Arm N2)
#define HWY_SVE_256 (1LL << 25) // specialized target (e.g. Arm V1)
#define HWY_SVE2 (1LL << 26)
#define HWY_SVE (1LL << 27)
#define HWY_NEON (1LL << 28) // Implies support for AES
#define HWY_NEON_WITHOUT_AES (1LL << 29)
#define HWY_HIGHEST_TARGET_BIT_ARM 29
// --------------------------- RISC-V: 9 targets (+ one fallback)
// Bits 30..36 reserved (7 targets)
#define HWY_RVV (1LL << 37)
// Bit 38 reserved
#define HWY_HIGHEST_TARGET_BIT_RVV 38
// --------------------------- Future expansion: 4 targets
// Bits 39..42 reserved
// --------------------------- IBM Power: 9 targets (+ one fallback)
// Bits 43..46 reserved (4 targets)
#define HWY_PPC10 (1LL << 47) // v3.1
#define HWY_PPC9 (1LL << 48) // v3.0
#define HWY_PPC8 (1LL << 49) // v2.07
// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
#define HWY_HIGHEST_TARGET_BIT_PPC 51
// --------------------------- WebAssembly: 9 targets (+ one fallback)
// Bits 52..57 reserved (6 targets)
#define HWY_WASM_EMU256 (1LL << 58) // Experimental
#define HWY_WASM (1LL << 59)
// Bits 60 reserved
#define HWY_HIGHEST_TARGET_BIT_WASM 60
// --------------------------- Emulation: 2 targets
#define HWY_EMU128 (1LL << 61)
// We do not add/left-shift, so this will not overflow to a negative number.
#define HWY_SCALAR (1LL << 62)
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
// Do not use bit 63 - would be confusing to have negative numbers.
//------------------------------------------------------------------------------
// Set default blocklists
// Disabled means excluded from enabled at user's request. A separate config
// macro allows disabling without deactivating the blocklist below.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif
// Broken means excluded from enabled due to known compiler issues. We define
// separate HWY_BROKEN_* and then OR them together (more than one might apply).
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1))
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif
#else
#define HWY_BROKEN_CLANG6 0
#endif
// 32-bit may fail to compile AVX2/3.
#if HWY_ARCH_X86_32
#define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1))
#else
#define HWY_BROKEN_32BIT 0
#endif
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
#if HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1))
#else
#define HWY_BROKEN_MSVC 0
#endif
// AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC
// 2021.
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \
(HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
#define HWY_BROKEN_AVX3_DL_ZEN4 (HWY_AVX3_DL | HWY_AVX3_ZEN4)
#else
#define HWY_BROKEN_AVX3_DL_ZEN4 0
#endif
// AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021.
#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) || \
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \
(HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
#define HWY_BROKEN_AVX3_SPR (HWY_AVX3_SPR)
#else
#define HWY_BROKEN_AVX3_SPR 0
#endif
// armv7be has not been tested and is not yet supported.
#if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
#define HWY_BROKEN_ARM7_BIG_ENDIAN (HWY_NEON | HWY_NEON_WITHOUT_AES)
#else
#define HWY_BROKEN_ARM7_BIG_ENDIAN 0
#endif
// armv7-a without a detected vfpv4 is not supported
// (for example Cortex-A8, Cortex-A9)
// vfpv4 always have neon half-float _and_ FMA.
#if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \
!defined(__ARM_VFPV4__) && \
!((__ARM_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1))
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 (HWY_NEON | HWY_NEON_WITHOUT_AES)
#else
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 0
#endif
// SVE[2] require recent clang or gcc versions.
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
#define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
#else
#define HWY_BROKEN_SVE 0
#endif
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100)
// GCC 10 supports the -mcpu=power10 option but does not support the PPC10
// vector intrinsics
#define HWY_BROKEN_PPC10 (HWY_PPC10)
#elif HWY_ARCH_PPC && HWY_IS_BIG_ENDIAN && \
((HWY_COMPILER3_CLANG && HWY_COMPILER3_CLANG < 160001) || \
(HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_COMPILER_GCC_ACTUAL <= 1203) || \
(HWY_COMPILER_GCC_ACTUAL >= 1300 && HWY_COMPILER_GCC_ACTUAL <= 1301))
// GCC 12.0 through 12.3 and GCC 13.0 through 13.1 have a compiler bug where the
// vsldoi instruction is sometimes incorrectly optimized out (and this causes
// some of the Highway unit tests to fail on big-endian PPC10). Details about
// this compiler bug can be found at
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069, and this bug will be
// fixed in the upcoming GCC 12.4 and 13.2 releases.
// Clang 16.0.0 and earlier (but not Clang 16.0.1 and later) have a compiler
// bug in the LLVM DAGCombiner that causes a zero-extend followed by an
// element insert into a vector, followed by a vector shuffle to be incorrectly
// optimized on big-endian PPC (and which caused some of the Highway unit tests
// to fail on big-endian PPC10).
// Details about this bug, which has already been fixed in Clang 16.0.1 and
// later, can be found at https://github.com/llvm/llvm-project/issues/61315.
#define HWY_BROKEN_PPC10 (HWY_PPC10)
#else
#define HWY_BROKEN_PPC10 0
#endif
// Allow the user to override this without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS
#define HWY_BROKEN_TARGETS \
(HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \
HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \
HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \
HWY_BROKEN_SVE | HWY_BROKEN_PPC10)
#endif // HWY_BROKEN_TARGETS
// Enabled means not disabled nor blocklisted.
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
// always be enabled. If 1, we instead choose HWY_SCALAR even without
// HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128) // allow overriding
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203) || \
defined(HWY_NO_LIBCXX)
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
#endif
#endif // HWY_BROKEN_EMU128
//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros
// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. This does
// not take the blocklist into account.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
#define HWY_BASELINE_SCALAR HWY_SCALAR
#else
#define HWY_BASELINE_SCALAR HWY_EMU128
#endif
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_BASELINE_SCALAR.
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM_EMU256
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif
// GCC or Clang.
#if HWY_ARCH_PPC && HWY_COMPILER_GCC && defined(__ALTIVEC__) && \
defined(__VSX__) && defined(__POWER8_VECTOR__) && \
(defined(__CRYPTO__) || defined(HWY_DISABLE_PPC8_CRYPTO))
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif
#if HWY_BASELINE_PPC8 != 0 && defined(__POWER9_VECTOR__)
#define HWY_BASELINE_PPC9 HWY_PPC9
#else
#define HWY_BASELINE_PPC9 0
#endif
#if HWY_BASELINE_PPC9 != 0 && \
(defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_BASELINE_PPC10 HWY_PPC10
#else
#define HWY_BASELINE_PPC10 0
#endif
#define HWY_BASELINE_SVE2 0
#define HWY_BASELINE_SVE 0
#define HWY_BASELINE_NEON 0
#if HWY_ARCH_ARM
#if defined(__ARM_FEATURE_SVE2)
#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
// If user specified -msve-vector-bits=128, they assert the vector length is
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
#define HWY_BASELINE_SVE2 HWY_SVE2_128
// Otherwise we're not sure what the vector length will be. The baseline must be
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
#define HWY_BASELINE_SVE2 HWY_SVE2
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE2
#if defined(__ARM_FEATURE_SVE)
#undef HWY_BASELINE_SVE // was 0, will be re-defined
// See above. If user-specified vector length matches our optimization, use it.
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define HWY_BASELINE_SVE HWY_SVE_256
#else
#define HWY_BASELINE_SVE HWY_SVE
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#undef HWY_BASELINE_NEON
#if defined(__ARM_FEATURE_AES)
#define HWY_BASELINE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES)
#else
#define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES)
#endif
#endif
#endif // HWY_ARCH_ARM
// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_32
#if _M_IX86_FP >= 2
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif
#elif HWY_ARCH_X86_64
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif
// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
// https://stackoverflow.com/questions/18563978/.
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSSE3 0
#define HWY_CHECK_SSE4 0
#endif
// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1
#else // non-MSVC
#if defined(__SSE2__)
#define HWY_CHECK_SSE2 1
#else
#define HWY_CHECK_SSE2 0
#endif
#if defined(__SSSE3__)
#define HWY_CHECK_SSSE3 1
#else
#define HWY_CHECK_SSSE3 0
#endif
#if defined(__SSE4_1__) && defined(__SSE4_2__)
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSE4 0
#endif
// If these are disabled, they should not gate the availability of SSE4/AVX2.
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
#define HWY_CHECK_PCLMUL_AES 1
#else
#define HWY_CHECK_PCLMUL_AES 0
#endif
#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
#define HWY_CHECK_BMI2_FMA 1
#else
#define HWY_CHECK_BMI2_FMA 0
#endif
#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
#define HWY_CHECK_F16C 1
#else
#define HWY_CHECK_F16C 0
#endif
#endif // non-MSVC
#if HWY_ARCH_X86 && (HWY_WANT_SSE2 || HWY_CHECK_SSE2)
#define HWY_BASELINE_SSE2 HWY_SSE2
#else
#define HWY_BASELINE_SSE2 0
#endif
#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif
#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif
#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif
// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
defined(__AVX512DQ__) && defined(__AVX512VL__)
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif
// TODO(janwas): not yet known whether these will be set by MSVC
#if HWY_BASELINE_AVX3 != 0 && defined(__AVX512VNNI__) && defined(__VAES__) && \
defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
defined(__AVX512BITALG__)
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
#endif
// The ZEN4-optimized AVX3 target is numerically lower than AVX3_DL and is thus
// considered better. Do not enable it unless the user explicitly requests it -
// we do not want to choose the ZEN4 path on Intel because it could be slower.
#if defined(HWY_WANT_AVX3_ZEN4) && HWY_BASELINE_AVX3_DL != 0
#define HWY_BASELINE_AVX3_ZEN4 HWY_AVX3_ZEN4
#else
#define HWY_BASELINE_AVX3_ZEN4 0
#endif
#if HWY_BASELINE_AVX3_DL != 0 && defined(__AVX512FP16__)
#define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR
#else
#define HWY_BASELINE_AVX3_SPR 0
#endif
// RVV requires intrinsics 0.11 or later, see #1156.
#if HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 11000
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif
// Allow the user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
#define HWY_BASELINE_TARGETS \
(HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_SVE2 | \
HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | \
HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | \
HWY_BASELINE_AVX3_SPR | HWY_BASELINE_RVV)
#endif // HWY_BASELINE_TARGETS
//------------------------------------------------------------------------------
// Choose target for static dispatch
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
#if HWY_ENABLED_BASELINE == 0
#error "At least one baseline target must be defined and enabled"
#endif
// Best baseline, used for static dispatch. This is the least-significant 1-bit
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
// Start by assuming static dispatch. If we later use dynamic dispatch, this
// will be defined to other targets during the multiple-inclusion, and finally
// return to the initial value. Defining this outside begin/end_target ensures
// inl headers successfully compile by themselves (required by Bazel).
#define HWY_TARGET HWY_STATIC_TARGET
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
defined(HWY_COMPILE_ONLY_STATIC))
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
// Clang, GCC and MSVC allow runtime dispatch on x86.
#if HWY_ARCH_X86
#define HWY_HAVE_RUNTIME_DISPATCH 1
// On Arm/PPC, currently only GCC does, and we require Linux to detect CPU
// capabilities.
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && HWY_COMPILER_GCC_ACTUAL && \
HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H)
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
#endif
// AVX3_DL is not widely available yet. To reduce code size and compile time,
// only include it in the set of attainable targets (for dynamic dispatch) if
// the user opts in, OR it is in the baseline (we check whether enabled below).
#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
#define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL)
#else
#define HWY_ATTAINABLE_AVX3_DL 0
#endif
#if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES)
#elif HWY_ARCH_ARM // static dispatch, or HWY_ARCH_ARM_V7
#define HWY_ATTAINABLE_NEON (HWY_BASELINE_NEON)
#else
#define HWY_ATTAINABLE_NEON 0
#endif
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
#define HWY_ATTAINABLE_SVE (HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
#define HWY_ATTAINABLE_SVE2 (HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0
#endif
#if HWY_ARCH_PPC && defined(__ALTIVEC__) && \
(!HWY_COMPILER_CLANG || HWY_BASELINE_PPC8 != 0)
#define HWY_ATTAINABLE_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)
#else
#define HWY_ATTAINABLE_PPC 0
#endif
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \
HWY_AVX2 | HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL | HWY_AVX3_ZEN4 | \
HWY_AVX3_SPR)
#elif HWY_ARCH_ARM
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON | HWY_ATTAINABLE_SVE | \
HWY_ATTAINABLE_SVE2)
#elif HWY_ARCH_PPC
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_PPC)
#else
#define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
#endif // HWY_ARCH_*
// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_EMU128 // override baseline
#define HWY_TARGETS HWY_EMU128
// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
// we currently still support it for backwards compatibility.
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
(defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET
// 3) For tests: include all attainable targets (in particular: scalar)
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
// sets all lower bits (better targets), then we also include the static target.
#else
#define HWY_TARGETS \
(HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
#endif // target policy
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif
#endif // HIGHWAY_HWY_DETECT_TARGETS_H_

View File

@ -1,340 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
#define HIGHWAY_HWY_FOREACH_TARGET_H_
// Re-includes the translation unit zero or more times to compile for any
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
// highway.h defines the corresponding macro/namespace.
#include "hwy/detect_targets.h"
// *_inl.h may include other headers, which requires include guards to prevent
// repeated inclusion. The guards must be reset after compiling each target, so
// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
// defining it if undefined and vice versa. This macro is initially undefined
// so that IDEs don't gray out the contents of each header.
#ifdef HWY_TARGET_TOGGLE
#error "This macro must not be defined outside foreach_target.h"
#endif
#ifdef HWY_HIGHWAY_INCLUDED // highway.h include guard
// Trigger fixup at the bottom of this header.
#define HWY_ALREADY_INCLUDED
// The next highway.h must re-include set_macros-inl.h because the first
// highway.h chose the static target instead of what we will set below.
#undef HWY_SET_MACROS_PER_TARGET
#endif
// Disable HWY_EXPORT in user code until we have generated all targets. Note
// that a subsequent highway.h will not override this definition.
#undef HWY_ONCE
#define HWY_ONCE (0 || HWY_IDE)
// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
// also skip if only 1 target defined (no re-inclusion will be necessary).
#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
#if !defined(HWY_TARGET_INCLUDE)
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
#endif
// ------------------------------ HWY_ARCH_X86
#if (HWY_TARGETS & HWY_SSE2) && (HWY_STATIC_TARGET != HWY_SSE2)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSE2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSE4
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3_DL
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_AVX3_ZEN4) && (HWY_STATIC_TARGET != HWY_AVX3_ZEN4)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3_ZEN4
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_AVX3_SPR) && (HWY_STATIC_TARGET != HWY_AVX3_SPR)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3_SPR
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ------------------------------ HWY_ARCH_ARM
#if (HWY_TARGETS & HWY_NEON_WITHOUT_AES) && \
(HWY_STATIC_TARGET != HWY_NEON_WITHOUT_AES)
#undef HWY_TARGET
#define HWY_TARGET HWY_NEON_WITHOUT_AES
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
#undef HWY_TARGET
#define HWY_TARGET HWY_NEON
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE_256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2_128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ------------------------------ HWY_ARCH_WASM
#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM_EMU256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ------------------------------ HWY_ARCH_PPC
#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
#undef HWY_TARGET
#define HWY_TARGET HWY_PPC8
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_PPC9) && (HWY_STATIC_TARGET != HWY_PPC9)
#undef HWY_TARGET
#define HWY_TARGET HWY_PPC9
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_PPC10) && (HWY_STATIC_TARGET != HWY_PPC10)
#undef HWY_TARGET
#define HWY_TARGET HWY_PPC10
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ------------------------------ HWY_ARCH_RVV
#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
#undef HWY_TARGET
#define HWY_TARGET HWY_RVV
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
// ------------------------------ Scalar
#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
#undef HWY_TARGET
#define HWY_TARGET HWY_EMU128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
#undef HWY_TARGET
#define HWY_TARGET HWY_SCALAR
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#endif // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
// Now that all but the static target have been generated, re-enable HWY_EXPORT.
#undef HWY_ONCE
#define HWY_ONCE 1
// If we re-include once per enabled target, the translation unit's
// implementation would have to be skipped via #if to avoid redefining symbols.
// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
// implementation when resuming compilation of the translation unit.
#undef HWY_TARGET
#define HWY_TARGET HWY_STATIC_TARGET
#ifdef HWY_ALREADY_INCLUDED
// Revert the previous toggle to prevent redefinitions for the static target.
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
#ifdef HWY_SET_MACROS_PER_TARGET
#undef HWY_SET_MACROS_PER_TARGET
#else
#define HWY_SET_MACROS_PER_TARGET
#endif
#endif
#endif // HIGHWAY_HWY_FOREACH_TARGET_H_

View File

@ -1,435 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Main header required before using vector types.
// IWYU pragma: begin_exports
#include "hwy/base.h"
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#include "hwy/targets.h"
// IWYU pragma: end_exports
// This include guard is checked by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
// after/outside this include guard.
#ifndef HWY_HIGHWAY_INCLUDED
#define HWY_HIGHWAY_INCLUDED
namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 6
//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
// HWY_CAPPED(T, N).
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
// Trailing comma avoids -pedantic false alarm
#define HWY_CHOOSE_FULL(...) \
HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch
// Evaluates to 0 inside a translation unit if it is generating anything but the
// static target (the last one if multiple targets are enabled). Used to prevent
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
// compile once anyway, so this is 1 unless it is or has been included.
#ifndef HWY_ONCE
#define HWY_ONCE 1
#endif
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC9
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC10
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_SPR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME
#endif
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr is that target was not compiled.
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif
#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_NEON_WITHOUT_AES
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
#else
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
#else
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE2
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_PPC9
#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
#else
#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_PPC10
#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
#else
#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SSE2
#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
#else
#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3_ZEN4
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3_SPR
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
#endif
// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
#define HWY_DISPATCH_WORKAROUND 1
#else
#define HWY_DISPATCH_WORKAROUND 0
#endif
// Provides a static member function which is what is called during the first
// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
// this function are the first entry in the tables created by HWY_EXPORT.
template <typename RetType, typename... Args>
struct FunctionCache {
public:
typedef RetType(FunctionType)(Args...);
#if HWY_DISPATCH_WORKAROUND
template <FunctionType* const func>
static RetType ChooseAndCall(Args... args) {
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update(SupportedTargets());
return (*func)(args...);
}
#else
// A template function that when instantiated has the same signature as the
// function being called. This function initializes the bit array of targets
// supported by the current CPU and then calls the appropriate entry within
// the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
// exported functions, even those defined by different translation units,
// will dispatch directly to the best available target.
template <FunctionType* const table[]>
static RetType ChooseAndCall(Args... args) {
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update(SupportedTargets());
return (table[chosen_target.GetIndex()])(args...);
}
#endif // HWY_DISPATCH_WORKAROUND
};
// Used to deduce the template parameters RetType and Args from a function.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
return FunctionCache<RetType, Args...>();
}
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
// #include "hwy/highway.h"
// HWY_BEFORE_NAMESPACE();
// namespace skeleton {
// namespace HWY_NAMESPACE {
//
// void MyFunction(int a, char b, const char* c) { ... }
//
// // NOLINTNEXTLINE(google-readability-namespace-comments)
// } // namespace HWY_NAMESPACE
// } // namespace skeleton
// HWY_AFTER_NAMESPACE();
//
// namespace skeleton {
// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
//
// void MyFunction(int a, char b, const char* c) {
// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
// }
// } // namespace skeleton
//
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// Simplified version for IDE or the dynamic dispatch case with only one target.
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME) \
HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#else
// Simplified version for MSVC 2017: function pointer instead of table.
#if HWY_DISPATCH_WORKAROUND
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the function from HWY_STATIC_TARGET. */ \
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
}
#else
// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the appropriate function. */ \
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
}
#endif // HWY_DISPATCH_WORKAROUND
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
#define HWY_DYNAMIC_POINTER(FUNC_NAME) \
(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])
#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// DEPRECATED names; please use HWY_HAVE_* instead.
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
} // namespace hwy
#endif // HWY_HIGHWAY_INCLUDED
//------------------------------------------------------------------------------
// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
// to include them once per target, which is ensured by the toggle check.
// Because ops/*.h are included under it, they do not need their own guard.
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_HIGHWAY_PER_TARGET
#undef HWY_HIGHWAY_PER_TARGET
#else
#define HWY_HIGHWAY_PER_TARGET
#endif
// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
#elif HWY_TARGET == HWY_AVX2
#include "hwy/ops/x86_256-inl.h"
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
#include "hwy/ops/x86_512-inl.h"
#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
HWY_TARGET == HWY_PPC10
#include "hwy/ops/ppc_vsx-inl.h"
#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#include "hwy/ops/arm_sve-inl.h"
#elif HWY_TARGET == HWY_WASM_EMU256
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
#include "hwy/ops/rvv-inl.h"
#elif HWY_TARGET == HWY_EMU128
#include "hwy/ops/emu128-inl.h"
#elif HWY_TARGET == HWY_SCALAR
#include "hwy/ops/scalar-inl.h"
#else
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
#include "hwy/ops/generic_ops-inl.h"
#endif // HWY_HIGHWAY_PER_TARGET

View File

@ -1,74 +0,0 @@
// Pseudo-generated file to handle both cmake & bazel build system.
// Initial generation done using cmake code:
// include(GenerateExportHeader)
// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
// hwy/highway_export.h)
// code reformatted using clang-format --style=Google
#ifndef HWY_DLLEXPORT_H
#define HWY_DLLEXPORT_H
#if !defined(HWY_SHARED_DEFINE)
#define HWY_DLLEXPORT
#define HWY_CONTRIB_DLLEXPORT
#define HWY_TEST_DLLEXPORT
#else // !HWY_SHARED_DEFINE
#ifndef HWY_DLLEXPORT
#if defined(hwy_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllexport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#else // defined(hwy_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllimport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif // defined(hwy_EXPORTS)
#endif // HWY_DLLEXPORT
#ifndef HWY_CONTRIB_DLLEXPORT
#if defined(hwy_contrib_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#else // defined(hwy_contrib_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif // defined(hwy_contrib_EXPORTS)
#endif // HWY_CONTRIB_DLLEXPORT
#ifndef HWY_TEST_DLLEXPORT
#if defined(hwy_test_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllexport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#else // defined(hwy_test_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif // defined(hwy_test_EXPORTS)
#endif // HWY_TEST_DLLEXPORT
#endif // !HWY_SHARED_DEFINE
#endif /* HWY_DLLEXPORT_H */

View File

@ -1,171 +0,0 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
#define HIGHWAY_HWY_NANOBENCHMARK_H_
// Benchmarks functions of a single integer argument with realistic branch
// prediction hit rates. Uses a robust estimator to summarize the measurements.
// The precision is about 0.2%.
//
// Examples: see nanobenchmark_test.cc.
//
// Background: Microbenchmarks such as http://github.com/google/benchmark
// can measure elapsed times on the order of a microsecond. Shorter functions
// are typically measured by repeating them thousands of times and dividing
// the total elapsed time by this count. Unfortunately, repetition (especially
// with the same input parameter!) influences the runtime. In time-critical
// code, it is reasonable to expect warm instruction/data caches and TLBs,
// but a perfect record of which branches will be taken is unrealistic.
// Unless the application also repeatedly invokes the measured function with
// the same parameter, the benchmark is measuring something very different -
// a best-case result, almost as if the parameter were made a compile-time
// constant. This may lead to erroneous conclusions about branch-heavy
// algorithms outperforming branch-free alternatives.
//
// Our approach differs in three ways. Adding fences to the timer functions
// reduces variability due to instruction reordering, improving the timer
// resolution to about 40 CPU cycles. However, shorter functions must still
// be invoked repeatedly. For more realistic branch prediction performance,
// we vary the input parameter according to a user-specified distribution.
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
// central tendency of the measurement samples with the "half sample mode",
// which is more robust to outliers and skewed data than the mean or median.
#include <stddef.h>
#include <stdint.h>
#include "hwy/highway_export.h"
#include "hwy/timer.h"
// Enables sanity checks that verify correct operation at the cost of
// longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_ENABLE_CHECKS 0
#endif
#define NANOBENCHMARK_CHECK_ALWAYS(condition) \
while (!(condition)) { \
fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
abort(); \
}
#if NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
#else
#define NANOBENCHMARK_CHECK(condition)
#endif
namespace hwy {
// Returns 1, but without the compiler knowing what the value is. This prevents
// optimizing out code.
HWY_DLLEXPORT int Unpredictable1();
// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;
// "Proof of work" returned by Func to ensure the compiler does not elide it.
using FuncOutput = uint64_t;
// Function to measure: either 1) a captureless lambda or function with two
// arguments or 2) a lambda with capture, in which case the first argument
// is reserved for use by MeasureClosure.
using Func = FuncOutput (*)(const void*, FuncInput);
// Internal parameters that determine precision/resolution/measuring time.
struct Params {
// Best-case precision, expressed as a divisor of the timer resolution.
// Larger => more calls to Func and higher precision.
size_t precision_divisor = 1024;
// Ratio between full and subset input distribution sizes. Cannot be less
// than 2; larger values increase measurement time but more faithfully
// model the given input distribution.
size_t subset_ratio = 2;
// Together with the estimated Func duration, determines how many times to
// call Func before checking the sample variability. Larger values increase
// measurement time, memory/cache use and precision.
double seconds_per_eval = 4E-3;
// The minimum number of samples before estimating the central tendency.
size_t min_samples_per_eval = 7;
// The mode is better than median for estimating the central tendency of
// skewed/fat-tailed distributions, but it requires sufficient samples
// relative to the width of half-ranges.
size_t min_mode_samples = 64;
// Maximum permissible variability (= median absolute deviation / center).
double target_rel_mad = 0.002;
// Abort after this many evals without reaching target_rel_mad. This
// prevents infinite loops.
size_t max_evals = 9;
// Whether to print additional statistics to stdout.
bool verbose = true;
};
// Measurement result for each unique input.
struct Result {
FuncInput input;
// Robust estimate (mode or median) of duration.
float ticks;
// Measure of variability (median absolute deviation relative to "ticks").
float variability;
};
// Precisely measures the number of ticks elapsed when calling "func" with the
// given inputs, shuffled to ensure realistic branch prediction hit rates.
//
// "func" returns a 'proof of work' to ensure its computations are not elided.
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
// "func". The values should be chosen to maximize coverage of "func". This
// represents a distribution, so a value's frequency should reflect its
// probability in the real application. Order does not matter; for example, a
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// Returns how many Result were written to "results": one per unique input, or
// zero if the measurement failed (an error message goes to stderr).
HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg,
const FuncInput* inputs, size_t num_inputs,
Result* results, const Params& p = Params());
// Calls operator() of the given closure (lambda function).
template <class Closure>
static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
return (*f)(input);
}
// Same as Measure, except "closure" is typically a lambda function of
// FuncInput -> FuncOutput with a capture list.
template <class Closure>
static inline size_t MeasureClosure(const Closure& closure,
const FuncInput* inputs,
const size_t num_inputs, Result* results,
const Params& p = Params()) {
return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
results, p);
}
} // namespace hwy
#endif // HIGHWAY_HWY_NANOBENCHMARK_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,578 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Sets macros based on HWY_TARGET.
// This include guard is toggled by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it.
#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_SET_MACROS_PER_TARGET
#undef HWY_SET_MACROS_PER_TARGET
#else
#define HWY_SET_MACROS_PER_TARGET
#endif
#endif // HWY_SET_MACROS_PER_TARGET
#include "hwy/detect_compiler_arch.h" // IWYU: export
#include "hwy/detect_targets.h" // IWYU: export
#undef HWY_NAMESPACE
#undef HWY_ALIGN
#undef HWY_MAX_BYTES
#undef HWY_LANES
#undef HWY_HAVE_SCALABLE
#undef HWY_HAVE_TUPLE
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_FLOAT64
#undef HWY_MEM_OPS_MIGHT_FAULT
#undef HWY_NATIVE_FMA
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512
// Supported on all targets except RVV (requires GCC 14 or upcoming Clang)
#if HWY_TARGET == HWY_RVV && \
((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
(HWY_COMPILER_CLANG))
#define HWY_HAVE_TUPLE 0
#else
#define HWY_HAVE_TUPLE 1
#endif
// For internal use (clamping/validating N for Simd<>)
#undef HWY_MAX_N
#if HWY_TARGET == HWY_SCALAR
#define HWY_MAX_N 1
#else
#define HWY_MAX_N 65536
#endif
// For internal use (clamping kPow2 for Simd<>)
#undef HWY_MAX_POW2
// For HWY_TARGET == HWY_RVV, LMUL <= 8. Even on other targets, we want to
// support say Rebind<uint64_t, Simd<uint8_t, 1, 0>> d; whose kPow2 is also 3.
// However, those other targets do not actually support multiple vectors, and
// thus Lanes(d) must not exceed Lanes(ScalableTag<T>()).
#define HWY_MAX_POW2 3
// User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >>
// (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions.
#undef HWY_MIN_POW2
#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2 -16
#else
// Tighter bound for other targets, whose vectors are smaller, to potentially
// save compile time.
#define HWY_MIN_POW2 -8
#endif // HWY_TARGET == HWY_RVV
#undef HWY_TARGET_STR
#if defined(HWY_DISABLE_PCLMUL_AES)
#define HWY_TARGET_STR_PCLMUL_AES ""
#else
#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
#endif
#if defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR_BMI2_FMA ""
#else
#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
#endif
#if defined(HWY_DISABLE_F16C)
#define HWY_TARGET_STR_F16C ""
#else
#define HWY_TARGET_STR_F16C ",f16c"
#endif
#define HWY_TARGET_STR_SSE2 "sse2"
#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
#define HWY_TARGET_STR_SSE4 \
HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
// Include previous targets, which are the half-vectors of the next target.
#define HWY_TARGET_STR_AVX2 \
HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
#define HWY_TARGET_STR_AVX3 \
HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw"
#define HWY_TARGET_STR_AVX3_DL \
HWY_TARGET_STR_AVX3 \
",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
"avx512vpopcntdq,gfni"
#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_DL ",avx512fp16"
#if defined(HWY_DISABLE_PPC8_CRYPTO)
#define HWY_TARGET_STR_PPC8_CRYPTO ""
#else
#define HWY_TARGET_STR_PPC8_CRYPTO ",crypto"
#endif
#define HWY_TARGET_STR_PPC8 \
"altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO
#define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector"
#if HWY_COMPILER_CLANG
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector"
#else
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10"
#endif
// Before include guard so we redefine HWY_TARGET_STR on each include,
// governed by the current HWY_TARGET.
//-----------------------------------------------------------------------------
// SSE2
#if HWY_TARGET == HWY_SSE2
#define HWY_NAMESPACE N_SSE2
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_SSE2
//-----------------------------------------------------------------------------
// SSSE3
#elif HWY_TARGET == HWY_SSSE3
#define HWY_NAMESPACE N_SSSE3
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
//-----------------------------------------------------------------------------
// SSE4
#elif HWY_TARGET == HWY_SSE4
#define HWY_NAMESPACE N_SSE4
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_SSE4
//-----------------------------------------------------------------------------
// AVX2
#elif HWY_TARGET == HWY_AVX2
#define HWY_NAMESPACE N_AVX2
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#ifdef HWY_DISABLE_BMI2_FMA
#define HWY_NATIVE_FMA 0
#else
#define HWY_NATIVE_FMA 1
#endif
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_AVX2
//-----------------------------------------------------------------------------
// AVX3[_DL]
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
#define HWY_ALIGN alignas(64)
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#if (HWY_TARGET == HWY_AVX3_SPR) && 0 // TODO(janwas): enable after testing
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1
#if HWY_TARGET == HWY_AVX3
#define HWY_NAMESPACE N_AVX3
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3
#elif HWY_TARGET == HWY_AVX3_DL
#define HWY_NAMESPACE N_AVX3_DL
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
#elif HWY_TARGET == HWY_AVX3_ZEN4
#define HWY_NAMESPACE N_AVX3_ZEN4
// Currently the same as HWY_AVX3_DL: both support Icelake.
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
#elif HWY_TARGET == HWY_AVX3_SPR
#define HWY_NAMESPACE N_AVX3_SPR
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR
#else
#error "Logic error"
#endif // HWY_TARGET
//-----------------------------------------------------------------------------
// PPC8, PPC9, PPC10
#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
HWY_TARGET == HWY_PPC10
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if HWY_TARGET == HWY_PPC8
#define HWY_NAMESPACE N_PPC8
#define HWY_TARGET_STR HWY_TARGET_STR_PPC8
#elif HWY_TARGET == HWY_PPC9
#define HWY_NAMESPACE N_PPC9
#define HWY_TARGET_STR HWY_TARGET_STR_PPC9
#elif HWY_TARGET == HWY_PPC10
#define HWY_NAMESPACE N_PPC10
#define HWY_TARGET_STR HWY_TARGET_STR_PPC10
#else
#error "Logic error"
#endif // HWY_TARGET == HWY_PPC10
//-----------------------------------------------------------------------------
// NEON
#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif
#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
#else
#define HWY_HAVE_FLOAT64 0
#endif
#define HWY_MEM_OPS_MIGHT_FAULT 1
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
#define HWY_NATIVE_FMA 1
#else
#define HWY_NATIVE_FMA 0
#endif
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if HWY_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_NAMESPACE N_NEON_WITHOUT_AES
#else
#define HWY_NAMESPACE N_NEON
#endif
// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_ARCH_ARM_V7
// The __attribute__((target(+neon-vfpv4)) was introduced in gcc >= 8.
#if HWY_COMPILER_GCC_ACTUAL >= 800
#define HWY_TARGET_STR "+neon-vfpv4"
#else // GCC < 7
// Do not define HWY_TARGET_STR (no pragma).
#endif // HWY_COMPILER_GCC_ACTUAL
#else // !HWY_ARCH_ARM_V7
#if HWY_TARGET == HWY_NEON_WITHOUT_AES
// Do not define HWY_TARGET_STR (no pragma).
#else
#define HWY_TARGET_STR "+crypto"
#endif // HWY_TARGET == HWY_NEON_WITHOUT_AES
#endif // HWY_ARCH_ARM_V7
#else // !HWY_HAVE_RUNTIME_DISPATCH
// HWY_TARGET_STR remains undefined
#endif
//-----------------------------------------------------------------------------
// SVE[2]
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)
// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if HWY_TARGET == HWY_SVE2
#define HWY_NAMESPACE N_SVE2
#define HWY_MAX_BYTES 256
#define HWY_HAVE_SCALABLE 1
#elif HWY_TARGET == HWY_SVE_256
#define HWY_NAMESPACE N_SVE_256
#define HWY_MAX_BYTES 32
#define HWY_HAVE_SCALABLE 0
#elif HWY_TARGET == HWY_SVE2_128
#define HWY_NAMESPACE N_SVE2_128
#define HWY_MAX_BYTES 16
#define HWY_HAVE_SCALABLE 0
#else
#define HWY_NAMESPACE N_SVE
#define HWY_MAX_BYTES 256
#define HWY_HAVE_SCALABLE 1
#endif
// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
#define HWY_TARGET_STR "+sve2-aes"
#else
#define HWY_TARGET_STR "+sve"
#endif
#else
// HWY_TARGET_STR remains undefined
#endif
//-----------------------------------------------------------------------------
// WASM
#elif HWY_TARGET == HWY_WASM
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// WASM_EMU256
#elif HWY_TARGET == HWY_WASM_EMU256
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM_EMU256
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// RVV
#elif HWY_TARGET == HWY_RVV
// RVV only requires lane alignment, not natural alignment of the entire vector,
// and the compiler already aligns builtin types, so nothing to do here.
#define HWY_ALIGN
// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536
// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
// LMUL. This is the tightest possible upper bound.
#define HWY_LANES(T) (8192 / sizeof(T))
#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if defined(__riscv_zvfh)
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#endif
#define HWY_NAMESPACE N_RVV
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// (rv64gcv is not a valid target)
//-----------------------------------------------------------------------------
// EMU128
#elif HWY_TARGET == HWY_EMU128
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_EMU128
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
//-----------------------------------------------------------------------------
// SCALAR
#elif HWY_TARGET == HWY_SCALAR
#define HWY_ALIGN
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_SCALAR
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
#else
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
// Override this to 1 in asan/msan builds, which will still fault.
#if HWY_IS_ASAN || HWY_IS_MSAN
#undef HWY_MEM_OPS_MIGHT_FAULT
#define HWY_MEM_OPS_MIGHT_FAULT 1
#endif
// Clang <9 requires this be invoked at file scope, before any namespace.
#undef HWY_BEFORE_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_BEFORE_NAMESPACE() \
HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_BEFORE_NAMESPACE() \
static_assert(true, "For requiring trailing semicolon")
#endif
// Clang <9 requires any namespaces be closed before this macro.
#undef HWY_AFTER_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_AFTER_NAMESPACE() \
HWY_POP_ATTRIBUTES \
static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_AFTER_NAMESPACE() \
static_assert(true, "For requiring trailing semicolon")
#endif
#undef HWY_ATTR
#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
#else
#define HWY_ATTR
#endif

View File

@ -1,520 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target definitions shared by ops/*.h and user code.
// IWYU pragma: begin_exports
// Export does not seem to be recursive, so re-export these (also in base.h)
#include <stddef.h>
#include "hwy/base.h"
// "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
#if !HWY_IDE
#include <stdint.h>
#endif
#include "hwy/detect_compiler_arch.h"
// Separate header because foreach_target.h re-enables its include guard.
#include "hwy/ops/set_macros-inl.h"
// IWYU pragma: end_exports
#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
#endif
// We are covered by the highway.h include guard, but generic_ops-inl.h
// includes this again #if HWY_IDE.
#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
// all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or aarch64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else
template <class V>
using VecArg = V;
#endif
namespace detail {
// Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the
// desired fraction or multiple of it, see Simd<>. `pow2` is most often in
// [-3, 3] but can also be lower for user-specified fractions.
constexpr size_t ScaleByPower(size_t N, int pow2) {
return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
}
template <typename T>
HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
// Workaround for MSAN not marking compressstore as initialized (b/233326619)
#if HWY_IS_MSAN
__msan_unpoison(unaligned, count * sizeof(T));
#else
(void)unaligned;
(void)count;
#endif
}
} // namespace detail
// Highway operations are implemented as overloaded functions selected using a
// zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type.
//
// N defines how many lanes are in a 'full' vector, typically equal to
// HWY_LANES(T) (which is the actual count on targets with vectors of known
// size, and an upper bound in case of scalable vectors), otherwise a
// user-specified limit at most that large.
//
// 2^kPow2 is a _subsequently_ applied scaling factor that indicates the
// desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3
// means two/four/eight full vectors ganged together. The largest supported
// kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping
// user-specified values to that. Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>`
// have the same `MaxLanes` and `Lanes`.
//
// We can theoretically keep halving Lanes(), but recursive instantiations of
// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count.
// Users must terminate such compile-time recursions at or above HWY_MIN_POW2.
//
// WARNING: do not use N directly because it may be a special representation of
// a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to
// Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two,
// but we want MaxLanes to be the same in both cases. Hence ?? is a
// fixed-point encoding of 1/4.
//
// Instead of referring to Simd<> directly, users create D via aliases:
// - ScalableTag<T> for a full vector;
// - ScalableTag<T, kPow2>() for a fraction/group, where `kPow2` is
// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`;
// - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or
// - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes.
//
// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and
// D().MaxLanes() for a constexpr upper bound. Both are powers of two.
template <typename Lane, size_t N, int kPow2>
struct Simd {
constexpr Simd() = default;
using T = Lane;
private:
static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit");
// 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of
// N when kFrac == 0, otherwise it is one (see FracN).
static constexpr size_t kWhole = N & 0xFFFFF;
// Fractional part is in the bits above kWhole.
static constexpr int kFrac = static_cast<int>(N >> 20);
// Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger
// type to u8 results in fractions).
static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range");
static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1");
static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x");
// Important to check this here because kPow2 <= -64 causes confusing
// compile errors (invalid shift count).
static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?");
// However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to
// Rebind<uint64_t, ScalableTag<uint8_t, 3>> in order to discover that its
// kPow2 is out of bounds.
public:
// Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the
// common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2.
// E.g. Rebind<uint32_t, Simd<uint8_t, 1, 0>> is Simd<uint32_t, 0x200001, 2>.
// The resulting number of lanes is still 1 because this N represents 1/4
// (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of
// the sizes so that the correct LMUL overloads are chosen, even if N is
// small enough that it would fit in an LMUL=1 vector.
//
// Cannot be an enum because GCC warns when using enums and non-enums in the
// same expression. Cannot be a static constexpr function (MSVC limitation).
// Rounded up to one so this is a valid array length.
//
// Do not use this directly - only 'public' so it is visible from the accessor
// macro required by MSVC.
static constexpr size_t kPrivateLanes =
HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac));
constexpr size_t MaxLanes() const { return kPrivateLanes; }
constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); }
constexpr size_t MaxBlocks() const { return (MaxBytes() + 15) / 16; }
// For SFINAE on RVV.
constexpr int Pow2() const { return kPow2; }
// ------------------------------ Changing lane type or count
// Do not use any of these directly. Anything used from member typedefs cannot
// be made private, but functions only used within other functions can.
// Returns number of NewT lanes that fit within MaxBytes().
template <typename NewT>
static constexpr size_t RepartitionLanes() {
// Round up to correctly handle larger NewT.
return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
}
// Returns the new kPow2 required for lanes of type NewT.
template <typename NewT>
static constexpr int RebindPow2() {
return kPow2 +
((sizeof(NewT) >= sizeof(T))
? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
: -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))));
}
private:
// Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
template <int kNewPow2, size_t kNewMaxLanes>
static constexpr size_t WholeN() {
return detail::ScaleByPower(kNewMaxLanes, -kNewPow2);
}
// Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
template <int kNewPow2, size_t kNewMaxLanes>
static constexpr size_t FracN() {
// Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN
// would not have been zero), but clamp to zero to avoid warnings. kFrac is
// the difference, stored in the upper bits of N, and we also set kWhole =
// 1 so that the new kPrivateLanes = kNewMaxLanes.
static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift");
return static_cast<size_t>(
1 + (HWY_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes)))
<< 20));
}
public:
// Returns (whole or fractional) NewN, see above.
template <int kNewPow2, size_t kNewMaxLanes>
static constexpr size_t NewN() {
// We require a fraction if inverting kNewPow2 results in 0.
return WholeN<kNewPow2, kNewMaxLanes>() == 0
? FracN<kNewPow2, kNewMaxLanes>()
: WholeN<kNewPow2, kNewMaxLanes>();
}
// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
template <typename NewT>
using Rebind =
Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>;
// Change lane type while keeping the same vector size, e.g. for MulEven.
template <typename NewT>
using Repartition =
Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>;
// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
using Half = Simd<T, N, kPow2 - 1>;
// Twice the lanes while keeping the same lane type, e.g. for Combine.
using Twice = Simd<T, N, kPow2 + 1>;
};
namespace detail {
template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
return N == HWY_LANES(T) && kPow2 == 0;
}
// Struct wrappers enable validation of arguments via static_assert.
template <typename T, size_t N, int kPow2>
struct ClampNAndPow2 {
using type = Simd<T, HWY_MIN(N, HWY_MAX_N), HWY_MIN(kPow2, HWY_MAX_POW2)>;
};
template <typename T, int kPow2>
struct ScalableTagChecker {
using type = typename ClampNAndPow2<T, HWY_LANES(T), kPow2>::type;
};
template <typename T, size_t kLimit, int kPow2>
struct CappedTagChecker {
static_assert(kLimit != 0, "Does not make sense to have zero lanes");
// Safely handle non-power-of-two inputs by rounding down, which is allowed by
// CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T));
using type = typename ClampNAndPow2<T, N, kPow2>::type;
};
template <typename T, size_t kNumLanes>
struct FixedTagChecker {
static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
using type = Simd<T, kNumLanes, 0>;
};
} // namespace detail
// ------------------------------ Aliases for Simd<>
// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D
// loops where the application does not care about the vector size) or a
// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or
// return values of type promotion and demotion. User-specified kPow2 is
// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
template <typename T, int kPow2 = 0>
using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
// Tag describing a vector with *up to* kLimit active lanes, even on targets
// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may
// be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for
// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8
// DCTs. However, it is better if data structures are designed to be
// vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >=
// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would
// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2
// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type;
#if !HWY_HAVE_SCALABLE
// If the vector size is known, and the app knows it does not want more than
// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower
// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTagIfFixed = CappedTag<T, kLimit, kPow2>;
#else // HWY_HAVE_SCALABLE
// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit.
template <typename T, size_t kLimit, int kPow2 = 0>
using CappedTagIfFixed = ScalableTag<T, kPow2>;
#endif
// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
// two not exceeding `HWY_LANES(T)`.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
// This is useful for data structures that rely on exactly 128-bit SIMD, but
// these are discouraged because they cannot benefit from wider vectors.
// Instead, applications would ideally define a larger problem size and loop
// over it with the (unknown size) vectors from ScalableTag.
//
// + e.g. if the baseline is known to support SIMD, or the application requires
// ops such as TableLookupBytes not supported by HWY_SCALAR.
template <typename T, size_t kNumLanes>
using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
// Convenience form for fixed sizes.
template <typename T>
using Full16 = Simd<T, 2 / sizeof(T), 0>;
template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;
template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
// ------------------------------ Accessors for Simd<>
// Lane type.
template <class D>
using TFromD = typename D::T;
// Upper bound on the number of lanes, typically used for SFINAE conditions and
// to allocate storage for targets with known vector sizes. Note: this may be a
// loose bound, instead use Lanes() as the actual size for AllocateAligned.
// MSVC workaround: use static constant directly instead of a function.
#define HWY_MAX_LANES_D(D) D::kPrivateLanes
// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the
// macro form may be required for MSVC, which has limitations on deducing
// arguments.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
return HWY_MAX_LANES_D(D);
}
#if !HWY_HAVE_SCALABLE
// If non-scalable, this is constexpr; otherwise the target's header defines a
// non-constexpr version of this function. This is the actual vector length,
// used when advancing loop counters.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) {
return HWY_MAX_LANES_D(D);
}
#endif // !HWY_HAVE_SCALABLE
// Tag for the same number of lanes as D, but with the LaneType T.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
template <class D>
using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
template <class D>
using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
template <class D>
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
// Tag for the same total size as D, but with the LaneType T.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
template <class D>
using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
template <class D>
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
// Tag for the same lane type as D, but half the lanes.
template <class D>
using Half = typename D::Half;
// Tag for the same lane type as D, but twice the lanes.
template <class D>
using Twice = typename D::Twice;
// Tag for a 16-byte block with the same lane type as D
#if HWY_HAVE_SCALABLE
namespace detail {
template <class D>
class BlockDFromD_t {};
template <typename T, size_t N, int kPow2>
class BlockDFromD_t<Simd<T, N, kPow2>> {
using D = Simd<T, N, kPow2>;
static constexpr int kNewPow2 = HWY_MIN(kPow2, 0);
static constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(D));
static constexpr size_t kNewN = D::template NewN<kNewPow2, kMaxLpb>();
public:
using type = Simd<T, kNewN, kNewPow2>;
};
} // namespace detail
template <class D>
using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type;
#else
template <class D>
using BlockDFromD =
Simd<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), HWY_MAX_LANES_D(D)), 0>;
#endif
// ------------------------------ Choosing overloads (SFINAE)
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_FLOAT3264_D(D) HWY_IF_FLOAT3264(TFromD<D>)
#define HWY_IF_NOT_FLOAT3264_D(D) HWY_IF_NOT_FLOAT3264(TFromD<D>)
#define HWY_IF_SPECIAL_FLOAT_D(D) HWY_IF_SPECIAL_FLOAT(TFromD<D>)
#define HWY_IF_NOT_SPECIAL_FLOAT_D(D) HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)
#define HWY_IF_FLOAT_OR_SPECIAL_D(D) HWY_IF_FLOAT_OR_SPECIAL(TFromD<D>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D) \
HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)
#define HWY_IF_T_SIZE_D(D, bytes) HWY_IF_T_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_T_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE(TFromD<D>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \
HWY_IF_T_SIZE_ONE_OF(TFromD<D>, bit_array)
#define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_PER_BLOCK_D(D, lanes) \
HWY_IF_LANES_PER_BLOCK( \
TFromD<D>, HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>)), lanes)
#define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf<D().Pow2() <= pow2>* = nullptr
#define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr
#define HWY_IF_U8_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint8_t>()>* = nullptr
#define HWY_IF_U16_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint16_t>()>* = nullptr
#define HWY_IF_U32_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint32_t>()>* = nullptr
#define HWY_IF_U64_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint64_t>()>* = nullptr
#define HWY_IF_I8_D(D) hwy::EnableIf<IsSame<TFromD<D>, int8_t>()>* = nullptr
#define HWY_IF_I16_D(D) hwy::EnableIf<IsSame<TFromD<D>, int16_t>()>* = nullptr
#define HWY_IF_I32_D(D) hwy::EnableIf<IsSame<TFromD<D>, int32_t>()>* = nullptr
#define HWY_IF_I64_D(D) hwy::EnableIf<IsSame<TFromD<D>, int64_t>()>* = nullptr
// Use instead of HWY_IF_T_SIZE_D to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI16_D(D) HWY_IF_UI16(TFromD<D>)
#define HWY_IF_UI32_D(D) HWY_IF_UI32(TFromD<D>)
#define HWY_IF_UI64_D(D) HWY_IF_UI64(TFromD<D>)
#define HWY_IF_BF16_D(D) \
hwy::EnableIf<IsSame<TFromD<D>, hwy::bfloat16_t>()>* = nullptr
#define HWY_IF_F16_D(D) \
hwy::EnableIf<IsSame<TFromD<D>, hwy::float16_t>()>* = nullptr
#define HWY_IF_F32_D(D) hwy::EnableIf<IsSame<TFromD<D>, float>()>* = nullptr
#define HWY_IF_F64_D(D) hwy::EnableIf<IsSame<TFromD<D>, double>()>* = nullptr
#define HWY_IF_V_SIZE_D(D, bytes) \
HWY_IF_V_SIZE(TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_LE_D(D, bytes) \
HWY_IF_V_SIZE_LE(TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_GT_D(D, bytes) \
HWY_IF_V_SIZE_GT(TFromD<D>, HWY_MAX_LANES_D(D), bytes)
// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(TFromV<V>)
#define HWY_IF_SPECIAL_FLOAT_V(V) HWY_IF_SPECIAL_FLOAT(TFromV<V>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromV<V>)
#define HWY_IF_T_SIZE_V(V, bytes) HWY_IF_T_SIZE(TFromV<V>, bytes)
#define HWY_IF_NOT_T_SIZE_V(V, bytes) HWY_IF_NOT_T_SIZE(TFromV<V>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
HWY_IF_T_SIZE_ONE_OF(TFromV<V>, bit_array)
#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV<V>)
#define HWY_IF_V_SIZE_V(V, bytes) \
HWY_IF_V_SIZE(TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_LE_V(V, bytes) \
HWY_IF_V_SIZE_LE(TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_GT_V(V, bytes) \
HWY_IF_V_SIZE_GT(TFromV<V>, HWY_MAX_LANES_V(V), bytes)
// Old names (deprecated)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE

View File

@ -1,125 +0,0 @@
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Tuple support. Included by those ops/* that lack native tuple types, after
// they define VFromD and before they use the tuples e.g. for LoadInterleaved2.
// Assumes we are already in the HWY_NAMESPACE and under an include guard.
// If viewing this header standalone, define VFromD to avoid IDE warnings.
// This is normally set by set_macros-inl.h before this header is included.
#if !defined(HWY_NAMESPACE)
#include "hwy/base.h"
template <class D>
using VFromD = int;
#endif
// On SVE, Vec2..4 are aliases to built-in types.
template <class D>
struct Vec2 {
VFromD<D> v0;
VFromD<D> v1;
};
template <class D>
struct Vec3 {
VFromD<D> v0;
VFromD<D> v1;
VFromD<D> v2;
};
template <class D>
struct Vec4 {
VFromD<D> v0;
VFromD<D> v1;
VFromD<D> v2;
VFromD<D> v3;
};
// D arg is unused but allows deducing D.
template <class D>
HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) {
return Vec2<D>{v0, v1};
}
template <class D>
HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) {
return Vec3<D>{v0, v1, v2};
}
template <class D>
HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
VFromD<D> v3) {
return Vec4<D>{v0, v1, v2, v3};
}
template <size_t kIndex, class D>
HWY_API VFromD<D> Get2(Vec2<D> tuple) {
static_assert(kIndex < 2, "Tuple index out of bounds");
return kIndex == 0 ? tuple.v0 : tuple.v1;
}
template <size_t kIndex, class D>
HWY_API VFromD<D> Get3(Vec3<D> tuple) {
static_assert(kIndex < 3, "Tuple index out of bounds");
return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2;
}
template <size_t kIndex, class D>
HWY_API VFromD<D> Get4(Vec4<D> tuple) {
static_assert(kIndex < 4, "Tuple index out of bounds");
return kIndex == 0 ? tuple.v0
: kIndex == 1 ? tuple.v1
: kIndex == 2 ? tuple.v2
: tuple.v3;
}
template <size_t kIndex, class D>
HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) {
static_assert(kIndex < 2, "Tuple index out of bounds");
if (kIndex == 0) {
tuple.v0 = val;
} else {
tuple.v1 = val;
}
return tuple;
}
template <size_t kIndex, class D>
HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) {
static_assert(kIndex < 3, "Tuple index out of bounds");
if (kIndex == 0) {
tuple.v0 = val;
} else if (kIndex == 1) {
tuple.v1 = val;
} else {
tuple.v2 = val;
}
return tuple;
}
template <size_t kIndex, class D>
HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) {
static_assert(kIndex < 4, "Tuple index out of bounds");
if (kIndex == 0) {
tuple.v0 = val;
} else if (kIndex == 1) {
tuple.v1 = val;
} else if (kIndex == 2) {
tuple.v2 = val;
} else {
tuple.v3 = val;
}
return tuple;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,44 +0,0 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_PER_TARGET_H_
#define HIGHWAY_HWY_PER_TARGET_H_
#include <stddef.h>
#include "hwy/highway_export.h"
// Functions to query the capabilities of the target that will be called by
// HWY_DYNAMIC_DISPATCH, which is not necessarily the current target.
namespace hwy {
// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
//
// Do not cache the result, which may change after calling DisableTargets, or
// if software requests a different vector size (e.g. when entering/exiting SME
// streaming mode). Instead call this right before the code that depends on the
// result, without any DisableTargets or SME transition in-between. Note that
// this involves an indirect call, so prefer not to call this frequently nor
// unnecessarily.
HWY_DLLEXPORT size_t VectorBytes();
// Returns whether 16/64-bit floats are a supported lane type.
HWY_DLLEXPORT bool HaveFloat16();
HWY_DLLEXPORT bool HaveFloat64();
} // namespace hwy
#endif // HIGHWAY_HWY_PER_TARGET_H_

View File

@ -1,62 +0,0 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Print() function
#include "hwy/highway.h"
#include "hwy/print.h"
// Per-target include guard
#if defined(HIGHWAY_HWY_PRINT_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_PRINT_INL_H_
#undef HIGHWAY_HWY_PRINT_INL_H_
#else
#define HIGHWAY_HWY_PRINT_INL_H_
#endif
#if HWY_TARGET == HWY_RVV
#include "hwy/aligned_allocator.h"
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Prints lanes around `lane`, in memory order.
template <class D, class V = VFromD<D>>
HWY_API void Print(const D d, const char* caption, V v, size_t lane_u = 0,
size_t max_lanes = 7) {
const size_t N = Lanes(d);
using T = TFromD<D>;
#if HWY_TARGET == HWY_RVV
auto storage = AllocateAligned<T>(N);
T* HWY_RESTRICT lanes = storage.get();
#else
// This works around an SVE compile error on GCC 11 and 12. Calling
// AllocateAligned here would seem to require it be marked with HWY_ATTR.
HWY_ALIGN T lanes[MaxLanes(d)];
#endif
Store(v, d, lanes);
const auto info = hwy::detail::MakeTypeInfo<T>();
hwy::detail::PrintArray(info, caption, lanes, N, lane_u, max_lanes);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // per-target include guard

View File

@ -1,75 +0,0 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HWY_PRINT_H_
#define HWY_PRINT_H_
// Helpers for printing vector lanes.
#include <stddef.h>
#include <stdio.h>
#include "hwy/base.h"
#include "hwy/highway_export.h"
namespace hwy {
namespace detail {
// For implementing value comparisons etc. as type-erased functions to reduce
// template bloat.
struct TypeInfo {
size_t sizeof_t;
bool is_float;
bool is_signed;
bool is_bf16;
};
template <typename T>
HWY_INLINE TypeInfo MakeTypeInfo() {
TypeInfo info;
info.sizeof_t = sizeof(T);
info.is_float = IsFloat<T>();
info.is_signed = IsSigned<T>();
info.is_bf16 = IsSame<T, bfloat16_t>();
return info;
}
HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
char* string100);
HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
const void* array_void, size_t N,
size_t lane_u = 0, size_t max_lanes = 7);
} // namespace detail
template <typename T>
HWY_NOINLINE void PrintValue(T value) {
char str[100];
detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
fprintf(stderr, "%s,", str);
}
template <typename T>
HWY_NOINLINE void PrintArray(const T* value, size_t count) {
detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
count);
}
} // namespace hwy
#endif // HWY_PRINT_H_

View File

@ -1,148 +0,0 @@
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_ROBUST_STATISTICS_H_
#define HIGHWAY_HWY_ROBUST_STATISTICS_H_
#include <algorithm> // std::sort, std::find_if
#include <limits>
#include <utility> // std::pair
#include <vector>
#include "hwy/base.h"
namespace hwy {
namespace robust_statistics {
// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
// than std::sort for input distributions with very few unique values.
template <class T>
void CountingSort(T* values, size_t num_values) {
// Unique values and their frequency (similar to flat_map).
using Unique = std::pair<T, int>;
std::vector<Unique> unique;
for (size_t i = 0; i < num_values; ++i) {
const T value = values[i];
const auto pos =
std::find_if(unique.begin(), unique.end(),
[value](const Unique u) { return u.first == value; });
if (pos == unique.end()) {
unique.push_back(std::make_pair(value, 1));
} else {
++pos->second;
}
}
// Sort in ascending order of value (pair.first).
std::sort(unique.begin(), unique.end());
// Write that many copies of each unique value to the array.
T* HWY_RESTRICT p = values;
for (const auto& value_count : unique) {
std::fill(p, p + value_count.second, value_count.first);
p += value_count.second;
}
HWY_ASSERT(p == values + num_values);
}
// @return i in [idx_begin, idx_begin + half_count) that minimizes
// sorted[i + half_count] - sorted[i].
template <typename T>
size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
const size_t half_count) {
T min_range = std::numeric_limits<T>::max();
size_t min_idx = 0;
for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
HWY_ASSERT(sorted[idx] <= sorted[idx + half_count]);
const T range = sorted[idx + half_count] - sorted[idx];
if (range < min_range) {
min_range = range;
min_idx = idx;
}
}
return min_idx;
}
// Returns an estimate of the mode by calling MinRange on successively
// halved intervals. "sorted" must be in ascending order. This is the
// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
// estimator of the mode", with complexity O(N log N). The mode is less
// affected by outliers in highly-skewed distributions than the median.
// The averaging operation below assumes "T" is an unsigned integer type.
template <typename T>
T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
size_t idx_begin = 0;
size_t half_count = num_values / 2;
while (half_count > 1) {
idx_begin = MinRange(sorted, idx_begin, half_count);
half_count >>= 1;
}
const T x = sorted[idx_begin + 0];
if (half_count == 0) {
return x;
}
HWY_ASSERT(half_count == 1);
const T average = (x + sorted[idx_begin + 1] + 1) / 2;
return average;
}
// Returns the mode. Side effect: sorts "values".
template <typename T>
T Mode(T* values, const size_t num_values) {
CountingSort(values, num_values);
return ModeOfSorted(values, num_values);
}
template <typename T, size_t N>
T Mode(T (&values)[N]) {
return Mode(&values[0], N);
}
// Returns the median value. Side effect: sorts "values".
template <typename T>
T Median(T* values, const size_t num_values) {
HWY_ASSERT(num_values != 0);
std::sort(values, values + num_values);
const size_t half = num_values / 2;
// Odd count: return middle
if (num_values % 2) {
return values[half];
}
// Even count: return average of middle two.
return (values[half] + values[half - 1] + 1) / 2;
}
// Returns a robust measure of variability.
template <typename T>
T MedianAbsoluteDeviation(const T* values, const size_t num_values,
const T median) {
HWY_ASSERT(num_values != 0);
std::vector<T> abs_deviations;
abs_deviations.reserve(num_values);
for (size_t i = 0; i < num_values; ++i) {
const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
static_cast<int64_t>(median));
abs_deviations.push_back(static_cast<T>(abs));
}
return Median(abs_deviations.data(), num_values);
}
} // namespace robust_statistics
} // namespace hwy
#endif // HIGHWAY_HWY_ROBUST_STATISTICS_H_

View File

@ -1,338 +0,0 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_TARGETS_H_
#define HIGHWAY_HWY_TARGETS_H_
// Allows opting out of C++ standard library usage, which is not available in
// some Compiler Explorer environments.
#ifndef HWY_NO_LIBCXX
#include <vector>
#endif
// For SIMD module implementations and their callers. Defines which targets to
// generate and call.
#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"
#if !HWY_ARCH_RVV && !defined(HWY_NO_LIBCXX)
#include <atomic>
#endif
namespace hwy {
// Returns bitfield of enabled targets that are supported on this CPU; there is
// always at least one such target, hence the return value is never 0. The
// targets returned may change after calling DisableTargets. This function is
// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
// calls to it if there is only a single target enabled.
HWY_DLLEXPORT int64_t SupportedTargets();
// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
// Subsequent SupportedTargets will not return targets whose bit(s) are set in
// `disabled_targets`. Exception: if SupportedTargets would return 0, it will
// instead return HWY_STATIC_TARGET (there must always be one target to call).
//
// This function is useful for disabling targets known to be buggy, or if the
// best available target is undesirable (perhaps due to throttling or memory
// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
// function for iteratively enabling specific targets for testing.
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
// Subsequent SupportedTargets will return the given set of targets, except
// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
// and return to the normal SupportedTargets behavior. Used to run tests for
// all targets.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
#ifndef HWY_NO_LIBCXX
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
std::vector<int64_t> ret;
for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
targets = targets & (targets - 1)) {
int64_t current_target = targets & ~(targets - 1);
ret.push_back(current_target);
}
return ret;
}
#endif // HWY_NO_LIBCXX
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
switch (target) {
#if HWY_ARCH_X86
case HWY_SSE2:
return "SSE2";
case HWY_SSSE3:
return "SSSE3";
case HWY_SSE4:
return "SSE4";
case HWY_AVX2:
return "AVX2";
case HWY_AVX3:
return "AVX3";
case HWY_AVX3_DL:
return "AVX3_DL";
case HWY_AVX3_ZEN4:
return "AVX3_ZEN4";
case HWY_AVX3_SPR:
return "AVX3_SPR";
#endif
#if HWY_ARCH_ARM
case HWY_SVE2_128:
return "SVE2_128";
case HWY_SVE_256:
return "SVE_256";
case HWY_SVE2:
return "SVE2";
case HWY_SVE:
return "SVE";
case HWY_NEON:
return "NEON";
case HWY_NEON_WITHOUT_AES:
return "NEON_WITHOUT_AES";
#endif
#if HWY_ARCH_PPC
case HWY_PPC8:
return "PPC8";
case HWY_PPC9:
return "PPC9";
case HWY_PPC10:
return "PPC10";
#endif
#if HWY_ARCH_WASM
case HWY_WASM:
return "WASM";
case HWY_WASM_EMU256:
return "WASM_EMU256";
#endif
#if HWY_ARCH_RVV
case HWY_RVV:
return "RVV";
#endif
case HWY_EMU128:
return "EMU128";
case HWY_SCALAR:
return "SCALAR";
default:
return "Unknown"; // must satisfy gtest IsValidParamName()
}
}
// The maximum number of dynamic targets on any architecture is defined by
// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
// For the ChosenTarget mask and index we use a different bit arrangement than
// in the HWY_TARGETS mask. Only the targets involved in the current
// architecture are used in this mask, and therefore only the least significant
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
// significant bit is set when the mask is not initialized, the next
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
// that position and the next more significant bit is used for HWY_SCALAR (if
// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
// define equivalent values for HWY_TARGETS in this representation.
// This mask representation allows to use ctz() on this mask and obtain a small
// number that's used as an index of the table for dynamic dispatch. In this
// way the first entry is used when the mask is uninitialized, the following
// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
// scalar.
// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
// current architecture.
#define HWY_CHOSEN_TARGET_SHIFT(X) \
((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
<< 1)
// The HWY_TARGETS mask in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
(HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
#if HWY_ARCH_X86
// Maximum number of dynamic targets, changing this value is an ABI incompatible
// change
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// These must match the order in which the HWY_TARGETS are defined
// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \
HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
nullptr, /* reserved - SSE3? */ \
HWY_CHOOSE_SSE2(func_name) /* SSE2 */
#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
HWY_CHOOSE_SVE(func_name), /* SVE */ \
HWY_CHOOSE_NEON(func_name), /* NEON */ \
HWY_CHOOSE_NEON_WITHOUT_AES(func_name) /* NEON without AES */
#elif HWY_ARCH_RVV
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_RVV(func_name), /* RVV */ \
nullptr /* reserved */
#elif HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_PPC10(func_name), /* PPC10 */ \
HWY_CHOOSE_PPC9(func_name), /* PPC9 */ \
HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
nullptr, /* reserved (VSX or AltiVec) */ \
nullptr /* reserved (VSX or AltiVec) */
#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
HWY_CHOOSE_WASM(func_name), /* WASM */ \
nullptr /* reserved */
#else
// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
// still creating single-entry tables in HWY_EXPORT to ensure portability.
#define HWY_MAX_DYNAMIC_TARGETS 1
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
#endif
// Bitfield of supported and enabled targets. The format differs from that of
// HWY_TARGETS; the lowest bit governs the first function pointer (which is
// special in that it calls FunctionCache, then Update, then dispatches to the
// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
// GetChosenTarget), thread-safe except on RVV.
struct ChosenTarget {
public:
// Reset bits according to `targets` (typically the return value of
// SupportedTargets()). Postcondition: IsInitialized() == true.
void Update(int64_t targets) {
// These are `targets` shifted downwards, see above. Also include SCALAR
// (corresponds to the last entry in the function table) as fallback.
StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
}
// Reset to the uninitialized state, so that FunctionCache will call Update
// during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
void DeInit() { StoreMask(1); }
// Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
// function was called, which we check in tests.
bool IsInitialized() const { return LoadMask() != 1; }
// Return the index in the dynamic dispatch table to be used by the current
// CPU. Note that this method must be in the header file so it uses the value
// of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
// calls it, which may be different from others. This means we only enable
// those targets that were actually compiled in this module.
size_t HWY_INLINE GetIndex() const {
return hwy::Num0BitsBelowLS1Bit_Nonzero64(
static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
}
private:
// TODO(janwas): remove RVV once <atomic> is available
#if HWY_ARCH_RVV || defined(HWY_NO_LIBCXX)
int64_t LoadMask() const { return mask_; }
void StoreMask(int64_t mask) { mask_ = mask; }
int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0.
#else
int64_t LoadMask() const { return mask_.load(); }
void StoreMask(int64_t mask) { mask_.store(mask); }
std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0.
#endif // HWY_ARCH_RVV
};
// For internal use (e.g. by FunctionCache and DisableTargets).
HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
} // namespace hwy
#endif // HIGHWAY_HWY_TARGETS_H_

View File

@ -1,200 +0,0 @@
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// High-resolution and high-precision timer
// Per-target include guard
#if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_TIMER_INL_H_
#undef HIGHWAY_HWY_TIMER_INL_H_
#else
#define HIGHWAY_HWY_TIMER_INL_H_
#endif
#include "hwy/highway.h"
#include "hwy/timer.h"
#if defined(_WIN32) || defined(_WIN64)
#ifndef NOMINMAX
#define NOMINMAX
#endif // NOMINMAX
#include <windows.h>
#endif
#if defined(__APPLE__)
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif
#if defined(__HAIKU__)
#include <OS.h>
#endif
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
#endif
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
#include <intrin.h>
#endif
#include <time.h> // clock_gettime
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace timer {
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
// unsigned to guarantee wraparound on overflow.
using Ticks = uint64_t;
// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
//
// Background: RDTSC is not 'serializing'; earlier instructions may complete
// after it, and/or later instructions may complete before it. 'Fences' ensure
// regions' elapsed times are independent of such reordering. The only
// documented unprivileged serializing instruction is CPUID, which acts as a
// full fence (no reordering across it in either direction). Unfortunately
// the latency of CPUID varies wildly (perhaps made worse by not initializing
// its EAX input). Because it cannot reliably be deducted from the region's
// elapsed time, it must not be included in the region to measure (i.e.
// between the two RDTSC).
//
// The newer RDTSCP is sometimes described as serializing, but it actually
// only serves as a half-fence with release semantics. Although all
// instructions in the region will complete before the final timestamp is
// captured, subsequent instructions may leak into the region and increase the
// elapsed time. Inserting another fence after the final RDTSCP would prevent
// such reordering without affecting the measured region.
//
// Fortunately, such a fence exists. The LFENCE instruction is only documented
// to delay later loads until earlier loads are visible. However, Intel's
// reference manual says it acts as a full fence (waiting until all earlier
// instructions have completed, and delaying later instructions until it
// completes). AMD assigns the same behavior to MFENCE.
//
// We need a fence before the initial RDTSC to prevent earlier instructions
// from leaking into the region, and arguably another after RDTSC to avoid
// region instructions from completing before the timestamp is recorded.
// When surrounded by fences, the additional RDTSCP half-fence provides no
// benefit, so the initial timestamp can be recorded via RDTSC, which has
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
//
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
// However, Stop+Stop includes an LFENCE in the region measurements, which
// adds a delay dependent on earlier loads. The combination of Start+Stop
// is faster than Start+Start and more consistent than Stop+Stop because
// the first LFENCE already delayed subsequent loads before the measured
// region. This combination seems not to have been considered in prior work:
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
//
// Note: performance counters can measure 'exact' instructions-retired or
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
// requires fences. Unfortunately, it is not accessible on all OSes and we
// prefer to avoid kernel-mode drivers. Performance counters are also affected
// by several under/over-count errata, so we use the TSC instead.
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond.
inline Ticks Start() {
Ticks t;
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
t = __rdtsc();
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
#elif HWY_ARCH_X86_64
asm volatile(
"lfence\n\t"
"rdtsc\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(t)
:
// "memory" avoids reordering. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
asm volatile("rdtime %0" : "=r"(t));
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);
t = counter.QuadPart;
#elif defined(__APPLE__)
t = mach_absolute_time();
#elif defined(__HAIKU__)
t = system_time_nsecs(); // since boot
#else // POSIX
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
#endif
return t;
}
// WARNING: on x86, caller must check HasRDTSCP before using this!
inline Ticks Stop() {
uint64_t t;
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
unsigned aux;
t = __rdtscp(&aux);
_ReadWriteBarrier();
_mm_lfence();
_ReadWriteBarrier();
#elif HWY_ARCH_X86_64
// Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
asm volatile(
"rdtscp\n\t"
"shl $32, %%rdx\n\t"
"or %%rdx, %0\n\t"
"lfence"
: "=a"(t)
:
// "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
// "cc" = flags modified by SHL.
: "rcx", "rdx", "memory", "cc");
#else
t = Start();
#endif
return t;
}
} // namespace timer
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // per-target include guard

View File

@ -1,55 +0,0 @@
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_TIMER_H_
#define HIGHWAY_HWY_TIMER_H_
// Platform-specific timer functions. Provides Now() and functions for
// interpreting and converting the timer-inl.h Ticks.
#include <stdint.h>
#include "hwy/highway_export.h"
namespace hwy {
namespace platform {
// Returns current timestamp [in seconds] relative to an unspecified origin.
// Features: monotonic (no negative elapsed time), steady (unaffected by system
// time changes), high-resolution (on the order of microseconds).
// Uses InvariantTicksPerSecond and the baseline version of timer::Start().
HWY_DLLEXPORT double Now();
// Functions for use with timer-inl.h:
// Returns whether it is safe to call timer::Stop without executing an illegal
// instruction; if false, fills cpu100 (a pointer to a 100 character buffer)
// with the CPU brand string or an empty string if unknown.
HWY_DLLEXPORT bool HaveTimerStop(char* cpu100);
// Returns tick rate, useful for converting timer::Ticks to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
HWY_DLLEXPORT double InvariantTicksPerSecond();
// Returns ticks elapsed in back to back timer calls, i.e. a function of the
// timer resolution (minimum measurable difference) and overhead.
// This call is expensive, callers should cache the result.
HWY_DLLEXPORT uint64_t TimerResolution();
} // namespace platform
} // namespace hwy
#endif // HIGHWAY_HWY_TIMER_H_

View File

@ -1,19 +0,0 @@
#----------------------------------------------------------------
# Generated CMake target import file for configuration "Release".
#----------------------------------------------------------------
# Commands may need to know the format version.
set(CMAKE_IMPORT_FILE_VERSION 1)
# Import target "hwy::hwy" for configuration "Release"
set_property(TARGET hwy::hwy APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
set_target_properties(hwy::hwy PROPERTIES
IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX"
IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libhwy.a"
)
list(APPEND _cmake_import_check_targets hwy::hwy )
list(APPEND _cmake_import_check_files_for_hwy::hwy "${_IMPORT_PREFIX}/lib/libhwy.a" )
# Commands beyond this point should not need to know the version.
set(CMAKE_IMPORT_FILE_VERSION)

View File

@ -1,70 +0,0 @@
# This is a basic version file for the Config-mode of find_package().
# It is used by write_basic_package_version_file() as input file for configure_file()
# to create a version-file which can be installed along a config.cmake file.
#
# The created file sets PACKAGE_VERSION_EXACT if the current version string and
# the requested version string are exactly the same and it sets
# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version,
# but only if the requested major version is the same as the current one.
# The variable CVF_VERSION must be set before calling configure_file().
set(PACKAGE_VERSION "1.0.6")
if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
set(PACKAGE_VERSION_COMPATIBLE FALSE)
else()
if("1.0.6" MATCHES "^([0-9]+)\\.")
set(CVF_VERSION_MAJOR "${CMAKE_MATCH_1}")
if(NOT CVF_VERSION_MAJOR VERSION_EQUAL 0)
string(REGEX REPLACE "^0+" "" CVF_VERSION_MAJOR "${CVF_VERSION_MAJOR}")
endif()
else()
set(CVF_VERSION_MAJOR "1.0.6")
endif()
if(PACKAGE_FIND_VERSION_RANGE)
# both endpoints of the range must have the expected major version
math (EXPR CVF_VERSION_MAJOR_NEXT "${CVF_VERSION_MAJOR} + 1")
if (NOT PACKAGE_FIND_VERSION_MIN_MAJOR STREQUAL CVF_VERSION_MAJOR
OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND NOT PACKAGE_FIND_VERSION_MAX_MAJOR STREQUAL CVF_VERSION_MAJOR)
OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND NOT PACKAGE_FIND_VERSION_MAX VERSION_LESS_EQUAL CVF_VERSION_MAJOR_NEXT)))
set(PACKAGE_VERSION_COMPATIBLE FALSE)
elseif(PACKAGE_FIND_VERSION_MIN_MAJOR STREQUAL CVF_VERSION_MAJOR
AND ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS_EQUAL PACKAGE_FIND_VERSION_MAX)
OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MAX)))
set(PACKAGE_VERSION_COMPATIBLE TRUE)
else()
set(PACKAGE_VERSION_COMPATIBLE FALSE)
endif()
else()
if(PACKAGE_FIND_VERSION_MAJOR STREQUAL CVF_VERSION_MAJOR)
set(PACKAGE_VERSION_COMPATIBLE TRUE)
else()
set(PACKAGE_VERSION_COMPATIBLE FALSE)
endif()
if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
set(PACKAGE_VERSION_EXACT TRUE)
endif()
endif()
endif()
# if the installed project requested no architecture check, don't perform the check
if("FALSE")
return()
endif()
# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:
if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "8" STREQUAL "")
return()
endif()
# check that the installed version has the same 32/64bit-ness as the one which is currently searching:
if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "8")
math(EXPR installedBits "8 * 8")
set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)")
set(PACKAGE_VERSION_UNSUITABLE TRUE)
endif()

View File

@ -1,104 +0,0 @@
# Generated by CMake
if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
message(FATAL_ERROR "CMake >= 2.8.0 required")
endif()
if(CMAKE_VERSION VERSION_LESS "2.8.3")
message(FATAL_ERROR "CMake >= 2.8.3 required")
endif()
cmake_policy(PUSH)
cmake_policy(VERSION 2.8.3...3.23)
#----------------------------------------------------------------
# Generated CMake target import file.
#----------------------------------------------------------------
# Commands may need to know the format version.
set(CMAKE_IMPORT_FILE_VERSION 1)
# Protect against multiple inclusion, which would fail when already imported targets are added once more.
set(_cmake_targets_defined "")
set(_cmake_targets_not_defined "")
set(_cmake_expected_targets "")
foreach(_cmake_expected_target IN ITEMS hwy::hwy)
list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
if(TARGET "${_cmake_expected_target}")
list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
else()
list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
endif()
endforeach()
unset(_cmake_expected_target)
if(_cmake_targets_defined STREQUAL _cmake_expected_targets)
unset(_cmake_targets_defined)
unset(_cmake_targets_not_defined)
unset(_cmake_expected_targets)
unset(CMAKE_IMPORT_FILE_VERSION)
cmake_policy(POP)
return()
endif()
if(NOT _cmake_targets_defined STREQUAL "")
string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
endif()
unset(_cmake_targets_defined)
unset(_cmake_targets_not_defined)
unset(_cmake_expected_targets)
# Compute the installation prefix relative to this file.
get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
if(_IMPORT_PREFIX STREQUAL "/")
set(_IMPORT_PREFIX "")
endif()
# Create imported target hwy::hwy
add_library(hwy::hwy STATIC IMPORTED)
set_target_properties(hwy::hwy PROPERTIES
INTERFACE_COMPILE_DEFINITIONS "TOOLCHAIN_MISS_SYS_AUXV_H;TOOLCHAIN_MISS_ASM_HWCAP_H;HWY_STATIC_DEFINE"
INTERFACE_COMPILE_FEATURES "cxx_std_11"
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
)
# Load information for each installed configuration.
file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/hwy-config-*.cmake")
foreach(_cmake_config_file IN LISTS _cmake_config_files)
include("${_cmake_config_file}")
endforeach()
unset(_cmake_config_file)
unset(_cmake_config_files)
# Cleanup temporary variables.
set(_IMPORT_PREFIX)
# Loop over all imported files and verify that they actually exist
foreach(_cmake_target IN LISTS _cmake_import_check_targets)
foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
if(NOT EXISTS "${_cmake_file}")
message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
\"${_cmake_file}\"
but this file does not exist. Possible reasons include:
* The file was deleted, renamed, or moved to another location.
* An install or uninstall procedure did not complete successfully.
* The installation package was faulty and contained
\"${CMAKE_CURRENT_LIST_FILE}\"
but not all the files it references.
")
endif()
endforeach()
unset(_cmake_file)
unset("_cmake_import_check_files_for_${_cmake_target}")
endforeach()
unset(_cmake_target)
unset(_cmake_import_check_targets)
# This file does not depend on other imported targets which have
# been exported from the same project but in a separate export set.
# Commands beyond this point should not need to know the version.
set(CMAKE_IMPORT_FILE_VERSION)
cmake_policy(POP)

Binary file not shown.

View File

@ -1,10 +0,0 @@
prefix=
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: libhwy
Description: Efficient and performance-portable SIMD wrapper
Version: 1.0.6
Libs: -L${libdir} -lhwy
Cflags: -I${includedir} -DHWY_STATIC_DEFINE