mirror of
https://github.com/ZDoom/zdoom-macos-deps.git
synced 2024-11-21 19:41:15 +00:00
deps: add highway 1.0.6 files
This commit is contained in:
parent
ad7c5ca771
commit
97f7b194d3
35 changed files with 69716 additions and 0 deletions
211
deps/highway/include/hwy/aligned_allocator.h
vendored
Normal file
211
deps/highway/include/hwy/aligned_allocator.h
vendored
Normal file
|
@ -0,0 +1,211 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
||||
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
||||
|
||||
// Memory allocator with support for alignment and offsets.
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
|
||||
// requires a literal. This matches typical L1 cache line sizes, which prevents
|
||||
// false sharing.
|
||||
#define HWY_ALIGNMENT 64
|
||||
|
||||
// Pointers to functions equivalent to malloc/free with an opaque void* passed
|
||||
// to them.
|
||||
// Allocation callback with a malloc-like contract: receives the caller's
// `opaque` context pointer and the number of `bytes` requested, and returns
// the allocated memory.
using AllocPtr = void* (*)(void* opaque, size_t bytes);
// Deallocation callback with a free-like contract: receives the caller's
// `opaque` context pointer and the `memory` to release.
using FreePtr = void (*)(void* opaque, void* memory);
|
||||
|
||||
// Returns null or a pointer to at least `payload_size` (which can be zero)
|
||||
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
|
||||
// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
|
||||
// memory or malloc() if it is null.
|
||||
// Declaration only; the definition lives outside this header.
// `payload_size` is the number of usable bytes requested, `alloc_ptr` is the
// optional custom allocator and `opaque_ptr` is the context value handed to
// `alloc_ptr`.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
                                         AllocPtr alloc_ptr, void* opaque_ptr);

// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr);
|
||||
|
||||
// Class that deletes the aligned pointer passed to operator() calling the
|
||||
// destructor before freeing the pointer. This is equivalent to the
|
||||
// std::default_delete but for aligned objects. For a similar deleter equivalent
|
||||
// to free() for aligned memory see AlignedFreer().
|
||||
class AlignedDeleter {
|
||||
public:
|
||||
AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
|
||||
AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
|
||||
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
|
||||
|
||||
template <typename T>
|
||||
void operator()(T* aligned_pointer) const {
|
||||
return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
|
||||
TypedArrayDeleter<T>);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
|
||||
size_t elems = size_in_bytes / sizeof(T);
|
||||
for (size_t i = 0; i < elems; i++) {
|
||||
// Explicitly call the destructor on each element.
|
||||
(static_cast<T*>(ptr) + i)->~T();
|
||||
}
|
||||
}
|
||||
|
||||
// Function prototype that calls the destructor for each element in a typed
|
||||
// array. TypeArrayDeleter<T> would match this prototype.
|
||||
using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
|
||||
|
||||
HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
|
||||
FreePtr free_ptr,
|
||||
void* opaque_ptr,
|
||||
ArrayDeleter deleter);
|
||||
|
||||
FreePtr free_;
|
||||
void* opaque_ptr_;
|
||||
};
|
||||
|
||||
// Unique pointer to T with custom aligned deleter. This can be a single
|
||||
// element U or an array of element if T is a U[]. The custom aligned deleter
|
||||
// will call the destructor on U or each element of a U[] in the array case.
|
||||
template <typename T>
|
||||
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
|
||||
|
||||
// Aligned memory equivalent of make_unique<T> using the custom allocators
|
||||
// alloc/free with the passed `opaque` pointer. This function calls the
|
||||
// constructor with the passed Args... and calls the destructor of the object
|
||||
// when the AlignedUniquePtr is destroyed.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
|
||||
void* opaque, Args&&... args) {
|
||||
T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
|
||||
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
|
||||
AlignedDeleter(free, opaque));
|
||||
}
|
||||
|
||||
// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
|
||||
// functions.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
|
||||
T* ptr = static_cast<T*>(AllocateAlignedBytes(
|
||||
sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
|
||||
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
|
||||
AlignedDeleter());
|
||||
}
|
||||
|
||||
// Helpers for array allocators (avoids overflow)
|
||||
namespace detail {
|
||||
|
||||
// Returns x such that 1u << x == n (if n is a power of two).
|
||||
// Returns floor(log2(n)), with 0 for n <= 1. For a power of two this is the
// shift amount x with (1u << x) == n. Written as a single recursive return
// statement so it remains a valid C++11 constexpr function.
static inline constexpr size_t ShiftCount(size_t n) {
  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
}
|
||||
|
||||
template <typename T>
|
||||
T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
|
||||
constexpr size_t size = sizeof(T);
|
||||
|
||||
constexpr bool is_pow2 = (size & (size - 1)) == 0;
|
||||
constexpr size_t bits = ShiftCount(size);
|
||||
static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
|
||||
|
||||
const size_t bytes = is_pow2 ? items << bits : items * size;
|
||||
const size_t check = is_pow2 ? bytes >> bits : bytes / size;
|
||||
if (check != items) {
|
||||
return nullptr; // overflowed
|
||||
}
|
||||
return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Aligned memory equivalent of make_unique<T[]> for array types using the
|
||||
// custom allocators alloc/free. This function calls the constructor with the
|
||||
// passed Args... on every created item. The destructor of each element will be
|
||||
// called when the AlignedUniquePtr is destroyed.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
|
||||
size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
|
||||
T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
|
||||
if (ptr != nullptr) {
|
||||
for (size_t i = 0; i < items; i++) {
|
||||
new (ptr + i) T(std::forward<Args>(args)...);
|
||||
}
|
||||
}
|
||||
return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
|
||||
return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
|
||||
items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
|
||||
// but for aligned memory.
|
||||
class AlignedFreer {
|
||||
public:
|
||||
// Pass address of this to ctor to skip deleting externally-owned memory.
|
||||
static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
|
||||
|
||||
AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
|
||||
AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
|
||||
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
|
||||
|
||||
template <typename T>
|
||||
void operator()(T* aligned_pointer) const {
|
||||
// TODO(deymo): assert that we are using a POD type T.
|
||||
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
|
||||
}
|
||||
|
||||
private:
|
||||
FreePtr free_;
|
||||
void* opaque_ptr_;
|
||||
};
|
||||
|
||||
// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
|
||||
// data use AlignedUniquePtr.
|
||||
template <typename T>
|
||||
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
|
||||
|
||||
// Allocate an aligned and uninitialized array of POD values as a unique_ptr.
|
||||
// Upon destruction of the unique_ptr the aligned array will be freed.
|
||||
template <typename T>
|
||||
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
|
||||
FreePtr free, void* opaque) {
|
||||
return AlignedFreeUniquePtr<T[]>(
|
||||
detail::AllocateAlignedItems<T>(items, alloc, opaque),
|
||||
AlignedFreer(free, opaque));
|
||||
}
|
||||
|
||||
// Same as previous AllocateAligned(), using default allocate/free functions.
|
||||
template <typename T>
|
||||
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
|
||||
return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
1323
deps/highway/include/hwy/base.h
vendored
Normal file
1323
deps/highway/include/hwy/base.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
108
deps/highway/include/hwy/cache_control.h
vendored
Normal file
108
deps/highway/include/hwy/cache_control.h
vendored
Normal file
|
@ -0,0 +1,108 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
|
||||
#define HIGHWAY_HWY_CACHE_CONTROL_H_
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
|
||||
// https://github.com/gperftools/gperftools/issues/946).
|
||||
// Force-define HWY_DISABLE_CACHE_CONTROL when SSE2 is not enabled or when
// building 32-bit x86 with Clang. The #undef avoids a redefinition if the
// user already defined the macro.
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
#undef HWY_DISABLE_CACHE_CONTROL
#define HWY_DISABLE_CACHE_CONTROL
#endif
|
||||
|
||||
// intrin.h is sufficient on MSVC and already included by base.h.
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
|
||||
#include <emmintrin.h> // SSE2
|
||||
#include <xmmintrin.h> // _mm_prefetch
|
||||
#endif
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
|
||||
#define HWY_STREAM_MULTIPLE 16
|
||||
|
||||
// The following functions may also require an attribute.
|
||||
// HWY_ATTR_CACHE marks the functions below as SSE2 targets on non-MSVC x86
// builds where cache control is enabled; elsewhere it expands to nothing.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#else
#define HWY_ATTR_CACHE
#endif
|
||||
|
||||
// Windows.h #defines this, which causes infinite recursion. Temporarily
|
||||
// undefine to avoid conflict with our function.
|
||||
// TODO(janwas): remove when this function is removed.
|
||||
#pragma push_macro("LoadFence")
|
||||
#undef LoadFence
|
||||
|
||||
// Delays subsequent loads until prior loads are visible. Beware of potentially
|
||||
// differing behavior across architectures and vendors: on Intel but not
|
||||
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
|
||||
// complete).
|
||||
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_lfence();
|
||||
#endif
|
||||
}
|
||||
|
||||
// TODO(janwas): remove when this function is removed. (See above.)
|
||||
#pragma pop_macro("LoadFence")
|
||||
|
||||
// Ensures values written by previous `Stream` calls are visible on the current
|
||||
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
|
||||
// outputs are to be consumed by other core(s), the producer must publish
|
||||
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
|
||||
// Issues _mm_sfence() on x86 when cache control is enabled; otherwise the
// function body is empty.
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_sfence();
#endif
}
|
||||
|
||||
// Optionally begins loading the cache line containing "p" to reduce latency of
|
||||
// subsequent actual loads.
|
||||
template <typename T>
|
||||
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
|
||||
#elif HWY_COMPILER_GCC // includes clang
|
||||
// Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
|
||||
// desirable, so use the default 3 (keep in caches).
|
||||
__builtin_prefetch(p, /*write=*/0, /*hint=*/3);
|
||||
#else
|
||||
(void)p;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Invalidates and flushes the cache line containing "p", if possible.
|
||||
// Calls _mm_clflush(p) on x86 when cache control is enabled; otherwise `p`
// is ignored and nothing happens.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_clflush(p);
#else
  (void)p;  // unused on this platform
#endif
}
|
||||
|
||||
// When called inside a spin-loop, may reduce power consumption.
|
||||
// Calls _mm_pause() on x86 when cache control is enabled; otherwise the
// function body is empty.
HWY_INLINE HWY_ATTR_CACHE void Pause() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_pause();
#endif
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_
|
281
deps/highway/include/hwy/detect_compiler_arch.h
vendored
Normal file
281
deps/highway/include/hwy/detect_compiler_arch.h
vendored
Normal file
|
@ -0,0 +1,281 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
|
||||
#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
|
||||
|
||||
// Detects compiler and arch from predefined macros. Zero dependencies for
|
||||
// inclusion by foreach_target.h.
|
||||
|
||||
// Add to #if conditions to prevent IDE from graying out code.
|
||||
// HWY_IDE is 1 when any of the listed IDE/code-indexer macros is defined,
// otherwise 0.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
    (defined Q_CREATOR_RUN) || (defined __CLANGD__) ||        \
    (defined GROK_ELLIPSIS_BUILD)
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Compiler
|
||||
|
||||
// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
|
||||
// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
|
||||
// HWY_COMPILER_MSVC: _MSC_VER for genuine MSVC, else 0.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif

// HWY_COMPILER_CLANGCL: _MSC_VER when both _MSC_VER and __clang__ are
// defined (clang-cl), else 0. At most one of the two macros is nonzero.
#if defined(_MSC_VER) && defined(__clang__)
#define HWY_COMPILER_CLANGCL _MSC_VER
#else
#define HWY_COMPILER_CLANGCL 0
#endif
|
||||
|
||||
// HWY_COMPILER_ICC: value of __INTEL_COMPILER if defined, else 0.
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif

// HWY_COMPILER_ICX: value of __INTEL_LLVM_COMPILER if defined, else 0.
#ifdef __INTEL_LLVM_COMPILER
#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
#else
#define HWY_COMPILER_ICX 0
#endif
|
||||
|
||||
// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
|
||||
// compiler extensions (eg. Clang, Intel...)
|
||||
// Encoded as major * 100 + minor of the reported GNU C version; 0 if
// __GNUC__ is not defined.
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
|
||||
|
||||
// Clang or clang-cl, not GCC.
|
||||
// HWY_COMPILER_CLANG is major * 100 + minor; HWY_COMPILER3_CLANG additionally
// encodes the patch level (major * 10000 + minor * 100 + patch). Both are 0
// when not compiling with Clang.
#ifdef __clang__
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Originally based on
// https://github.com/simd-everywhere/simde/blob/47d6e603de9d04ee05cdfbc57cf282a02be1bf2a/simde/simde-detect-clang.h#L59.
// Please send updates below to them as well, thanks!
#if defined(__apple_build_version__) || __clang_major__ >= 999
#if __has_attribute(nouwtable)  // no new warnings in 16.0
#define HWY_COMPILER_CLANG 1600
#elif __has_warning("-Warray-parameter")
#define HWY_COMPILER_CLANG 1500
#elif __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
#define HWY_COMPILER_CLANG 1300
#elif __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
    __has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
// based on Clang 7, but does not support the warning we test.
// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
// https://trac.macports.org/wiki/XcodeVersionInfo.
#elif __has_warning("-Wc++98-compat-extra-semi") || \
    (defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else  // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif  // __has_warning chain
// The patch level is unknown for a deduced version, so it is reported as 0.
#define HWY_COMPILER3_CLANG (HWY_COMPILER_CLANG * 100)
#else  // use normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#define HWY_COMPILER3_CLANG \
  (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
#endif
#else  // Not clang
#define HWY_COMPILER_CLANG 0
#define HWY_COMPILER3_CLANG 0
#endif
|
||||
|
||||
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && !HWY_COMPILER_ICC
|
||||
#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
|
||||
#else
|
||||
#define HWY_COMPILER_GCC_ACTUAL 0
|
||||
#endif
|
||||
|
||||
// More than one may be nonzero, but we want at least one.
|
||||
#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
|
||||
HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
|
||||
#error "Unsupported compiler"
|
||||
#endif
|
||||
|
||||
// We should only detect one of these (only clang/clangcl overlap)
|
||||
#if 1 < \
|
||||
(!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
|
||||
!!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
|
||||
#error "Detected multiple compilers"
|
||||
#endif
|
||||
|
||||
// Wrappers around the compiler's __has_* feature-test operators. Each one
// expands to 0 when the compiler does not provide the underlying operator, so
// they are safe to use in #if on any compiler.
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif

#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif

#ifdef __has_cpp_attribute
#define HWY_HAS_CPP_ATTRIBUTE(name) __has_cpp_attribute(name)
#else
#define HWY_HAS_CPP_ATTRIBUTE(name) 0
#endif

#ifdef __has_feature
#define HWY_HAS_FEATURE(name) __has_feature(name)
#else
#define HWY_HAS_FEATURE(name) 0
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Architecture
|
||||
|
||||
// 32-bit x86, from the GCC/Clang (__i386__) or MSVC (_M_IX86) macro.
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif

// 64-bit x86, from the GCC/Clang (__x86_64__) or MSVC (_M_X64) macro.
#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif

#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif

// Either flavor of x86.
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif
|
||||
|
||||
// PowerPC, 32- or 64-bit, from any of the listed predefined macros.
#if defined(__powerpc64__) || defined(_M_PPC) || defined(__powerpc__)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif
|
||||
|
||||
// aarch32 is currently not supported; please raise an issue if you want it.
|
||||
// 64-bit Arm (AArch64).
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif

// 32-bit Armv7.
#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif

#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif

// Any *supported* version of Arm, i.e. 7 or later
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif

// Older than Armv7 (e.g. armel aka Armv5) => we do not support SIMD.
#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
#define HWY_ARCH_ARM_OLD 1
#else
#define HWY_ARCH_ARM_OLD 0
#endif
|
||||
|
||||
// WebAssembly, including Emscripten builds.
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif

// RISC-V: set whenever __riscv is defined.
#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif
|
||||
|
||||
// It is an error to detect multiple architectures at the same time, but OK to
|
||||
// detect none of the above.
|
||||
// Each HWY_ARCH_* flag is 0 or 1, so a sum above 1 means two architectures
// were detected at once.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
     HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
|
||||
|
||||
// Windows (32- or 64-bit).
#if defined(_WIN32) || defined(_WIN64)
#define HWY_OS_WIN 1
#else
#define HWY_OS_WIN 0
#endif

// Linux.
#if defined(linux) || defined(__linux__)
#define HWY_OS_LINUX 1
#else
#define HWY_OS_LINUX 0
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Endianness
|
||||
|
||||
// Defines HWY_IS_LITTLE_ENDIAN and HWY_IS_BIG_ENDIAN so that exactly one of
// them is 1. MSVC is handled by target (it is not queried for __BYTE_ORDER__
// here); other compilers are queried via __BYTE_ORDER__.
#if HWY_COMPILER_MSVC
#if HWY_ARCH_PPC && defined(_XBOX_VER) && _XBOX_VER >= 200
// XBox 360 is big-endian
#define HWY_IS_LITTLE_ENDIAN 0
#define HWY_IS_BIG_ENDIAN 1
#else
// All other targets supported by MSVC are little-endian
#define HWY_IS_LITTLE_ENDIAN 1
#define HWY_IS_BIG_ENDIAN 0
#endif  // HWY_ARCH_PPC && defined(_XBOX_VER) && _XBOX_VER >= 200
#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define HWY_IS_LITTLE_ENDIAN 1
#define HWY_IS_BIG_ENDIAN 0
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define HWY_IS_LITTLE_ENDIAN 0
#define HWY_IS_BIG_ENDIAN 1
#else
#error "Unable to detect endianness or unsupported byte order"
#endif

#if (HWY_IS_LITTLE_ENDIAN + HWY_IS_BIG_ENDIAN) != 1
#error "Must only detect one byte order"
#endif
|
||||
|
||||
#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
|
644
deps/highway/include/hwy/detect_targets.h
vendored
Normal file
644
deps/highway/include/hwy/detect_targets.h
vendored
Normal file
|
@ -0,0 +1,644 @@
|
|||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
|
||||
#define HIGHWAY_HWY_DETECT_TARGETS_H_
|
||||
|
||||
// Defines targets and chooses which to enable.
|
||||
|
||||
#include "hwy/detect_compiler_arch.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Optional configuration
|
||||
|
||||
// See g3doc/quick_reference.md for documentation of these macros.
|
||||
|
||||
// Uncomment to override the default baseline determined from predefined macros:
|
||||
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
|
||||
|
||||
// Uncomment to override the default blocklist:
|
||||
// #define HWY_BROKEN_TARGETS HWY_AVX3
|
||||
|
||||
// Uncomment to definitely avoid generating those target(s):
|
||||
// #define HWY_DISABLED_TARGETS HWY_SSE4
|
||||
|
||||
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
|
||||
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
|
||||
// #define HWY_DISABLE_BMI2_FMA
|
||||
|
||||
// Uncomment to enable these on MSVC even if the predefined macros are not set.
|
||||
// #define HWY_WANT_SSE2 1
|
||||
// #define HWY_WANT_SSSE3 1
|
||||
// #define HWY_WANT_SSE4 1
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Targets
|
||||
|
||||
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
|
||||
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
|
||||
//
|
||||
// All values are unconditionally defined so we can test HWY_TARGETS without
|
||||
// first checking the HWY_ARCH_*.
|
||||
//
|
||||
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
|
||||
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
|
||||
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
|
||||
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
|
||||
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
|
||||
|
||||
// --------------------------- x86: 15 targets (+ one fallback)
|
||||
// Bits 0..3 reserved (4 targets)
|
||||
#define HWY_AVX3_SPR (1LL << 4)
// Bit 5 reserved (likely AVX10.2 with 256-bit vectors)
// Currently HWY_AVX3_DL plus a special case for CompressStore (10x as fast).
// We may later also use VPCONFLICT.
#define HWY_AVX3_ZEN4 (1LL << 6)  // see HWY_WANT_AVX3_ZEN4 below

// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
// VAES, BITALG, GFNI). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is
// only in Tiger Lake?
#define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 (1LL << 8)     // HWY_AVX2 plus AVX-512F/BW/CD/DQ/VL
#define HWY_AVX2 (1LL << 9)     // HWY_SSE4 plus BMI2 + F16 + FMA
// Bit 10: reserved
#define HWY_SSE4 (1LL << 11)   // SSE4.2 plus AES + CLMUL
#define HWY_SSSE3 (1LL << 12)  // S-SSE3
// Bit 13: reserved for SSE3
#define HWY_SSE2 (1LL << 14)
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 14

// --------------------------- Arm: 15 targets (+ one fallback)
// Bits 15..23 reserved (9 targets)
#define HWY_SVE2_128 (1LL << 24)  // specialized target (e.g. Arm N2)
#define HWY_SVE_256 (1LL << 25)   // specialized target (e.g. Arm V1)
#define HWY_SVE2 (1LL << 26)
#define HWY_SVE (1LL << 27)
#define HWY_NEON (1LL << 28)  // Implies support for AES
#define HWY_NEON_WITHOUT_AES (1LL << 29)
#define HWY_HIGHEST_TARGET_BIT_ARM 29

// --------------------------- RISC-V: 9 targets (+ one fallback)
// Bits 30..36 reserved (7 targets)
#define HWY_RVV (1LL << 37)
// Bit 38 reserved
#define HWY_HIGHEST_TARGET_BIT_RVV 38

// --------------------------- Future expansion: 4 targets
// Bits 39..42 reserved

// --------------------------- IBM Power: 9 targets (+ one fallback)
// Bits 43..46 reserved (4 targets)
#define HWY_PPC10 (1LL << 47)  // v3.1
#define HWY_PPC9 (1LL << 48)   // v3.0
#define HWY_PPC8 (1LL << 49)   // v2.07
// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
#define HWY_HIGHEST_TARGET_BIT_PPC 51

// --------------------------- WebAssembly: 9 targets (+ one fallback)
// Bits 52..57 reserved (6 targets)
#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
#define HWY_WASM (1LL << 59)
// Bit 60 reserved
#define HWY_HIGHEST_TARGET_BIT_WASM 60

// --------------------------- Emulation: 2 targets

#define HWY_EMU128 (1LL << 61)
// We do not add/left-shift, so this will not overflow to a negative number.
#define HWY_SCALAR (1LL << 62)
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
|
||||
|
||||
// Do not use bit 63 - would be confusing to have negative numbers.
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Set default blocklists
|
||||
|
||||
// Disabled means excluded from enabled at user's request. A separate config
|
||||
// macro allows disabling without deactivating the blocklist below.
|
||||
// Defaults to no disabled targets unless the user defined the macro earlier.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif
|
||||
|
||||
// Broken means excluded from enabled due to known compiler issues. We define
|
||||
// separate HWY_BROKEN_* and then OR them together (more than one might apply).
|
||||
|
||||
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
|
||||
// SSE4 codegen (possibly only for msan), so disable all those targets.
|
||||
// Applies only to x86 builds with a Clang version number below 700.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
|
||||
|
||||
// X | (X - 1) sets the HWY_SSE4 bit and every bit below it, so this blocklists
// SSE4 together with all targets that have a lower bit value.
#define HWY_BROKEN_CLANG6 (HWY_SSE4 | (HWY_SSE4 - 1))
|
||||
// This entails a major speed reduction, so warn unless the user explicitly
|
||||
// opts in to scalar-only.
|
||||
#if !defined(HWY_COMPILE_ONLY_SCALAR)
|
||||
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
|
||||
#endif
|
||||
|
||||
// Any other compiler/architecture: nothing is blocklisted for this reason.
#else
|
||||
#define HWY_BROKEN_CLANG6 0
|
||||
#endif
|
||||
|
||||
// 32-bit may fail to compile AVX2/3.
|
||||
#if HWY_ARCH_X86_32
|
||||
// X | (X - 1) sets the HWY_AVX2 bit and every bit below it, so this blocklists
// AVX2 together with all targets that have a lower bit value.
#define HWY_BROKEN_32BIT (HWY_AVX2 | (HWY_AVX2 - 1))
|
||||
#else
|
||||
// Not 32-bit x86: nothing is blocklisted for this reason.
#define HWY_BROKEN_32BIT 0
|
||||
#endif
|
||||
|
||||
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
|
||||
#if HWY_COMPILER_MSVC != 0
|
||||
// X | (X - 1) sets the HWY_AVX3 bit and every bit below it, so this blocklists
// AVX3 together with all targets that have a lower bit value.
#define HWY_BROKEN_MSVC (HWY_AVX3 | (HWY_AVX3 - 1))
|
||||
#else
|
||||
// Not MSVC: nothing is blocklisted for this reason.
#define HWY_BROKEN_MSVC 0
|
||||
#endif
|
||||
|
||||
// AVX3_DL and AVX3_ZEN4 require clang >= 7 (ensured above), gcc >= 8.1 or ICC
|
||||
// 2021.
|
||||
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 801) || \
|
||||
(HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
|
||||
#define HWY_BROKEN_AVX3_DL_ZEN4 (HWY_AVX3_DL | HWY_AVX3_ZEN4)
|
||||
#else
|
||||
#define HWY_BROKEN_AVX3_DL_ZEN4 0
|
||||
#endif
|
||||
|
||||
// AVX3_SPR requires clang >= 14, gcc >= 12, or ICC 2021.
|
||||
#if (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1400) || \
|
||||
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200) || \
|
||||
(HWY_COMPILER_ICC && HWY_COMPILER_ICC < 2021)
|
||||
#define HWY_BROKEN_AVX3_SPR (HWY_AVX3_SPR)
|
||||
#else
|
||||
#define HWY_BROKEN_AVX3_SPR 0
|
||||
#endif
|
||||
|
||||
// armv7be has not been tested and is not yet supported.
|
||||
#if HWY_ARCH_ARM_V7 && HWY_IS_BIG_ENDIAN
|
||||
#define HWY_BROKEN_ARM7_BIG_ENDIAN (HWY_NEON | HWY_NEON_WITHOUT_AES)
|
||||
#else
|
||||
#define HWY_BROKEN_ARM7_BIG_ENDIAN 0
|
||||
#endif
|
||||
|
||||
// armv7-a without a detected vfpv4 is not supported
|
||||
// (for example Cortex-A8, Cortex-A9)
|
||||
// vfpv4 always has NEON half-float _and_ FMA.
|
||||
#if HWY_ARCH_ARM_V7 && (__ARM_ARCH_PROFILE == 'A') && \
|
||||
!defined(__ARM_VFPV4__) && \
|
||||
!((__ARM_NEON_FP & 0x2 /* half-float */) && (__ARM_FEATURE_FMA == 1))
|
||||
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 (HWY_NEON | HWY_NEON_WITHOUT_AES)
|
||||
#else
|
||||
#define HWY_BROKEN_ARM7_WITHOUT_VFP4 0
|
||||
#endif
|
||||
|
||||
// SVE[2] require recent clang or gcc versions.
|
||||
#if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
|
||||
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
|
||||
#define HWY_BROKEN_SVE (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
|
||||
#else
|
||||
#define HWY_BROKEN_SVE 0
|
||||
#endif
|
||||
|
||||
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1100)
|
||||
// GCC 10 supports the -mcpu=power10 option but does not support the PPC10
|
||||
// vector intrinsics
|
||||
#define HWY_BROKEN_PPC10 (HWY_PPC10)
|
||||
#elif HWY_ARCH_PPC && HWY_IS_BIG_ENDIAN && \
|
||||
((HWY_COMPILER3_CLANG && HWY_COMPILER3_CLANG < 160001) || \
|
||||
(HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_COMPILER_GCC_ACTUAL <= 1203) || \
|
||||
(HWY_COMPILER_GCC_ACTUAL >= 1300 && HWY_COMPILER_GCC_ACTUAL <= 1301))
|
||||
// GCC 12.0 through 12.3 and GCC 13.0 through 13.1 have a compiler bug where the
|
||||
// vsldoi instruction is sometimes incorrectly optimized out (and this causes
|
||||
// some of the Highway unit tests to fail on big-endian PPC10). Details about
|
||||
// this compiler bug can be found at
|
||||
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069, and this bug will be
|
||||
// fixed in the upcoming GCC 12.4 and 13.2 releases.
|
||||
|
||||
// Clang 16.0.0 and earlier (but not Clang 16.0.1 and later) have a compiler
|
||||
// bug in the LLVM DAGCombiner that causes a zero-extend followed by an
|
||||
// element insert into a vector, followed by a vector shuffle to be incorrectly
|
||||
// optimized on big-endian PPC (and which caused some of the Highway unit tests
|
||||
// to fail on big-endian PPC10).
|
||||
|
||||
// Details about this bug, which has already been fixed in Clang 16.0.1 and
|
||||
// later, can be found at https://github.com/llvm/llvm-project/issues/61315.
|
||||
#define HWY_BROKEN_PPC10 (HWY_PPC10)
|
||||
#else
|
||||
#define HWY_BROKEN_PPC10 0
|
||||
#endif
|
||||
|
||||
// Allow the user to override this without any guarantee of success.
|
||||
#ifndef HWY_BROKEN_TARGETS
|
||||
|
||||
#define HWY_BROKEN_TARGETS \
|
||||
(HWY_BROKEN_CLANG6 | HWY_BROKEN_32BIT | HWY_BROKEN_MSVC | \
|
||||
HWY_BROKEN_AVX3_DL_ZEN4 | HWY_BROKEN_AVX3_SPR | \
|
||||
HWY_BROKEN_ARM7_BIG_ENDIAN | HWY_BROKEN_ARM7_WITHOUT_VFP4 | \
|
||||
HWY_BROKEN_SVE | HWY_BROKEN_PPC10)
|
||||
|
||||
#endif // HWY_BROKEN_TARGETS
|
||||
|
||||
// Enabled means not disabled nor blocklisted.
|
||||
#define HWY_ENABLED(targets) \
|
||||
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
|
||||
|
||||
// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
|
||||
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
|
||||
// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
|
||||
// always be enabled. If 1, we instead choose HWY_SCALAR even without
|
||||
// HWY_COMPILE_ONLY_SCALAR being set.
|
||||
#if !defined(HWY_BROKEN_EMU128) // allow overriding
|
||||
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203) || \
|
||||
defined(HWY_NO_LIBCXX)
|
||||
#define HWY_BROKEN_EMU128 1
|
||||
#else
|
||||
#define HWY_BROKEN_EMU128 0
|
||||
#endif
|
||||
#endif // HWY_BROKEN_EMU128
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Detect baseline targets using predefined macros
|
||||
|
||||
// Baseline means the targets for which the compiler is allowed to generate
|
||||
// instructions, implying the target CPU would have to support them. This does
|
||||
// not take the blocklist into account.
|
||||
|
||||
// Fallback target that is part of the baseline: HWY_SCALAR if the user defined
// HWY_COMPILE_ONLY_SCALAR or HWY_BROKEN_EMU128 is nonzero, otherwise
// HWY_EMU128.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
|
||||
#define HWY_BASELINE_SCALAR HWY_SCALAR
|
||||
#else
|
||||
#define HWY_BASELINE_SCALAR HWY_EMU128
|
||||
#endif
|
||||
|
||||
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
|
||||
// HWY_TARGET == HWY_BASELINE_SCALAR.
|
||||
|
||||
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
|
||||
#if defined(HWY_WANT_WASM2)
|
||||
#define HWY_BASELINE_WASM HWY_WASM_EMU256
|
||||
#else
|
||||
#define HWY_BASELINE_WASM HWY_WASM
|
||||
#endif // HWY_WANT_WASM2
|
||||
#else
|
||||
#define HWY_BASELINE_WASM 0
|
||||
#endif
|
||||
|
||||
// GCC or Clang.
|
||||
#if HWY_ARCH_PPC && HWY_COMPILER_GCC && defined(__ALTIVEC__) && \
|
||||
defined(__VSX__) && defined(__POWER8_VECTOR__) && \
|
||||
(defined(__CRYPTO__) || defined(HWY_DISABLE_PPC8_CRYPTO))
|
||||
#define HWY_BASELINE_PPC8 HWY_PPC8
|
||||
#else
|
||||
#define HWY_BASELINE_PPC8 0
|
||||
#endif
|
||||
|
||||
// PPC9 is only in the baseline if PPC8 already is (HWY_BASELINE_PPC8 != 0) and
// the compiler additionally defines __POWER9_VECTOR__.
#if HWY_BASELINE_PPC8 != 0 && defined(__POWER9_VECTOR__)
|
||||
#define HWY_BASELINE_PPC9 HWY_PPC9
|
||||
#else
|
||||
#define HWY_BASELINE_PPC9 0
|
||||
#endif
|
||||
|
||||
#if HWY_BASELINE_PPC9 != 0 && \
|
||||
(defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
|
||||
#define HWY_BASELINE_PPC10 HWY_PPC10
|
||||
#else
|
||||
#define HWY_BASELINE_PPC10 0
|
||||
#endif
|
||||
|
||||
#define HWY_BASELINE_SVE2 0
|
||||
#define HWY_BASELINE_SVE 0
|
||||
#define HWY_BASELINE_NEON 0
|
||||
|
||||
#if HWY_ARCH_ARM
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE2)
|
||||
#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
|
||||
// If user specified -msve-vector-bits=128, they assert the vector length is
|
||||
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
|
||||
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
|
||||
#define HWY_BASELINE_SVE2 HWY_SVE2_128
|
||||
// Otherwise we're not sure what the vector length will be. The baseline must be
|
||||
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
|
||||
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
|
||||
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
|
||||
#else
|
||||
#define HWY_BASELINE_SVE2 HWY_SVE2
|
||||
#endif // __ARM_FEATURE_SVE_BITS
|
||||
#endif // __ARM_FEATURE_SVE2
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#undef HWY_BASELINE_SVE // was 0, will be re-defined
|
||||
// See above. If user-specified vector length matches our optimization, use it.
|
||||
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
|
||||
#define HWY_BASELINE_SVE HWY_SVE_256
|
||||
#else
|
||||
#define HWY_BASELINE_SVE HWY_SVE
|
||||
#endif // __ARM_FEATURE_SVE_BITS
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#undef HWY_BASELINE_NEON
|
||||
#if defined(__ARM_FEATURE_AES)
|
||||
#define HWY_BASELINE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES)
|
||||
#else
|
||||
#define HWY_BASELINE_NEON (HWY_NEON_WITHOUT_AES)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // HWY_ARCH_ARM
|
||||
|
||||
// Special handling for MSVC because it has fewer predefined macros:
|
||||
#if HWY_COMPILER_MSVC
|
||||
|
||||
#if HWY_ARCH_X86_32
|
||||
#if _M_IX86_FP >= 2
|
||||
#define HWY_CHECK_SSE2 1
|
||||
#else
|
||||
#define HWY_CHECK_SSE2 0
|
||||
#endif
|
||||
#elif HWY_ARCH_X86_64
|
||||
#define HWY_CHECK_SSE2 1
|
||||
#else
|
||||
#define HWY_CHECK_SSE2 0
|
||||
#endif
|
||||
|
||||
// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
|
||||
// https://stackoverflow.com/questions/18563978/.
|
||||
#if defined(__AVX__)
|
||||
#define HWY_CHECK_SSSE3 1
|
||||
#define HWY_CHECK_SSE4 1
|
||||
#else
|
||||
#define HWY_CHECK_SSSE3 0
|
||||
#define HWY_CHECK_SSE4 0
|
||||
#endif
|
||||
|
||||
// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
|
||||
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
|
||||
#define HWY_CHECK_PCLMUL_AES 1
|
||||
#define HWY_CHECK_BMI2_FMA 1
|
||||
#define HWY_CHECK_F16C 1
|
||||
|
||||
#else // non-MSVC
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#define HWY_CHECK_SSE2 1
|
||||
#else
|
||||
#define HWY_CHECK_SSE2 0
|
||||
#endif
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#define HWY_CHECK_SSSE3 1
|
||||
#else
|
||||
#define HWY_CHECK_SSSE3 0
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__) && defined(__SSE4_2__)
|
||||
#define HWY_CHECK_SSE4 1
|
||||
#else
|
||||
#define HWY_CHECK_SSE4 0
|
||||
#endif
|
||||
|
||||
// If these are disabled, they should not gate the availability of SSE4/AVX2.
|
||||
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
|
||||
#define HWY_CHECK_PCLMUL_AES 1
|
||||
#else
|
||||
#define HWY_CHECK_PCLMUL_AES 0
|
||||
#endif
|
||||
|
||||
#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
|
||||
#define HWY_CHECK_BMI2_FMA 1
|
||||
#else
|
||||
#define HWY_CHECK_BMI2_FMA 0
|
||||
#endif
|
||||
|
||||
#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
|
||||
#define HWY_CHECK_F16C 1
|
||||
#else
|
||||
#define HWY_CHECK_F16C 0
|
||||
#endif
|
||||
|
||||
#endif // non-MSVC
|
||||
|
||||
#if HWY_ARCH_X86 && (HWY_WANT_SSE2 || HWY_CHECK_SSE2)
|
||||
#define HWY_BASELINE_SSE2 HWY_SSE2
|
||||
#else
|
||||
#define HWY_BASELINE_SSE2 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
|
||||
#define HWY_BASELINE_SSSE3 HWY_SSSE3
|
||||
#else
|
||||
#define HWY_BASELINE_SSSE3 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
|
||||
#define HWY_BASELINE_SSE4 HWY_SSE4
|
||||
#else
|
||||
#define HWY_BASELINE_SSE4 0
|
||||
#endif
|
||||
|
||||
#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
|
||||
defined(__AVX2__)
|
||||
#define HWY_BASELINE_AVX2 HWY_AVX2
|
||||
#else
|
||||
#define HWY_BASELINE_AVX2 0
|
||||
#endif
|
||||
|
||||
// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
|
||||
#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
|
||||
defined(__AVX512DQ__) && defined(__AVX512VL__)
|
||||
#define HWY_BASELINE_AVX3 HWY_AVX3
|
||||
#else
|
||||
#define HWY_BASELINE_AVX3 0
|
||||
#endif
|
||||
|
||||
// TODO(janwas): not yet known whether these will be set by MSVC
|
||||
#if HWY_BASELINE_AVX3 != 0 && defined(__AVX512VNNI__) && defined(__VAES__) && \
|
||||
defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
|
||||
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
|
||||
defined(__AVX512BITALG__)
|
||||
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
|
||||
#else
|
||||
#define HWY_BASELINE_AVX3_DL 0
|
||||
#endif
|
||||
|
||||
// The ZEN4-optimized AVX3 target is numerically lower than AVX3_DL and is thus
|
||||
// considered better. Do not enable it unless the user explicitly requests it -
|
||||
// we do not want to choose the ZEN4 path on Intel because it could be slower.
|
||||
#if defined(HWY_WANT_AVX3_ZEN4) && HWY_BASELINE_AVX3_DL != 0
|
||||
#define HWY_BASELINE_AVX3_ZEN4 HWY_AVX3_ZEN4
|
||||
#else
|
||||
#define HWY_BASELINE_AVX3_ZEN4 0
|
||||
#endif
|
||||
|
||||
// AVX3_SPR is only in the baseline if AVX3_DL already is
// (HWY_BASELINE_AVX3_DL != 0) and the compiler additionally defines
// __AVX512FP16__.
#if HWY_BASELINE_AVX3_DL != 0 && defined(__AVX512FP16__)
|
||||
#define HWY_BASELINE_AVX3_SPR HWY_AVX3_SPR
|
||||
#else
|
||||
#define HWY_BASELINE_AVX3_SPR 0
|
||||
#endif
|
||||
|
||||
// RVV requires intrinsics 0.11 or later, see #1156.
|
||||
#if HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && __riscv_v_intrinsic >= 11000
|
||||
#define HWY_BASELINE_RVV HWY_RVV
|
||||
#else
|
||||
#define HWY_BASELINE_RVV 0
|
||||
#endif
|
||||
|
||||
// Allow the user to override this without any guarantee of success.
|
||||
#ifndef HWY_BASELINE_TARGETS
|
||||
#define HWY_BASELINE_TARGETS \
|
||||
(HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
|
||||
HWY_BASELINE_PPC9 | HWY_BASELINE_PPC10 | HWY_BASELINE_SVE2 | \
|
||||
HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE2 | \
|
||||
HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
|
||||
HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_AVX3_ZEN4 | \
|
||||
HWY_BASELINE_AVX3_SPR | HWY_BASELINE_RVV)
|
||||
#endif // HWY_BASELINE_TARGETS
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Choose target for static dispatch
|
||||
|
||||
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
|
||||
#if HWY_ENABLED_BASELINE == 0
|
||||
#error "At least one baseline target must be defined and enabled"
|
||||
#endif
|
||||
|
||||
// Best baseline, used for static dispatch. This is the least-significant 1-bit
|
||||
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
|
||||
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
|
||||
|
||||
// Start by assuming static dispatch. If we later use dynamic dispatch, this
|
||||
// will be defined to other targets during the multiple-inclusion, and finally
|
||||
// return to the initial value. Defining this outside begin/end_target ensures
|
||||
// inl headers successfully compile by themselves (required by Bazel).
|
||||
#define HWY_TARGET HWY_STATIC_TARGET
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Choose targets for dynamic dispatch according to one of four policies
|
||||
|
||||
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
|
||||
defined(HWY_COMPILE_ONLY_STATIC))
|
||||
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
|
||||
#endif
|
||||
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
|
||||
|
||||
// Clang, GCC and MSVC allow runtime dispatch on x86.
|
||||
#if HWY_ARCH_X86
|
||||
#define HWY_HAVE_RUNTIME_DISPATCH 1
|
||||
// On Arm/PPC, currently only GCC does, and we require Linux to detect CPU
|
||||
// capabilities.
|
||||
#elif (HWY_ARCH_ARM || HWY_ARCH_PPC) && HWY_COMPILER_GCC_ACTUAL && \
|
||||
HWY_OS_LINUX && !defined(TOOLCHAIN_MISS_SYS_AUXV_H)
|
||||
#define HWY_HAVE_RUNTIME_DISPATCH 1
|
||||
#else
|
||||
#define HWY_HAVE_RUNTIME_DISPATCH 0
|
||||
#endif
|
||||
|
||||
// AVX3_DL is not widely available yet. To reduce code size and compile time,
|
||||
// only include it in the set of attainable targets (for dynamic dispatch) if
|
||||
// the user opts in, OR it is in the baseline (we check whether enabled below).
|
||||
#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
|
||||
#define HWY_ATTAINABLE_AVX3_DL (HWY_AVX3_DL)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_AVX3_DL 0
|
||||
#endif
|
||||
|
||||
// NEON targets that may be compiled for dynamic dispatch:
// - A64 with runtime dispatch: both NEON variants (with and without AES).
// - Any other Arm configuration: only what is already in the NEON baseline.
// - Non-Arm: none.
#if HWY_ARCH_ARM_A64 && HWY_HAVE_RUNTIME_DISPATCH
|
||||
#define HWY_ATTAINABLE_NEON (HWY_NEON | HWY_NEON_WITHOUT_AES)
|
||||
#elif HWY_ARCH_ARM  // static dispatch, or HWY_ARCH_ARM_V7
|
||||
#define HWY_ATTAINABLE_NEON (HWY_BASELINE_NEON)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_NEON 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
|
||||
(HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
|
||||
#define HWY_ATTAINABLE_SVE (HWY_SVE | HWY_SVE_256)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_SVE 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
|
||||
(HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
|
||||
#define HWY_ATTAINABLE_SVE2 (HWY_SVE2 | HWY_SVE2_128)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_SVE2 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_PPC && defined(__ALTIVEC__) && \
|
||||
(!HWY_COMPILER_CLANG || HWY_BASELINE_PPC8 != 0)
|
||||
#define HWY_ATTAINABLE_PPC (HWY_PPC8 | HWY_PPC9 | HWY_PPC10)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_PPC 0
|
||||
#endif
|
||||
|
||||
// Attainable means enabled and the compiler allows intrinsics (even when not
|
||||
// allowed to autovectorize). Used by dispatch policies 3) and 4) below.
|
||||
#if HWY_ARCH_X86
|
||||
#define HWY_ATTAINABLE_TARGETS \
|
||||
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | \
|
||||
HWY_AVX2 | HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL | HWY_AVX3_ZEN4 | \
|
||||
HWY_AVX3_SPR)
|
||||
#elif HWY_ARCH_ARM
|
||||
#define HWY_ATTAINABLE_TARGETS \
|
||||
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_NEON | HWY_ATTAINABLE_SVE | \
|
||||
HWY_ATTAINABLE_SVE2)
|
||||
#elif HWY_ARCH_PPC
|
||||
#define HWY_ATTAINABLE_TARGETS \
|
||||
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_ATTAINABLE_PPC)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_TARGETS (HWY_ENABLED_BASELINE)
|
||||
#endif // HWY_ARCH_*
|
||||
|
||||
// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
|
||||
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
|
||||
#undef HWY_STATIC_TARGET
|
||||
#define HWY_STATIC_TARGET HWY_EMU128 // override baseline
|
||||
#define HWY_TARGETS HWY_EMU128
|
||||
|
||||
// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
|
||||
// we currently still support it for backwards compatibility.
|
||||
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
|
||||
(defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
|
||||
#undef HWY_STATIC_TARGET
|
||||
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
|
||||
#define HWY_TARGETS HWY_SCALAR
|
||||
|
||||
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
|
||||
#elif defined(HWY_COMPILE_ONLY_STATIC)
|
||||
#define HWY_TARGETS HWY_STATIC_TARGET
|
||||
|
||||
// 3) For tests: include all attainable targets (in particular: scalar)
|
||||
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
|
||||
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
|
||||
|
||||
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
|
||||
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
|
||||
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
|
||||
// sets all lower bits (better targets), then we also include the static target.
|
||||
#else
|
||||
#define HWY_TARGETS \
|
||||
(HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
|
||||
|
||||
#endif // target policy
|
||||
|
||||
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
|
||||
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
|
||||
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
|
||||
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
|
||||
#error "Logic error: best baseline should be included in dynamic targets"
|
||||
#endif
|
||||
|
||||
#endif // HIGHWAY_HWY_DETECT_TARGETS_H_
|
340
deps/highway/include/hwy/foreach_target.h
vendored
Normal file
340
deps/highway/include/hwy/foreach_target.h
vendored
Normal file
|
@ -0,0 +1,340 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
|
||||
#define HIGHWAY_HWY_FOREACH_TARGET_H_
|
||||
|
||||
// Re-includes the translation unit zero or more times to compile for any
|
||||
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
|
||||
// highway.h defines the corresponding macro/namespace.
|
||||
|
||||
#include "hwy/detect_targets.h"
|
||||
|
||||
// *_inl.h may include other headers, which requires include guards to prevent
|
||||
// repeated inclusion. The guards must be reset after compiling each target, so
|
||||
// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
|
||||
// defining it if undefined and vice versa. This macro is initially undefined
|
||||
// so that IDEs don't gray out the contents of each header.
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#error "This macro must not be defined outside foreach_target.h"
|
||||
#endif
|
||||
|
||||
#ifdef HWY_HIGHWAY_INCLUDED // highway.h include guard
|
||||
// Trigger fixup at the bottom of this header.
|
||||
#define HWY_ALREADY_INCLUDED
|
||||
|
||||
// The next highway.h must re-include set_macros-inl.h because the first
|
||||
// highway.h chose the static target instead of what we will set below.
|
||||
#undef HWY_SET_MACROS_PER_TARGET
|
||||
#endif
|
||||
|
||||
// Disable HWY_EXPORT in user code until we have generated all targets. Note
|
||||
// that a subsequent highway.h will not override this definition.
|
||||
#undef HWY_ONCE
|
||||
#define HWY_ONCE (0 || HWY_IDE)
|
||||
|
||||
// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
|
||||
// also skip if only 1 target defined (no re-inclusion will be necessary).
|
||||
#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
|
||||
|
||||
#if !defined(HWY_TARGET_INCLUDE)
|
||||
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
|
||||
#endif
|
||||
|
||||
// ------------------------------ HWY_ARCH_X86
|
||||
|
||||
// Per-target pattern (repeated below for every target): compile HWY_SSE2 only
// if it is among HWY_TARGETS and is not HWY_STATIC_TARGET, whose re-include is
// skipped.
#if (HWY_TARGETS & HWY_SSE2) && (HWY_STATIC_TARGET != HWY_SSE2)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SSE2
|
||||
// Re-include the file named by HWY_TARGET_INCLUDE with HWY_TARGET set to this
// target.
#include HWY_TARGET_INCLUDE
|
||||
// Flip HWY_TARGET_TOGGLE (define it if undefined, and vice versa) so that the
// include guards of *_inl.h headers are reset before the next target.
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SSSE3
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SSE4
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX2
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX3
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX3_DL
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX3_ZEN4) && (HWY_STATIC_TARGET != HWY_AVX3_ZEN4)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX3_ZEN4
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX3_SPR) && (HWY_STATIC_TARGET != HWY_AVX3_SPR)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX3_SPR
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ------------------------------ HWY_ARCH_ARM
|
||||
|
||||
#if (HWY_TARGETS & HWY_NEON_WITHOUT_AES) && \
|
||||
(HWY_STATIC_TARGET != HWY_NEON_WITHOUT_AES)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_NEON_WITHOUT_AES
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_NEON
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE2
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE_256
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE2_128
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ------------------------------ HWY_ARCH_WASM
|
||||
|
||||
#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_WASM_EMU256
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_WASM
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ------------------------------ HWY_ARCH_PPC
|
||||
|
||||
#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_PPC8
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_PPC9) && (HWY_STATIC_TARGET != HWY_PPC9)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_PPC9
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_PPC10) && (HWY_STATIC_TARGET != HWY_PPC10)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_PPC10
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ------------------------------ HWY_ARCH_RVV
|
||||
|
||||
#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_RVV
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ------------------------------ Scalar
|
||||
|
||||
#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_EMU128
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SCALAR
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
|
||||
|
||||
// Now that all but the static target have been generated, re-enable HWY_EXPORT.
|
||||
#undef HWY_ONCE
|
||||
#define HWY_ONCE 1
|
||||
|
||||
// If we re-include once per enabled target, the translation unit's
|
||||
// implementation would have to be skipped via #if to avoid redefining symbols.
|
||||
// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
|
||||
// implementation when resuming compilation of the translation unit.
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_STATIC_TARGET
|
||||
|
||||
#ifdef HWY_ALREADY_INCLUDED
|
||||
// Revert the previous toggle to prevent redefinitions for the static target.
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
|
||||
// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
|
||||
#ifdef HWY_SET_MACROS_PER_TARGET
|
||||
#undef HWY_SET_MACROS_PER_TARGET
|
||||
#else
|
||||
#define HWY_SET_MACROS_PER_TARGET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // HIGHWAY_HWY_FOREACH_TARGET_H_
|
435
deps/highway/include/hwy/highway.h
vendored
Normal file
435
deps/highway/include/hwy/highway.h
vendored
Normal file
|
@ -0,0 +1,435 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Main header required before using vector types.
|
||||
|
||||
// IWYU pragma: begin_exports
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/detect_compiler_arch.h"
|
||||
#include "hwy/highway_export.h"
|
||||
#include "hwy/targets.h"
|
||||
// IWYU pragma: end_exports
|
||||
|
||||
// This include guard is checked by foreach_target, so avoid the usual _H_
|
||||
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
|
||||
// after/outside this include guard.
|
||||
#ifndef HWY_HIGHWAY_INCLUDED
|
||||
#define HWY_HIGHWAY_INCLUDED
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
|
||||
#define HWY_MAJOR 1
|
||||
#define HWY_MINOR 0
|
||||
#define HWY_PATCH 6
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
|
||||
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
|
||||
// HWY_CAPPED(T, N).
|
||||
|
||||
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
|
||||
// registers in the group, and is ignored on targets that do not support groups.
|
||||
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
|
||||
#define HWY_FULL2(T, LMUL) \
|
||||
hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
|
||||
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
|
||||
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
|
||||
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
|
||||
// Trailing comma avoids -pedantic false alarm
|
||||
#define HWY_CHOOSE_FULL(...) \
|
||||
HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
|
||||
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
|
||||
|
||||
// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
|
||||
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Export user functions for static/dynamic dispatch
|
||||
|
||||
// Evaluates to 0 inside a translation unit if it is generating anything but the
|
||||
// static target (the last one if multiple targets are enabled). Used to prevent
|
||||
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
|
||||
// compile once anyway, so this is 1 unless it is or has been included.
|
||||
#ifndef HWY_ONCE
|
||||
#define HWY_ONCE 1
|
||||
#endif
|
||||
|
||||
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
|
||||
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
|
||||
// defined), and can be used to deduce the return type of Choose*.
|
||||
#if HWY_STATIC_TARGET == HWY_SCALAR
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_EMU128
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_RVV
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_WASM
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_NEON
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SVE
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SVE2
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SVE_256
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SVE2_128
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_PPC8
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_PPC9
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_PPC10
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SSE2
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SSSE3
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SSE4
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_AVX2
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_AVX3
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_AVX3_SPR
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME
|
||||
#endif
|
||||
|
||||
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
|
||||
// nullptr if that target was not compiled.
|
||||
#if HWY_TARGETS & HWY_EMU128
|
||||
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
|
||||
#elif HWY_TARGETS & HWY_SCALAR
|
||||
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
|
||||
#else
|
||||
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
|
||||
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
|
||||
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_WASM_EMU256
|
||||
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_WASM
|
||||
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_RVV
|
||||
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_NEON_WITHOUT_AES
|
||||
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_NEON
|
||||
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SVE
|
||||
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SVE2
|
||||
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SVE_256
|
||||
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SVE2_128
|
||||
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_PPC8
|
||||
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_PPC9
|
||||
#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_PPC10
|
||||
#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SSE2
|
||||
#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SSSE3
|
||||
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SSE4
|
||||
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_AVX2
|
||||
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_AVX3
|
||||
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_AVX3_DL
|
||||
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_AVX3_ZEN4
|
||||
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_AVX3_SPR
|
||||
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
|
||||
// apparently cannot be an array. Use a function pointer instead, which has the
|
||||
// disadvantage that we call the static (not best) target on the first call to
|
||||
// any HWY_DYNAMIC_DISPATCH.
|
||||
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
|
||||
#define HWY_DISPATCH_WORKAROUND 1
|
||||
#else
|
||||
#define HWY_DISPATCH_WORKAROUND 0
|
||||
#endif
|
||||
|
||||
// Provides a static member function which is what is called during the first
|
||||
// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
|
||||
// this function are the first entry in the tables created by HWY_EXPORT.
|
||||
template <typename RetType, typename... Args>
|
||||
struct FunctionCache {
|
||||
public:
|
||||
typedef RetType(FunctionType)(Args...);
|
||||
|
||||
#if HWY_DISPATCH_WORKAROUND
|
||||
template <FunctionType* const func>
|
||||
static RetType ChooseAndCall(Args... args) {
|
||||
ChosenTarget& chosen_target = GetChosenTarget();
|
||||
chosen_target.Update(SupportedTargets());
|
||||
return (*func)(args...);
|
||||
}
|
||||
#else
|
||||
// A template function that when instantiated has the same signature as the
|
||||
// function being called. This function initializes the bit array of targets
|
||||
// supported by the current CPU and then calls the appropriate entry within
|
||||
// the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
|
||||
// exported functions, even those defined by different translation units,
|
||||
// will dispatch directly to the best available target.
|
||||
template <FunctionType* const table[]>
|
||||
static RetType ChooseAndCall(Args... args) {
|
||||
ChosenTarget& chosen_target = GetChosenTarget();
|
||||
chosen_target.Update(SupportedTargets());
|
||||
return (table[chosen_target.GetIndex()])(args...);
|
||||
}
|
||||
#endif // HWY_DISPATCH_WORKAROUND
|
||||
};
|
||||
|
||||
// Used to deduce the template parameters RetType and Args from a function.
|
||||
template <typename RetType, typename... Args>
|
||||
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
|
||||
return FunctionCache<RetType, Args...>();
|
||||
}
|
||||
|
||||
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
|
||||
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
|
||||
|
||||
// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
|
||||
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
|
||||
// static array must be defined at the same namespace level as the function
|
||||
// it is exporting.
|
||||
// After being exported, it can be called from other parts of the same source
|
||||
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
|
||||
// like in the following example:
|
||||
//
|
||||
// #include "hwy/highway.h"
|
||||
// HWY_BEFORE_NAMESPACE();
|
||||
// namespace skeleton {
|
||||
// namespace HWY_NAMESPACE {
|
||||
//
|
||||
// void MyFunction(int a, char b, const char* c) { ... }
|
||||
//
|
||||
// // NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
// } // namespace HWY_NAMESPACE
|
||||
// } // namespace skeleton
|
||||
// HWY_AFTER_NAMESPACE();
|
||||
//
|
||||
// namespace skeleton {
|
||||
// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
|
||||
//
|
||||
// void MyFunction(int a, char b, const char* c) {
|
||||
// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
|
||||
// }
|
||||
// } // namespace skeleton
|
||||
//
|
||||
|
||||
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
|
||||
|
||||
// Simplified version for IDE or the dynamic dispatch case with only one target.
|
||||
// This case still uses a table, although of a single element, to provide the
|
||||
// same compile error conditions as with the dynamic dispatch case when multiple
|
||||
// targets are being compiled.
|
||||
#define HWY_EXPORT(FUNC_NAME) \
|
||||
HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
|
||||
HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
|
||||
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
|
||||
#define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
|
||||
|
||||
#else
|
||||
|
||||
// Simplified version for MSVC 2017: function pointer instead of table.
|
||||
#if HWY_DISPATCH_WORKAROUND
|
||||
|
||||
#define HWY_EXPORT(FUNC_NAME) \
|
||||
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
|
||||
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
|
||||
/* The first entry in the table initializes the global cache and \
|
||||
* calls the function from HWY_STATIC_TARGET. */ \
|
||||
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
|
||||
FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
|
||||
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
|
||||
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Dynamic dispatch case with one entry per dynamic target plus the fallback
|
||||
// target and the initialization wrapper.
|
||||
#define HWY_EXPORT(FUNC_NAME) \
|
||||
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
|
||||
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
|
||||
/* The first entry in the table initializes the global cache and \
|
||||
* calls the appropriate function. */ \
|
||||
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
|
||||
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
|
||||
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
|
||||
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
|
||||
}
|
||||
|
||||
#endif // HWY_DISPATCH_WORKAROUND
|
||||
|
||||
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
|
||||
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
|
||||
#define HWY_DYNAMIC_POINTER(FUNC_NAME) \
|
||||
(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])
|
||||
|
||||
#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
|
||||
|
||||
// DEPRECATED names; please use HWY_HAVE_* instead.
|
||||
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
|
||||
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
|
||||
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_HIGHWAY_INCLUDED
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
|
||||
// to include them once per target, which is ensured by the toggle check.
|
||||
// Because ops/*.h are included under it, they do not need their own guard.
|
||||
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HWY_HIGHWAY_PER_TARGET
|
||||
#undef HWY_HIGHWAY_PER_TARGET
|
||||
#else
|
||||
#define HWY_HIGHWAY_PER_TARGET
|
||||
#endif
|
||||
|
||||
// These define ops inside namespace hwy::HWY_NAMESPACE.
|
||||
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
|
||||
#include "hwy/ops/x86_128-inl.h"
|
||||
#elif HWY_TARGET == HWY_AVX2
|
||||
#include "hwy/ops/x86_256-inl.h"
|
||||
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
|
||||
HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
|
||||
#include "hwy/ops/x86_512-inl.h"
|
||||
#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
|
||||
HWY_TARGET == HWY_PPC10
|
||||
#include "hwy/ops/ppc_vsx-inl.h"
|
||||
#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
|
||||
#include "hwy/ops/arm_neon-inl.h"
|
||||
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
|
||||
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
|
||||
#include "hwy/ops/arm_sve-inl.h"
|
||||
#elif HWY_TARGET == HWY_WASM_EMU256
|
||||
#include "hwy/ops/wasm_256-inl.h"
|
||||
#elif HWY_TARGET == HWY_WASM
|
||||
#include "hwy/ops/wasm_128-inl.h"
|
||||
#elif HWY_TARGET == HWY_RVV
|
||||
#include "hwy/ops/rvv-inl.h"
|
||||
#elif HWY_TARGET == HWY_EMU128
|
||||
#include "hwy/ops/emu128-inl.h"
|
||||
#elif HWY_TARGET == HWY_SCALAR
|
||||
#include "hwy/ops/scalar-inl.h"
|
||||
#else
|
||||
#pragma message("HWY_TARGET does not match any known target")
|
||||
#endif // HWY_TARGET
|
||||
|
||||
#include "hwy/ops/generic_ops-inl.h"
|
||||
|
||||
#endif // HWY_HIGHWAY_PER_TARGET
|
74
deps/highway/include/hwy/highway_export.h
vendored
Normal file
74
deps/highway/include/hwy/highway_export.h
vendored
Normal file
|
@ -0,0 +1,74 @@
|
|||
// Pseudo-generated file to handle both cmake & bazel build system.
|
||||
|
||||
// Initial generation done using cmake code:
|
||||
// include(GenerateExportHeader)
|
||||
// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
|
||||
// hwy/highway_export.h)
|
||||
// code reformatted using clang-format --style=Google
|
||||
|
||||
#ifndef HWY_DLLEXPORT_H
|
||||
#define HWY_DLLEXPORT_H
|
||||
|
||||
#if !defined(HWY_SHARED_DEFINE)
|
||||
#define HWY_DLLEXPORT
|
||||
#define HWY_CONTRIB_DLLEXPORT
|
||||
#define HWY_TEST_DLLEXPORT
|
||||
#else // !HWY_SHARED_DEFINE
|
||||
|
||||
#ifndef HWY_DLLEXPORT
|
||||
#if defined(hwy_EXPORTS)
|
||||
/* We are building this library */
|
||||
#ifdef _WIN32
|
||||
#define HWY_DLLEXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define HWY_DLLEXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#else // defined(hwy_EXPORTS)
|
||||
/* We are using this library */
|
||||
#ifdef _WIN32
|
||||
#define HWY_DLLEXPORT __declspec(dllimport)
|
||||
#else
|
||||
#define HWY_DLLEXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#endif // defined(hwy_EXPORTS)
|
||||
#endif // HWY_DLLEXPORT
|
||||
|
||||
#ifndef HWY_CONTRIB_DLLEXPORT
|
||||
#if defined(hwy_contrib_EXPORTS)
|
||||
/* We are building this library */
|
||||
#ifdef _WIN32
|
||||
#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#else // defined(hwy_contrib_EXPORTS)
|
||||
/* We are using this library */
|
||||
#ifdef _WIN32
|
||||
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
|
||||
#else
|
||||
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#endif // defined(hwy_contrib_EXPORTS)
|
||||
#endif // HWY_CONTRIB_DLLEXPORT
|
||||
|
||||
#ifndef HWY_TEST_DLLEXPORT
|
||||
#if defined(hwy_test_EXPORTS)
|
||||
/* We are building this library */
|
||||
#ifdef _WIN32
|
||||
#define HWY_TEST_DLLEXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#else // defined(hwy_test_EXPORTS)
|
||||
/* We are using this library */
|
||||
#ifdef _WIN32
|
||||
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
|
||||
#else
|
||||
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#endif // defined(hwy_test_EXPORTS)
|
||||
#endif // HWY_TEST_DLLEXPORT
|
||||
|
||||
#endif // !HWY_SHARED_DEFINE
|
||||
|
||||
#endif /* HWY_DLLEXPORT_H */
|
171
deps/highway/include/hwy/nanobenchmark.h
vendored
Normal file
171
deps/highway/include/hwy/nanobenchmark.h
vendored
Normal file
|
@ -0,0 +1,171 @@
|
|||
// Copyright 2019 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
|
||||
#define HIGHWAY_HWY_NANOBENCHMARK_H_
|
||||
|
||||
// Benchmarks functions of a single integer argument with realistic branch
|
||||
// prediction hit rates. Uses a robust estimator to summarize the measurements.
|
||||
// The precision is about 0.2%.
|
||||
//
|
||||
// Examples: see nanobenchmark_test.cc.
|
||||
//
|
||||
// Background: Microbenchmarks such as http://github.com/google/benchmark
|
||||
// can measure elapsed times on the order of a microsecond. Shorter functions
|
||||
// are typically measured by repeating them thousands of times and dividing
|
||||
// the total elapsed time by this count. Unfortunately, repetition (especially
|
||||
// with the same input parameter!) influences the runtime. In time-critical
|
||||
// code, it is reasonable to expect warm instruction/data caches and TLBs,
|
||||
// but a perfect record of which branches will be taken is unrealistic.
|
||||
// Unless the application also repeatedly invokes the measured function with
|
||||
// the same parameter, the benchmark is measuring something very different -
|
||||
// a best-case result, almost as if the parameter were made a compile-time
|
||||
// constant. This may lead to erroneous conclusions about branch-heavy
|
||||
// algorithms outperforming branch-free alternatives.
|
||||
//
|
||||
// Our approach differs in three ways. Adding fences to the timer functions
|
||||
// reduces variability due to instruction reordering, improving the timer
|
||||
// resolution to about 40 CPU cycles. However, shorter functions must still
|
||||
// be invoked repeatedly. For more realistic branch prediction performance,
|
||||
// we vary the input parameter according to a user-specified distribution.
|
||||
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
|
||||
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
|
||||
// central tendency of the measurement samples with the "half sample mode",
|
||||
// which is more robust to outliers and skewed data than the mean or median.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hwy/highway_export.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
// Enables sanity checks that verify correct operation at the cost of
|
||||
// longer benchmark runs.
|
||||
#ifndef NANOBENCHMARK_ENABLE_CHECKS
|
||||
#define NANOBENCHMARK_ENABLE_CHECKS 0
|
||||
#endif
|
||||
|
||||
// Aborts with a diagnostic on stderr if `condition` is false. The expansion is
// a single do/while(0) statement, so the macro followed by a semicolon can be
// used wherever one statement is expected, including as the unbraced body of
// an if that has an else. Expansion sites need fprintf/stderr (<stdio.h>) and
// abort (<stdlib.h>) to be declared.
#define NANOBENCHMARK_CHECK_ALWAYS(condition)                               \
  do {                                                                      \
    if (!(condition)) {                                                     \
      fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
      abort();                                                              \
    }                                                                       \
  } while (0)
|
||||
|
||||
#if NANOBENCHMARK_ENABLE_CHECKS
|
||||
#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
|
||||
#else
|
||||
#define NANOBENCHMARK_CHECK(condition)
|
||||
#endif
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Returns 1, but without the compiler knowing what the value is. This prevents
|
||||
// optimizing out code.
|
||||
HWY_DLLEXPORT int Unpredictable1();
|
||||
|
||||
// Input influencing the function being measured (e.g. number of bytes to copy).
|
||||
using FuncInput = size_t;
|
||||
|
||||
// "Proof of work" returned by Func to ensure the compiler does not elide it.
|
||||
using FuncOutput = uint64_t;
|
||||
|
||||
// Function to measure: either 1) a captureless lambda or function with two
|
||||
// arguments or 2) a lambda with capture, in which case the first argument
|
||||
// is reserved for use by MeasureClosure.
|
||||
using Func = FuncOutput (*)(const void*, FuncInput);
|
||||
|
||||
// Tuning knobs that trade measuring time against precision/resolution.
struct Params {
  // Divisor of the timer resolution that sets the best-case precision.
  // Raising it means more calls to Func and higher precision.
  size_t precision_divisor{1024};

  // Size of the full input distribution relative to the subset. Must be at
  // least 2; larger values take longer to measure but model the given input
  // distribution more faithfully.
  size_t subset_ratio{2};

  // Combined with the estimated duration of Func, decides how often Func is
  // called before sample variability is checked. Raising it increases
  // measuring time, memory/cache use and precision.
  double seconds_per_eval{4E-3};

  // Lower bound on the sample count before the central tendency is estimated.
  size_t min_samples_per_eval{7};

  // For skewed/fat-tailed distributions the mode estimates the central
  // tendency better than the median, but only with enough samples relative to
  // the width of half-ranges.
  size_t min_mode_samples{64};

  // Largest acceptable variability (= median absolute deviation / center).
  double target_rel_mad{0.002};

  // Give up after this many evals if target_rel_mad was not reached, which
  // rules out infinite loops.
  size_t max_evals{9};

  // Whether additional statistics are printed to stdout.
  bool verbose{true};
};
|
||||
|
||||
// Measurement result for each unique input.
|
||||
struct Result {
|
||||
FuncInput input;
|
||||
|
||||
// Robust estimate (mode or median) of duration.
|
||||
float ticks;
|
||||
|
||||
// Measure of variability (median absolute deviation relative to "ticks").
|
||||
float variability;
|
||||
};
|
||||
|
||||
// Precisely measures the number of ticks elapsed when calling "func" with the
|
||||
// given inputs, shuffled to ensure realistic branch prediction hit rates.
|
||||
//
|
||||
// "func" returns a 'proof of work' to ensure its computations are not elided.
|
||||
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
|
||||
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
|
||||
// "func". The values should be chosen to maximize coverage of "func". This
|
||||
// represents a distribution, so a value's frequency should reflect its
|
||||
// probability in the real application. Order does not matter; for example, a
|
||||
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
|
||||
// Returns how many Result were written to "results": one per unique input, or
|
||||
// zero if the measurement failed (an error message goes to stderr).
|
||||
HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg,
|
||||
const FuncInput* inputs, size_t num_inputs,
|
||||
Result* results, const Params& p = Params());
|
||||
|
||||
// Calls operator() of the given closure (lambda function).
|
||||
template <class Closure>
|
||||
static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
|
||||
return (*f)(input);
|
||||
}
|
||||
|
||||
// Same as Measure, except "closure" is typically a lambda function of
|
||||
// FuncInput -> FuncOutput with a capture list.
|
||||
template <class Closure>
|
||||
static inline size_t MeasureClosure(const Closure& closure,
|
||||
const FuncInput* inputs,
|
||||
const size_t num_inputs, Result* results,
|
||||
const Params& p = Params()) {
|
||||
return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
|
||||
reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
|
||||
results, p);
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_NANOBENCHMARK_H_
|
8625
deps/highway/include/hwy/ops/arm_neon-inl.h
vendored
Normal file
8625
deps/highway/include/hwy/ops/arm_neon-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
5050
deps/highway/include/hwy/ops/arm_sve-inl.h
vendored
Normal file
5050
deps/highway/include/hwy/ops/arm_sve-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
2728
deps/highway/include/hwy/ops/emu128-inl.h
vendored
Normal file
2728
deps/highway/include/hwy/ops/emu128-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
4596
deps/highway/include/hwy/ops/generic_ops-inl.h
vendored
Normal file
4596
deps/highway/include/hwy/ops/generic_ops-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
5339
deps/highway/include/hwy/ops/ppc_vsx-inl.h
vendored
Normal file
5339
deps/highway/include/hwy/ops/ppc_vsx-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
4887
deps/highway/include/hwy/ops/rvv-inl.h
vendored
Normal file
4887
deps/highway/include/hwy/ops/rvv-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
1921
deps/highway/include/hwy/ops/scalar-inl.h
vendored
Normal file
1921
deps/highway/include/hwy/ops/scalar-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
578
deps/highway/include/hwy/ops/set_macros-inl.h
vendored
Normal file
578
deps/highway/include/hwy/ops/set_macros-inl.h
vendored
Normal file
|
@ -0,0 +1,578 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Sets macros based on HWY_TARGET.
|
||||
|
||||
// This include guard is toggled by foreach_target, so avoid the usual _H_
|
||||
// suffix to prevent copybara from renaming it.
|
||||
#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HWY_SET_MACROS_PER_TARGET
|
||||
#undef HWY_SET_MACROS_PER_TARGET
|
||||
#else
|
||||
#define HWY_SET_MACROS_PER_TARGET
|
||||
#endif
|
||||
|
||||
#endif // HWY_SET_MACROS_PER_TARGET
|
||||
|
||||
#include "hwy/detect_compiler_arch.h" // IWYU: export
|
||||
#include "hwy/detect_targets.h" // IWYU: export
|
||||
|
||||
#undef HWY_NAMESPACE
|
||||
#undef HWY_ALIGN
|
||||
#undef HWY_MAX_BYTES
|
||||
#undef HWY_LANES
|
||||
|
||||
#undef HWY_HAVE_SCALABLE
|
||||
#undef HWY_HAVE_TUPLE
|
||||
#undef HWY_HAVE_INTEGER64
|
||||
#undef HWY_HAVE_FLOAT16
|
||||
#undef HWY_HAVE_FLOAT64
|
||||
#undef HWY_MEM_OPS_MIGHT_FAULT
|
||||
#undef HWY_NATIVE_FMA
|
||||
#undef HWY_CAP_GE256
|
||||
#undef HWY_CAP_GE512
|
||||
|
||||
// Supported on all targets except RVV (requires GCC 14 or upcoming Clang)
|
||||
#if HWY_TARGET == HWY_RVV && \
|
||||
((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
|
||||
(HWY_COMPILER_CLANG))
|
||||
#define HWY_HAVE_TUPLE 0
|
||||
#else
|
||||
#define HWY_HAVE_TUPLE 1
|
||||
#endif
|
||||
|
||||
// For internal use (clamping/validating N for Simd<>)
|
||||
#undef HWY_MAX_N
|
||||
#if HWY_TARGET == HWY_SCALAR
|
||||
#define HWY_MAX_N 1
|
||||
#else
|
||||
#define HWY_MAX_N 65536
|
||||
#endif
|
||||
|
||||
// For internal use (clamping kPow2 for Simd<>)
|
||||
#undef HWY_MAX_POW2
|
||||
// For HWY_TARGET == HWY_RVV, LMUL <= 8. Even on other targets, we want to
|
||||
// support say Rebind<uint64_t, Simd<uint8_t, 1, 0>> d; whose kPow2 is also 3.
|
||||
// However, those other targets do not actually support multiple vectors, and
|
||||
// thus Lanes(d) must not exceed Lanes(ScalableTag<T>()).
|
||||
#define HWY_MAX_POW2 3
|
||||
|
||||
// User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >>
|
||||
// (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions.
|
||||
#undef HWY_MIN_POW2
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
#define HWY_MIN_POW2 -16
|
||||
#else
|
||||
// Tighter bound for other targets, whose vectors are smaller, to potentially
|
||||
// save compile time.
|
||||
#define HWY_MIN_POW2 -8
|
||||
#endif // HWY_TARGET == HWY_RVV
|
||||
|
||||
#undef HWY_TARGET_STR
|
||||
|
||||
#if defined(HWY_DISABLE_PCLMUL_AES)
|
||||
#define HWY_TARGET_STR_PCLMUL_AES ""
|
||||
#else
|
||||
#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
|
||||
#endif
|
||||
|
||||
#if defined(HWY_DISABLE_BMI2_FMA)
|
||||
#define HWY_TARGET_STR_BMI2_FMA ""
|
||||
#else
|
||||
#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
|
||||
#endif
|
||||
|
||||
#if defined(HWY_DISABLE_F16C)
|
||||
#define HWY_TARGET_STR_F16C ""
|
||||
#else
|
||||
#define HWY_TARGET_STR_F16C ",f16c"
|
||||
#endif
|
||||
|
||||
#define HWY_TARGET_STR_SSE2 "sse2"
|
||||
|
||||
#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
|
||||
|
||||
#define HWY_TARGET_STR_SSE4 \
|
||||
HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
|
||||
// Include previous targets, which are the half-vectors of the next target.
|
||||
#define HWY_TARGET_STR_AVX2 \
|
||||
HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
|
||||
#define HWY_TARGET_STR_AVX3 \
|
||||
HWY_TARGET_STR_AVX2 ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw"
|
||||
#define HWY_TARGET_STR_AVX3_DL \
|
||||
HWY_TARGET_STR_AVX3 \
|
||||
",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
|
||||
"avx512vpopcntdq,gfni"
|
||||
|
||||
#define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_DL ",avx512fp16"
|
||||
|
||||
#if defined(HWY_DISABLE_PPC8_CRYPTO)
|
||||
#define HWY_TARGET_STR_PPC8_CRYPTO ""
|
||||
#else
|
||||
#define HWY_TARGET_STR_PPC8_CRYPTO ",crypto"
|
||||
#endif
|
||||
|
||||
#define HWY_TARGET_STR_PPC8 \
|
||||
"altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO
|
||||
#define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector"
|
||||
|
||||
#if HWY_COMPILER_CLANG
|
||||
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector"
|
||||
#else
|
||||
#define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10"
|
||||
#endif
|
||||
|
||||
// Before include guard so we redefine HWY_TARGET_STR on each include,
|
||||
// governed by the current HWY_TARGET.
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// SSE2
|
||||
#if HWY_TARGET == HWY_SSE2
|
||||
|
||||
#define HWY_NAMESPACE N_SSE2
|
||||
#define HWY_ALIGN alignas(16)
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_LANES(T) (16 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_SSE2
|
||||
//-----------------------------------------------------------------------------
|
||||
// SSSE3
|
||||
#elif HWY_TARGET == HWY_SSSE3
|
||||
|
||||
#define HWY_NAMESPACE N_SSSE3
|
||||
#define HWY_ALIGN alignas(16)
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_LANES(T) (16 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// SSE4
|
||||
#elif HWY_TARGET == HWY_SSE4
|
||||
|
||||
#define HWY_NAMESPACE N_SSE4
|
||||
#define HWY_ALIGN alignas(16)
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_LANES(T) (16 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_SSE4
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// AVX2
|
||||
#elif HWY_TARGET == HWY_AVX2
|
||||
|
||||
#define HWY_NAMESPACE N_AVX2
|
||||
#define HWY_ALIGN alignas(32)
|
||||
#define HWY_MAX_BYTES 32
|
||||
#define HWY_LANES(T) (32 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
|
||||
#ifdef HWY_DISABLE_BMI2_FMA
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#else
|
||||
#define HWY_NATIVE_FMA 1
|
||||
#endif
|
||||
|
||||
#define HWY_CAP_GE256 1
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_AVX2
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// AVX3, AVX3_DL, AVX3_ZEN4, AVX3_SPR
|
||||
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
|
||||
HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
|
||||
|
||||
#define HWY_ALIGN alignas(64)
|
||||
#define HWY_MAX_BYTES 64
|
||||
#define HWY_LANES(T) (64 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#if (HWY_TARGET == HWY_AVX3_SPR) && 0 // TODO(janwas): enable after testing
|
||||
#define HWY_HAVE_FLOAT16 1
|
||||
#else
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#endif
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
||||
#define HWY_NATIVE_FMA 1
|
||||
#define HWY_CAP_GE256 1
|
||||
#define HWY_CAP_GE512 1
|
||||
|
||||
#if HWY_TARGET == HWY_AVX3
|
||||
|
||||
#define HWY_NAMESPACE N_AVX3
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3
|
||||
|
||||
#elif HWY_TARGET == HWY_AVX3_DL
|
||||
|
||||
#define HWY_NAMESPACE N_AVX3_DL
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
|
||||
|
||||
#elif HWY_TARGET == HWY_AVX3_ZEN4
|
||||
|
||||
#define HWY_NAMESPACE N_AVX3_ZEN4
|
||||
// Currently the same as HWY_AVX3_DL: both support Icelake.
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
|
||||
|
||||
#elif HWY_TARGET == HWY_AVX3_SPR
|
||||
|
||||
#define HWY_NAMESPACE N_AVX3_SPR
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR
|
||||
|
||||
#else
|
||||
#error "Logic error"
|
||||
#endif // HWY_TARGET
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// PPC8, PPC9, PPC10
|
||||
#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
|
||||
HWY_TARGET == HWY_PPC10
|
||||
|
||||
#define HWY_ALIGN alignas(16)
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_LANES(T) (16 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#define HWY_NATIVE_FMA 1
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#if HWY_TARGET == HWY_PPC8
|
||||
|
||||
#define HWY_NAMESPACE N_PPC8
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_PPC8
|
||||
|
||||
#elif HWY_TARGET == HWY_PPC9
|
||||
|
||||
#define HWY_NAMESPACE N_PPC9
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_PPC9
|
||||
|
||||
#elif HWY_TARGET == HWY_PPC10
|
||||
|
||||
#define HWY_NAMESPACE N_PPC10
|
||||
#define HWY_TARGET_STR HWY_TARGET_STR_PPC10
|
||||
|
||||
#else
|
||||
#error "Logic error"
|
||||
#endif // HWY_TARGET == HWY_PPC10
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// NEON
|
||||
#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
|
||||
|
||||
#define HWY_ALIGN alignas(16)
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_LANES(T) (16 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
#define HWY_HAVE_FLOAT16 1
|
||||
#else
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#else
|
||||
#define HWY_HAVE_FLOAT64 0
|
||||
#endif
|
||||
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
|
||||
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
|
||||
#define HWY_NATIVE_FMA 1
|
||||
#else
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#endif
|
||||
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#if HWY_TARGET == HWY_NEON_WITHOUT_AES
|
||||
#define HWY_NAMESPACE N_NEON_WITHOUT_AES
|
||||
#else
|
||||
#define HWY_NAMESPACE N_NEON
|
||||
#endif
|
||||
|
||||
// Can use pragmas instead of -march compiler flag
|
||||
#if HWY_HAVE_RUNTIME_DISPATCH
|
||||
#if HWY_ARCH_ARM_V7
|
||||
|
||||
// The __attribute__((target("+neon-vfpv4"))) was introduced in gcc >= 8.
|
||||
#if HWY_COMPILER_GCC_ACTUAL >= 800
|
||||
#define HWY_TARGET_STR "+neon-vfpv4"
|
||||
#else // GCC < 8
|
||||
// Do not define HWY_TARGET_STR (no pragma).
|
||||
#endif // HWY_COMPILER_GCC_ACTUAL
|
||||
|
||||
#else // !HWY_ARCH_ARM_V7
|
||||
|
||||
#if HWY_TARGET == HWY_NEON_WITHOUT_AES
|
||||
// Do not define HWY_TARGET_STR (no pragma).
|
||||
#else
|
||||
#define HWY_TARGET_STR "+crypto"
|
||||
#endif // HWY_TARGET == HWY_NEON_WITHOUT_AES
|
||||
|
||||
#endif // HWY_ARCH_ARM_V7
|
||||
#else // !HWY_HAVE_RUNTIME_DISPATCH
|
||||
// HWY_TARGET_STR remains undefined
|
||||
#endif
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// SVE[2]
|
||||
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
|
||||
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
|
||||
|
||||
// SVE only requires lane alignment, not natural alignment of the entire vector.
|
||||
#define HWY_ALIGN alignas(8)
|
||||
|
||||
// Value ensures MaxLanes() is the tightest possible upper bound to reduce
|
||||
// overallocation.
|
||||
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
||||
#define HWY_NATIVE_FMA 1
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#if HWY_TARGET == HWY_SVE2
|
||||
#define HWY_NAMESPACE N_SVE2
|
||||
#define HWY_MAX_BYTES 256
|
||||
#define HWY_HAVE_SCALABLE 1
|
||||
#elif HWY_TARGET == HWY_SVE_256
|
||||
#define HWY_NAMESPACE N_SVE_256
|
||||
#define HWY_MAX_BYTES 32
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#elif HWY_TARGET == HWY_SVE2_128
|
||||
#define HWY_NAMESPACE N_SVE2_128
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#else
|
||||
#define HWY_NAMESPACE N_SVE
|
||||
#define HWY_MAX_BYTES 256
|
||||
#define HWY_HAVE_SCALABLE 1
|
||||
#endif
|
||||
|
||||
// Can use pragmas instead of -march compiler flag
|
||||
#if HWY_HAVE_RUNTIME_DISPATCH
|
||||
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
|
||||
#define HWY_TARGET_STR "+sve2-aes"
|
||||
#else
|
||||
#define HWY_TARGET_STR "+sve"
|
||||
#endif
|
||||
#else
|
||||
// HWY_TARGET_STR remains undefined
|
||||
#endif
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// WASM
|
||||
#elif HWY_TARGET == HWY_WASM
|
||||
|
||||
#define HWY_ALIGN alignas(16)
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_LANES(T) (16 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_NAMESPACE N_WASM
|
||||
|
||||
#define HWY_TARGET_STR "simd128"
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// WASM_EMU256
|
||||
#elif HWY_TARGET == HWY_WASM_EMU256
|
||||
|
||||
#define HWY_ALIGN alignas(32)
|
||||
#define HWY_MAX_BYTES 32
|
||||
#define HWY_LANES(T) (32 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 0
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#define HWY_CAP_GE256 1
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_NAMESPACE N_WASM_EMU256
|
||||
|
||||
#define HWY_TARGET_STR "simd128"
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// RVV
|
||||
#elif HWY_TARGET == HWY_RVV
|
||||
|
||||
// RVV only requires lane alignment, not natural alignment of the entire vector,
|
||||
// and the compiler already aligns builtin types, so nothing to do here.
|
||||
#define HWY_ALIGN
|
||||
|
||||
// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
|
||||
#define HWY_MAX_BYTES 65536
|
||||
|
||||
// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
|
||||
// LMUL. This is the tightest possible upper bound.
|
||||
#define HWY_LANES(T) (8192 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 1
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
||||
#define HWY_NATIVE_FMA 1
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#if defined(__riscv_zvfh)
|
||||
#define HWY_HAVE_FLOAT16 1
|
||||
#else
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#endif
|
||||
|
||||
#define HWY_NAMESPACE N_RVV
|
||||
|
||||
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
|
||||
// (rv64gcv is not a valid target)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// EMU128
|
||||
#elif HWY_TARGET == HWY_EMU128
|
||||
|
||||
#define HWY_ALIGN alignas(16)
|
||||
#define HWY_MAX_BYTES 16
|
||||
#define HWY_LANES(T) (16 / sizeof(T))
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_NAMESPACE N_EMU128
|
||||
|
||||
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// SCALAR
|
||||
#elif HWY_TARGET == HWY_SCALAR
|
||||
|
||||
#define HWY_ALIGN
|
||||
#define HWY_MAX_BYTES 8
|
||||
#define HWY_LANES(T) 1
|
||||
|
||||
#define HWY_HAVE_SCALABLE 0
|
||||
#define HWY_HAVE_INTEGER64 1
|
||||
#define HWY_HAVE_FLOAT16 0
|
||||
#define HWY_HAVE_FLOAT64 1
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 0
|
||||
#define HWY_NATIVE_FMA 0
|
||||
#define HWY_CAP_GE256 0
|
||||
#define HWY_CAP_GE512 0
|
||||
|
||||
#define HWY_NAMESPACE N_SCALAR
|
||||
|
||||
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
|
||||
|
||||
#else
|
||||
#pragma message("HWY_TARGET does not match any known target")
|
||||
#endif // HWY_TARGET
|
||||
|
||||
// Override this to 1 in asan/msan builds, which will still fault.
|
||||
#if HWY_IS_ASAN || HWY_IS_MSAN
|
||||
#undef HWY_MEM_OPS_MIGHT_FAULT
|
||||
#define HWY_MEM_OPS_MIGHT_FAULT 1
|
||||
#endif
|
||||
|
||||
// Clang <9 requires this be invoked at file scope, before any namespace.
|
||||
#undef HWY_BEFORE_NAMESPACE
|
||||
#if defined(HWY_TARGET_STR)
|
||||
#define HWY_BEFORE_NAMESPACE() \
|
||||
HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
|
||||
static_assert(true, "For requiring trailing semicolon")
|
||||
#else
|
||||
// avoids compiler warning if no HWY_TARGET_STR
|
||||
#define HWY_BEFORE_NAMESPACE() \
|
||||
static_assert(true, "For requiring trailing semicolon")
|
||||
#endif
|
||||
|
||||
// Clang <9 requires any namespaces be closed before this macro.
|
||||
#undef HWY_AFTER_NAMESPACE
|
||||
#if defined(HWY_TARGET_STR)
|
||||
#define HWY_AFTER_NAMESPACE() \
|
||||
HWY_POP_ATTRIBUTES \
|
||||
static_assert(true, "For requiring trailing semicolon")
|
||||
#else
|
||||
// avoids compiler warning if no HWY_TARGET_STR
|
||||
#define HWY_AFTER_NAMESPACE() \
|
||||
static_assert(true, "For requiring trailing semicolon")
|
||||
#endif
|
||||
|
||||
#undef HWY_ATTR
|
||||
#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
|
||||
#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
|
||||
#else
|
||||
#define HWY_ATTR
|
||||
#endif
|
520
deps/highway/include/hwy/ops/shared-inl.h
vendored
Normal file
520
deps/highway/include/hwy/ops/shared-inl.h
vendored
Normal file
|
@ -0,0 +1,520 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target definitions shared by ops/*.h and user code.
|
||||
|
||||
// IWYU pragma: begin_exports
|
||||
// Export does not seem to be recursive, so re-export these (also in base.h)
|
||||
#include <stddef.h>
|
||||
|
||||
#include "hwy/base.h"
|
||||
// "IWYU pragma: keep" does not work for this include, so hide it from the IDE.
|
||||
#if !HWY_IDE
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#include "hwy/detect_compiler_arch.h"
|
||||
|
||||
// Separate header because foreach_target.h re-enables its include guard.
|
||||
#include "hwy/ops/set_macros-inl.h"
|
||||
|
||||
// IWYU pragma: end_exports
|
||||
|
||||
#if HWY_IS_MSAN
|
||||
#include <sanitizer/msan_interface.h>
|
||||
#endif
|
||||
|
||||
// We are covered by the highway.h include guard, but generic_ops-inl.h
|
||||
// includes this again #if HWY_IDE.
|
||||
#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
|
||||
#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
|
||||
#else
|
||||
#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
|
||||
// functions in two situations:
|
||||
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
|
||||
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
|
||||
// - on aarch64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
|
||||
// all) tests to fail.
|
||||
//
|
||||
// We therefore pass by const& only on GCC and (Windows or aarch64). This alias
|
||||
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
|
||||
// and possibly also other functions that are not inlined.
|
||||
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
|
||||
// GCC on Windows or aarch64: pass vectors/masks by const reference.
template <class V>
|
||||
using VecArg = const V&;
|
||||
#else
|
||||
// All other configurations: pass vectors/masks by value.
template <class V>
|
||||
using VecArg = V;
|
||||
#endif
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Returns N * 2^pow2. N is the number of lanes in a full vector and pow2 the
|
||||
// desired fraction or multiple of it, see Simd<>. `pow2` is most often in
|
||||
// [-3, 3] but can also be lower for user-specified fractions.
|
||||
constexpr size_t ScaleByPower(size_t N, int pow2) {
|
||||
// Non-negative pow2 multiplies via left shift; negative pow2 divides via
// right shift, which discards low bits and can therefore yield 0.
return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
|
||||
}
|
||||
|
||||
// In MSAN builds, marks `count` elements of type T starting at `unaligned` as
// initialized. In all other builds this does nothing.
template <typename T>
|
||||
HWY_INLINE void MaybeUnpoison(T* HWY_RESTRICT unaligned, size_t count) {
|
||||
// Workaround for MSAN not marking compressstore as initialized (b/233326619)
|
||||
#if HWY_IS_MSAN
|
||||
__msan_unpoison(unaligned, count * sizeof(T));
|
||||
#else
|
||||
// Not an MSAN build: the casts to void mark both parameters as
// intentionally unused.
(void)unaligned;
|
||||
(void)count;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Highway operations are implemented as overloaded functions selected using a
|
||||
// zero-sized tag type D := Simd<T, N, kPow2>. T denotes the lane type.
|
||||
//
|
||||
// N defines how many lanes are in a 'full' vector, typically equal to
|
||||
// HWY_LANES(T) (which is the actual count on targets with vectors of known
|
||||
// size, and an upper bound in case of scalable vectors), otherwise a
|
||||
// user-specified limit at most that large.
|
||||
//
|
||||
// 2^kPow2 is a _subsequently_ applied scaling factor that indicates the
|
||||
// desired fraction of a 'full' vector: 0 means full, -1 means half; 1,2,3
|
||||
// means two/four/eight full vectors ganged together. The largest supported
|
||||
// kPow2 is `HWY_MAX_POW2` and the aliases below take care of clamping
|
||||
// user-specified values to that. Note that `Simd<T, 1, 0>` and `Simd<T, 2, -1>`
|
||||
// have the same `MaxLanes` and `Lanes`.
|
||||
//
|
||||
// We can theoretically keep halving Lanes(), but recursive instantiations of
|
||||
// kPow2 - 1 will eventually fail e.g. because -64 is not a valid shift count.
|
||||
// Users must terminate such compile-time recursions at or above HWY_MIN_POW2.
|
||||
//
|
||||
// WARNING: do not use N directly because it may be a special representation of
|
||||
// a fractional MaxLanes. This arises when we Rebind Simd<uint8_t, 1, 0> to
|
||||
// Simd<uint32_t, ??, 2>. RVV requires that the last argument (kPow2) be two,
|
||||
// but we want MaxLanes to be the same in both cases. Hence ?? is a
|
||||
// fixed-point encoding of 1/4.
|
||||
//
|
||||
// Instead of referring to Simd<> directly, users create D via aliases:
|
||||
// - ScalableTag<T> for a full vector;
|
||||
// - ScalableTag<T, kPow2>() for a fraction/group, where `kPow2` is
|
||||
// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`;
|
||||
// - CappedTag<T, kLimit> for a vector with up to kLimit lanes; or
|
||||
// - FixedTag<T, kNumLanes> for a vector with exactly kNumLanes lanes.
|
||||
//
|
||||
// Instead of N, use Lanes(D()) for the actual number of lanes at runtime and
|
||||
// D().MaxLanes() for a constexpr upper bound. Both are powers of two.
|
||||
// Tag type carrying the lane type `Lane`, an encoded lane count `N` (whole
// part in the low 20 bits, fraction exponent above) and the power-of-two
// scaling exponent `kPow2`.
template <typename Lane, size_t N, int kPow2>
|
||||
struct Simd {
|
||||
constexpr Simd() = default;
|
||||
using T = Lane;
|
||||
|
||||
private:
|
||||
static_assert(sizeof(Lane) <= 8, "Lanes are up to 64-bit");
|
||||
// 20 bits are sufficient for any HWY_MAX_BYTES. This is the 'normal' value of
|
||||
// N when kFrac == 0, otherwise it is one (see FracN).
|
||||
static constexpr size_t kWhole = N & 0xFFFFF;
|
||||
// Fractional part is in the bits above kWhole.
|
||||
static constexpr int kFrac = static_cast<int>(N >> 20);
|
||||
// Can be 8x larger because kPow2 may be as low as -3 (Rebind of a larger
|
||||
// type to u8 results in fractions).
|
||||
static_assert(kWhole <= 8 * HWY_MAX_N && kFrac <= 3, "Out of range");
|
||||
static_assert(kFrac == 0 || kWhole == 1, "If frac, whole must be 1");
|
||||
static_assert((kWhole & (kWhole - 1)) == 0 && kWhole != 0, "Not 2^x");
|
||||
// Important to check this here because kPow2 <= -64 causes confusing
|
||||
// compile errors (invalid shift count).
|
||||
static_assert(kPow2 >= HWY_MIN_POW2, "Forgot kPow2 recursion terminator?");
|
||||
// However, do NOT verify kPow2 <= HWY_MAX_POW2 - users should be able to
|
||||
// Rebind<uint64_t, ScalableTag<uint8_t, 3>> in order to discover that its
|
||||
// kPow2 is out of bounds.
|
||||
|
||||
public:
|
||||
// Upper bound on the number of lanes (tight if !HWY_HAVE_SCALABLE). In the
|
||||
// common case, N == kWhole, but if kFrac is nonzero, we deduct it from kPow2.
|
||||
// E.g. Rebind<uint32_t, Simd<uint8_t, 1, 0>> is Simd<uint32_t, 0x200001, 2>.
|
||||
// The resulting number of lanes is still 1 because this N represents 1/4
|
||||
// (the ratio of the sizes). Note that RVV requires kPow2 to be the ratio of
|
||||
// the sizes so that the correct LMUL overloads are chosen, even if N is
|
||||
// small enough that it would fit in an LMUL=1 vector.
|
||||
//
|
||||
// Cannot be an enum because GCC warns when using enums and non-enums in the
|
||||
// same expression. Cannot be a static constexpr function (MSVC limitation).
|
||||
// Rounded up to one so this is a valid array length.
|
||||
//
|
||||
// Do not use this directly - only 'public' so it is visible from the accessor
|
||||
// macro required by MSVC.
|
||||
static constexpr size_t kPrivateLanes =
|
||||
HWY_MAX(size_t{1}, detail::ScaleByPower(kWhole, kPow2 - kFrac));
|
||||
|
||||
// Compile-time upper bound on the number of lanes (kPrivateLanes).
constexpr size_t MaxLanes() const { return kPrivateLanes; }
|
||||
// MaxLanes() expressed in bytes.
constexpr size_t MaxBytes() const { return kPrivateLanes * sizeof(Lane); }
|
||||
// Number of 16-byte blocks needed to hold MaxBytes(), rounded up.
constexpr size_t MaxBlocks() const { return (MaxBytes() + 15) / 16; }
|
||||
// For SFINAE on RVV.
|
||||
constexpr int Pow2() const { return kPow2; }
|
||||
|
||||
// ------------------------------ Changing lane type or count
|
||||
// Do not use any of these directly. Anything used from member typedefs cannot
|
||||
// be made private, but functions only used within other functions can.
|
||||
|
||||
// Returns number of NewT lanes that fit within MaxBytes().
|
||||
template <typename NewT>
|
||||
static constexpr size_t RepartitionLanes() {
|
||||
// Round up to correctly handle larger NewT.
|
||||
return (kPrivateLanes * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
|
||||
}
|
||||
|
||||
// Returns the new kPow2 required for lanes of type NewT.
|
||||
template <typename NewT>
|
||||
static constexpr int RebindPow2() {
|
||||
// Adds CeilLog2 of the size ratio when NewT is at least as large as T,
// otherwise subtracts CeilLog2 of the inverse ratio.
return kPow2 +
|
||||
((sizeof(NewT) >= sizeof(T))
|
||||
? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
|
||||
: -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))));
|
||||
}
|
||||
|
||||
private:
|
||||
// Returns 0 or whole NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
|
||||
template <int kNewPow2, size_t kNewMaxLanes>
|
||||
static constexpr size_t WholeN() {
|
||||
return detail::ScaleByPower(kNewMaxLanes, -kNewPow2);
|
||||
}
|
||||
|
||||
// Returns fractional NewN such that kNewMaxLanes = NewN * 2^kNewPow2.
|
||||
template <int kNewPow2, size_t kNewMaxLanes>
|
||||
static constexpr size_t FracN() {
|
||||
// Only reached if kNewPow2 > CeilLog2(kNewMaxLanes) >= 0 (else WholeN
|
||||
// would not have been zero), but clamp to zero to avoid warnings. kFrac is
|
||||
// the difference, stored in the upper bits of N, and we also set kWhole =
|
||||
// 1 so that the new kPrivateLanes = kNewMaxLanes.
|
||||
static_assert(HWY_MAX_N <= (size_t{1} << 20), "Change bit shift");
|
||||
// The leading 1 is the whole part; the clamped difference is shifted to
// bit 20 and above, which is where kFrac is read back from N.
return static_cast<size_t>(
|
||||
1 + (HWY_MAX(0, kNewPow2 - static_cast<int>(CeilLog2(kNewMaxLanes)))
|
||||
<< 20));
|
||||
}
|
||||
|
||||
public:
|
||||
// Returns (whole or fractional) NewN, see above.
|
||||
template <int kNewPow2, size_t kNewMaxLanes>
|
||||
static constexpr size_t NewN() {
|
||||
// We require a fraction if inverting kNewPow2 results in 0.
|
||||
return WholeN<kNewPow2, kNewMaxLanes>() == 0
|
||||
? FracN<kNewPow2, kNewMaxLanes>()
|
||||
: WholeN<kNewPow2, kNewMaxLanes>();
|
||||
}
|
||||
|
||||
// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
|
||||
template <typename NewT>
|
||||
using Rebind =
|
||||
Simd<NewT, NewN<RebindPow2<NewT>(), kPrivateLanes>(), RebindPow2<NewT>()>;
|
||||
|
||||
// Change lane type while keeping the same vector size, e.g. for MulEven.
|
||||
template <typename NewT>
|
||||
using Repartition =
|
||||
Simd<NewT, NewN<kPow2, RepartitionLanes<NewT>()>(), kPow2>;
|
||||
|
||||
// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
|
||||
using Half = Simd<T, N, kPow2 - 1>;
|
||||
|
||||
// Twice the lanes while keeping the same lane type, e.g. for Combine.
|
||||
using Twice = Simd<T, N, kPow2 + 1>;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Returns whether the tag's N equals HWY_LANES(T) and its kPow2 is zero, i.e.
// no power-of-two scaling is applied to a full lane count.
template <typename T, size_t N, int kPow2>
|
||||
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
|
||||
return N == HWY_LANES(T) && kPow2 == 0;
|
||||
}
|
||||
|
||||
// Struct wrappers enable validation of arguments via static_assert.
|
||||
// `type` is Simd<> with N limited to at most HWY_MAX_N and kPow2 limited to at
// most HWY_MAX_POW2.
template <typename T, size_t N, int kPow2>
|
||||
struct ClampNAndPow2 {
|
||||
using type = Simd<T, HWY_MIN(N, HWY_MAX_N), HWY_MIN(kPow2, HWY_MAX_POW2)>;
|
||||
};
|
||||
|
||||
// `type` is the tag for N = HWY_LANES(T) with the given kPow2, after clamping
// both via ClampNAndPow2.
template <typename T, int kPow2>
|
||||
struct ScalableTagChecker {
|
||||
using type = typename ClampNAndPow2<T, HWY_LANES(T), kPow2>::type;
|
||||
};
|
||||
|
||||
// `type` is a tag whose N is kLimit rounded down to a power of two and then
// capped at HWY_LANES(T); N and kPow2 are finally clamped via ClampNAndPow2.
template <typename T, size_t kLimit, int kPow2>
|
||||
struct CappedTagChecker {
|
||||
static_assert(kLimit != 0, "Does not make sense to have zero lanes");
|
||||
// Safely handle non-power-of-two inputs by rounding down, which is allowed by
|
||||
// CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
|
||||
static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
|
||||
static constexpr size_t N = HWY_MIN(kLimitPow2, HWY_LANES(T));
|
||||
using type = typename ClampNAndPow2<T, N, kPow2>::type;
|
||||
};
|
||||
|
||||
// `type` is Simd<T, kNumLanes, 0>. Compilation fails if kNumLanes is zero or
// exceeds HWY_LANES(T).
template <typename T, size_t kNumLanes>
|
||||
struct FixedTagChecker {
|
||||
static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
|
||||
static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
|
||||
using type = Simd<T, kNumLanes, 0>;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// ------------------------------ Aliases for Simd<>
|
||||
|
||||
// Tag describing a full vector (kPow2 == 0: the most common usage, e.g. 1D
|
||||
// loops where the application does not care about the vector size) or a
|
||||
// fraction/multiple of one. Fractions (kPow2 < 0) are useful for arguments or
|
||||
// return values of type promotion and demotion. User-specified kPow2 is
|
||||
// interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
|
||||
template <typename T, int kPow2 = 0>
|
||||
using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
|
||||
|
||||
// Tag describing a vector with *up to* kLimit active lanes, even on targets
|
||||
// with scalable vectors and HWY_SCALAR. The runtime lane count `Lanes(tag)` may
|
||||
// be less than kLimit, and is 1 on HWY_SCALAR. This alias is typically used for
|
||||
// 1D loops with a relatively low application-defined upper bound, e.g. for 8x8
|
||||
// DCTs. However, it is better if data structures are designed to be
|
||||
// vector-length-agnostic (e.g. a hybrid SoA where there are chunks of `M >=
|
||||
// MaxLanes(d)` DC components followed by M AC1, .., and M AC63; this would
|
||||
// enable vector-length-agnostic loops using ScalableTag). User-specified kPow2
|
||||
// is interpreted as `HWY_MIN(kPow2, HWY_MAX_POW2)`.
|
||||
template <typename T, size_t kLimit, int kPow2 = 0>
|
||||
using CappedTag = typename detail::CappedTagChecker<T, kLimit, kPow2>::type;
|
||||
|
||||
#if !HWY_HAVE_SCALABLE
|
||||
// If the vector size is known, and the app knows it does not want more than
|
||||
// kLimit lanes, then capping can be beneficial. For example, AVX-512 has lower
|
||||
// IPC and potentially higher costs for unaligned load/store vs. 256-bit AVX2.
|
||||
template <typename T, size_t kLimit, int kPow2 = 0>
|
||||
using CappedTagIfFixed = CappedTag<T, kLimit, kPow2>;
|
||||
#else // HWY_HAVE_SCALABLE
|
||||
// .. whereas on RVV/SVE, the cost of clamping Lanes() may exceed the benefit.
|
||||
template <typename T, size_t kLimit, int kPow2 = 0>
|
||||
using CappedTagIfFixed = ScalableTag<T, kPow2>;
|
||||
#endif
|
||||
|
||||
// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
|
||||
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
|
||||
// two not exceeding `HWY_LANES(T)`.
|
||||
//
|
||||
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
|
||||
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
|
||||
// This is useful for data structures that rely on exactly 128-bit SIMD, but
|
||||
// these are discouraged because they cannot benefit from wider vectors.
|
||||
// Instead, applications would ideally define a larger problem size and loop
|
||||
// over it with the (unknown size) vectors from ScalableTag.
|
||||
//
|
||||
// + e.g. if the baseline is known to support SIMD, or the application requires
|
||||
// ops such as TableLookupBytes not supported by HWY_SCALAR.
|
||||
template <typename T, size_t kNumLanes>
|
||||
using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
|
||||
|
||||
// Convenience form for fixed sizes.
|
||||
template <typename T>
|
||||
using Full16 = Simd<T, 2 / sizeof(T), 0>;
|
||||
|
||||
template <typename T>
|
||||
using Full32 = Simd<T, 4 / sizeof(T), 0>;
|
||||
|
||||
template <typename T>
|
||||
using Full64 = Simd<T, 8 / sizeof(T), 0>;
|
||||
|
||||
template <typename T>
|
||||
using Full128 = Simd<T, 16 / sizeof(T), 0>;
|
||||
|
||||
// ------------------------------ Accessors for Simd<>
|
||||
|
||||
// Lane type of tag D, i.e. its nested `T`.
template <class D>
using TFromD = typename D::T;
|
||||
|
||||
// Upper bound on the number of lanes, typically used for SFINAE conditions and
|
||||
// to allocate storage for targets with known vector sizes. Note: this may be a
|
||||
// loose bound, instead use Lanes() as the actual size for AllocateAligned.
|
||||
// MSVC workaround: use static constant directly instead of a function.
|
||||
#define HWY_MAX_LANES_D(D) D::kPrivateLanes
|
||||
|
||||
// Non-macro form of HWY_MAX_LANES_D in case that is preferable. WARNING: the
|
||||
// macro form may be required for MSVC, which has limitations on deducing
|
||||
// arguments.
|
||||
template <class D>
|
||||
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
|
||||
return HWY_MAX_LANES_D(D);
|
||||
}
|
||||
|
||||
#if !HWY_HAVE_SCALABLE
|
||||
|
||||
// If non-scalable, this is constexpr; otherwise the target's header defines a
|
||||
// non-constexpr version of this function. This is the actual vector length,
|
||||
// used when advancing loop counters.
|
||||
template <class D>
|
||||
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t Lanes(D) {
|
||||
return HWY_MAX_LANES_D(D);
|
||||
}
|
||||
|
||||
#endif // !HWY_HAVE_SCALABLE
|
||||
|
||||
// Tag for the same number of lanes as D, but with the LaneType T.
|
||||
template <class T, class D>
|
||||
using Rebind = typename D::template Rebind<T>;
|
||||
|
||||
template <class D>
|
||||
using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
|
||||
template <class D>
|
||||
using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
|
||||
template <class D>
|
||||
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
|
||||
|
||||
// Tag for the same total size as D, but with the LaneType T.
|
||||
template <class T, class D>
|
||||
using Repartition = typename D::template Repartition<T>;
|
||||
|
||||
template <class D>
|
||||
using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
|
||||
template <class D>
|
||||
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
|
||||
|
||||
// Tag with D's lane type but half as many lanes; forwards to D's nested Half.
template <class D>
using Half = typename D::Half;
|
||||
|
||||
// Tag with D's lane type but twice as many lanes; forwards to D's nested
// Twice.
template <class D>
using Twice = typename D::Twice;
|
||||
|
||||
// Tag for a 16-byte block with the same lane type as D
|
||||
#if HWY_HAVE_SCALABLE
|
||||
namespace detail {
|
||||
|
||||
template <class D>
|
||||
class BlockDFromD_t {};
|
||||
|
||||
template <typename T, size_t N, int kPow2>
|
||||
class BlockDFromD_t<Simd<T, N, kPow2>> {
|
||||
using D = Simd<T, N, kPow2>;
|
||||
static constexpr int kNewPow2 = HWY_MIN(kPow2, 0);
|
||||
static constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), HWY_MAX_LANES_D(D));
|
||||
static constexpr size_t kNewN = D::template NewN<kNewPow2, kMaxLpb>();
|
||||
|
||||
public:
|
||||
using type = Simd<T, kNewN, kNewPow2>;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <class D>
|
||||
using BlockDFromD = typename detail::BlockDFromD_t<RemoveConst<D>>::type;
|
||||
#else
|
||||
template <class D>
|
||||
using BlockDFromD =
|
||||
Simd<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), HWY_MAX_LANES_D(D)), 0>;
|
||||
#endif
|
||||
|
||||
// ------------------------------ Choosing overloads (SFINAE)
|
||||
|
||||
// SFINAE helpers that take a tag type D (Simd<T, N, kPow2>) where the base.h
// macros of the same name take the lane type T.

// Conditions on the kind of lane type.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_FLOAT3264_D(D) HWY_IF_FLOAT3264(TFromD<D>)
#define HWY_IF_NOT_FLOAT3264_D(D) HWY_IF_NOT_FLOAT3264(TFromD<D>)
#define HWY_IF_SPECIAL_FLOAT_D(D) HWY_IF_SPECIAL_FLOAT(TFromD<D>)
#define HWY_IF_NOT_SPECIAL_FLOAT_D(D) HWY_IF_NOT_SPECIAL_FLOAT(TFromD<D>)
#define HWY_IF_FLOAT_OR_SPECIAL_D(D) HWY_IF_FLOAT_OR_SPECIAL(TFromD<D>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_D(D) \
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>)

// Conditions on the size of the lane type.
#define HWY_IF_T_SIZE_D(D, bytes) HWY_IF_T_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_T_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE(TFromD<D>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array) \
  HWY_IF_T_SIZE_ONE_OF(TFromD<D>, bit_array)

// Conditions on the lane bound HWY_MAX_LANES_D(D). The per-block variant caps
// that bound at the number of lanes that fit in 16 bytes.
#define HWY_IF_LANES_D(D, lanes) HWY_IF_LANES(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_LE_D(D, lanes) HWY_IF_LANES_LE(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_GT_D(D, lanes) HWY_IF_LANES_GT(HWY_MAX_LANES_D(D), lanes)
#define HWY_IF_LANES_PER_BLOCK_D(D, lanes)                                  \
  HWY_IF_LANES_PER_BLOCK(                                                   \
      TFromD<D>, HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>)), lanes)

// Conditions on the tag's Pow2().
#define HWY_IF_POW2_LE_D(D, pow2) hwy::EnableIf<D().Pow2() <= pow2>* = nullptr
#define HWY_IF_POW2_GT_D(D, pow2) hwy::EnableIf<(D().Pow2() > pow2)>* = nullptr

// Lane type is exactly the named unsigned integer type.
#define HWY_IF_U8_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint8_t>()>* = nullptr
#define HWY_IF_U16_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint16_t>()>* = nullptr
#define HWY_IF_U32_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint32_t>()>* = nullptr
#define HWY_IF_U64_D(D) hwy::EnableIf<IsSame<TFromD<D>, uint64_t>()>* = nullptr

// Lane type is exactly the named signed integer type.
#define HWY_IF_I8_D(D) hwy::EnableIf<IsSame<TFromD<D>, int8_t>()>* = nullptr
#define HWY_IF_I16_D(D) hwy::EnableIf<IsSame<TFromD<D>, int16_t>()>* = nullptr
#define HWY_IF_I32_D(D) hwy::EnableIf<IsSame<TFromD<D>, int32_t>()>* = nullptr
#define HWY_IF_I64_D(D) hwy::EnableIf<IsSame<TFromD<D>, int64_t>()>* = nullptr

// Prefer these over HWY_IF_T_SIZE_D where overloads for float16_t/float/double
// also exist, to avoid ambiguity with them.
#define HWY_IF_UI16_D(D) HWY_IF_UI16(TFromD<D>)
#define HWY_IF_UI32_D(D) HWY_IF_UI32(TFromD<D>)
#define HWY_IF_UI64_D(D) HWY_IF_UI64(TFromD<D>)

// Lane type is exactly the named floating-point type.
#define HWY_IF_BF16_D(D) \
  hwy::EnableIf<IsSame<TFromD<D>, hwy::bfloat16_t>()>* = nullptr
#define HWY_IF_F16_D(D) \
  hwy::EnableIf<IsSame<TFromD<D>, hwy::float16_t>()>* = nullptr
#define HWY_IF_F32_D(D) hwy::EnableIf<IsSame<TFromD<D>, float>()>* = nullptr
#define HWY_IF_F64_D(D) hwy::EnableIf<IsSame<TFromD<D>, double>()>* = nullptr

// Conditions on the vector size in bytes, derived from the lane type and
// HWY_MAX_LANES_D(D).
#define HWY_IF_V_SIZE_D(D, bytes) \
  HWY_IF_V_SIZE(TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_LE_D(D, bytes) \
  HWY_IF_V_SIZE_LE(TFromD<D>, HWY_MAX_LANES_D(D), bytes)
#define HWY_IF_V_SIZE_GT_D(D, bytes) \
  HWY_IF_V_SIZE_GT(TFromD<D>, HWY_MAX_LANES_D(D), bytes)
|
||||
|
||||
// Counterparts that take a vector type V instead of a tag. TFromV is not
// defined here; each ops/*-inl.h provides its own.

// Conditions on the kind of lane type.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_NOT_FLOAT_V(V) HWY_IF_NOT_FLOAT(TFromV<V>)
#define HWY_IF_SPECIAL_FLOAT_V(V) HWY_IF_SPECIAL_FLOAT(TFromV<V>)
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V) \
  HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromV<V>)

// Conditions on the size of the lane type.
#define HWY_IF_T_SIZE_V(V, bytes) HWY_IF_T_SIZE(TFromV<V>, bytes)
#define HWY_IF_NOT_T_SIZE_V(V, bytes) HWY_IF_NOT_T_SIZE(TFromV<V>, bytes)
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array) \
  HWY_IF_T_SIZE_ONE_OF(TFromV<V>, bit_array)

// Lane bound of V's tag, and conditions on the vector size in bytes.
#define HWY_MAX_LANES_V(V) HWY_MAX_LANES_D(DFromV<V>)
#define HWY_IF_V_SIZE_V(V, bytes) \
  HWY_IF_V_SIZE(TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_LE_V(V, bytes) \
  HWY_IF_V_SIZE_LE(TFromV<V>, HWY_MAX_LANES_V(V), bytes)
#define HWY_IF_V_SIZE_GT_V(V, bytes) \
  HWY_IF_V_SIZE_GT(TFromV<V>, HWY_MAX_LANES_V(V), bytes)
|
||||
|
||||
// Deprecated spellings; they simply forward to the *_T_SIZE_D macros.
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_T_SIZE_D(D, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_T_SIZE_D(D, bytes)
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE
|
125
deps/highway/include/hwy/ops/tuple-inl.h
vendored
Normal file
125
deps/highway/include/hwy/ops/tuple-inl.h
vendored
Normal file
|
@ -0,0 +1,125 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Tuple support. Included by those ops/* that lack native tuple types, after
|
||||
// they define VFromD and before they use the tuples e.g. for LoadInterleaved2.
|
||||
// Assumes we are already in the HWY_NAMESPACE and under an include guard.
|
||||
|
||||
// If viewing this header standalone, define VFromD to avoid IDE warnings.
|
||||
// This is normally set by set_macros-inl.h before this header is included.
|
||||
#if !defined(HWY_NAMESPACE)
|
||||
#include "hwy/base.h"
|
||||
template <class D>
|
||||
using VFromD = int;
|
||||
#endif
|
||||
|
||||
// On SVE, Vec2..4 are aliases to built-in types.
|
||||
template <class D>
|
||||
struct Vec2 {
|
||||
VFromD<D> v0;
|
||||
VFromD<D> v1;
|
||||
};
|
||||
|
||||
template <class D>
|
||||
struct Vec3 {
|
||||
VFromD<D> v0;
|
||||
VFromD<D> v1;
|
||||
VFromD<D> v2;
|
||||
};
|
||||
|
||||
template <class D>
|
||||
struct Vec4 {
|
||||
VFromD<D> v0;
|
||||
VFromD<D> v1;
|
||||
VFromD<D> v2;
|
||||
VFromD<D> v3;
|
||||
};
|
||||
|
||||
// D arg is unused but allows deducing D.
|
||||
template <class D>
|
||||
HWY_API Vec2<D> Create2(D /* tag */, VFromD<D> v0, VFromD<D> v1) {
|
||||
return Vec2<D>{v0, v1};
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_API Vec3<D> Create3(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2) {
|
||||
return Vec3<D>{v0, v1, v2};
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_API Vec4<D> Create4(D /* tag */, VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
|
||||
VFromD<D> v3) {
|
||||
return Vec4<D>{v0, v1, v2, v3};
|
||||
}
|
||||
|
||||
template <size_t kIndex, class D>
|
||||
HWY_API VFromD<D> Get2(Vec2<D> tuple) {
|
||||
static_assert(kIndex < 2, "Tuple index out of bounds");
|
||||
return kIndex == 0 ? tuple.v0 : tuple.v1;
|
||||
}
|
||||
|
||||
template <size_t kIndex, class D>
|
||||
HWY_API VFromD<D> Get3(Vec3<D> tuple) {
|
||||
static_assert(kIndex < 3, "Tuple index out of bounds");
|
||||
return kIndex == 0 ? tuple.v0 : kIndex == 1 ? tuple.v1 : tuple.v2;
|
||||
}
|
||||
|
||||
template <size_t kIndex, class D>
|
||||
HWY_API VFromD<D> Get4(Vec4<D> tuple) {
|
||||
static_assert(kIndex < 4, "Tuple index out of bounds");
|
||||
return kIndex == 0 ? tuple.v0
|
||||
: kIndex == 1 ? tuple.v1
|
||||
: kIndex == 2 ? tuple.v2
|
||||
: tuple.v3;
|
||||
}
|
||||
|
||||
template <size_t kIndex, class D>
|
||||
HWY_API Vec2<D> Set2(Vec2<D> tuple, VFromD<D> val) {
|
||||
static_assert(kIndex < 2, "Tuple index out of bounds");
|
||||
if (kIndex == 0) {
|
||||
tuple.v0 = val;
|
||||
} else {
|
||||
tuple.v1 = val;
|
||||
}
|
||||
return tuple;
|
||||
}
|
||||
|
||||
template <size_t kIndex, class D>
|
||||
HWY_API Vec3<D> Set3(Vec3<D> tuple, VFromD<D> val) {
|
||||
static_assert(kIndex < 3, "Tuple index out of bounds");
|
||||
if (kIndex == 0) {
|
||||
tuple.v0 = val;
|
||||
} else if (kIndex == 1) {
|
||||
tuple.v1 = val;
|
||||
} else {
|
||||
tuple.v2 = val;
|
||||
}
|
||||
return tuple;
|
||||
}
|
||||
|
||||
template <size_t kIndex, class D>
|
||||
HWY_API Vec4<D> Set4(Vec4<D> tuple, VFromD<D> val) {
|
||||
static_assert(kIndex < 4, "Tuple index out of bounds");
|
||||
if (kIndex == 0) {
|
||||
tuple.v0 = val;
|
||||
} else if (kIndex == 1) {
|
||||
tuple.v1 = val;
|
||||
} else if (kIndex == 2) {
|
||||
tuple.v2 = val;
|
||||
} else {
|
||||
tuple.v3 = val;
|
||||
}
|
||||
return tuple;
|
||||
}
|
5718
deps/highway/include/hwy/ops/wasm_128-inl.h
vendored
Normal file
5718
deps/highway/include/hwy/ops/wasm_128-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
10756
deps/highway/include/hwy/ops/x86_128-inl.h
vendored
Normal file
10756
deps/highway/include/hwy/ops/x86_128-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
7428
deps/highway/include/hwy/ops/x86_256-inl.h
vendored
Normal file
7428
deps/highway/include/hwy/ops/x86_256-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
6733
deps/highway/include/hwy/ops/x86_512-inl.h
vendored
Normal file
6733
deps/highway/include/hwy/ops/x86_512-inl.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
44
deps/highway/include/hwy/per_target.h
vendored
Normal file
44
deps/highway/include/hwy/per_target.h
vendored
Normal file
|
@ -0,0 +1,44 @@
|
|||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_PER_TARGET_H_
|
||||
#define HIGHWAY_HWY_PER_TARGET_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "hwy/highway_export.h"
|
||||
|
||||
// Functions to query the capabilities of the target that will be called by
|
||||
// HWY_DYNAMIC_DISPATCH, which is not necessarily the current target.
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
|
||||
//
|
||||
// Do not cache the result, which may change after calling DisableTargets, or
|
||||
// if software requests a different vector size (e.g. when entering/exiting SME
|
||||
// streaming mode). Instead call this right before the code that depends on the
|
||||
// result, without any DisableTargets or SME transition in-between. Note that
|
||||
// this involves an indirect call, so prefer not to call this frequently nor
|
||||
// unnecessarily.
|
||||
HWY_DLLEXPORT size_t VectorBytes();
|
||||
|
||||
// Returns whether 16/64-bit floats are a supported lane type.
|
||||
HWY_DLLEXPORT bool HaveFloat16();
|
||||
HWY_DLLEXPORT bool HaveFloat64();
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_PER_TARGET_H_
|
62
deps/highway/include/hwy/print-inl.h
vendored
Normal file
62
deps/highway/include/hwy/print-inl.h
vendored
Normal file
|
@ -0,0 +1,62 @@
|
|||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Print() function
|
||||
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/print.h"
|
||||
|
||||
// Per-target include guard
|
||||
#if defined(HIGHWAY_HWY_PRINT_INL_H_) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_PRINT_INL_H_
|
||||
#undef HIGHWAY_HWY_PRINT_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_PRINT_INL_H_
|
||||
#endif
|
||||
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Prints lanes around `lane`, in memory order.
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_API void Print(const D d, const char* caption, V v, size_t lane_u = 0,
|
||||
size_t max_lanes = 7) {
|
||||
const size_t N = Lanes(d);
|
||||
using T = TFromD<D>;
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
auto storage = AllocateAligned<T>(N);
|
||||
T* HWY_RESTRICT lanes = storage.get();
|
||||
#else
|
||||
// This works around an SVE compile error on GCC 11 and 12. Calling
|
||||
// AllocateAligned here would seem to require it be marked with HWY_ATTR.
|
||||
HWY_ALIGN T lanes[MaxLanes(d)];
|
||||
#endif
|
||||
Store(v, d, lanes);
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
hwy::detail::PrintArray(info, caption, lanes, N, lane_u, max_lanes);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // per-target include guard
|
75
deps/highway/include/hwy/print.h
vendored
Normal file
75
deps/highway/include/hwy/print.h
vendored
Normal file
|
@ -0,0 +1,75 @@
|
|||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HWY_PRINT_H_
|
||||
#define HWY_PRINT_H_
|
||||
|
||||
// Helpers for printing vector lanes.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/highway_export.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Runtime description of a lane type. Lets value comparisons etc. be written
// as type-erased functions, which reduces template bloat.
struct TypeInfo {
  size_t sizeof_t;  // sizeof(T)
  bool is_float;    // IsFloat<T>()
  bool is_signed;   // IsSigned<T>()
  bool is_bf16;     // T is bfloat16_t
};
|
||||
|
||||
template <typename T>
|
||||
HWY_INLINE TypeInfo MakeTypeInfo() {
|
||||
TypeInfo info;
|
||||
info.sizeof_t = sizeof(T);
|
||||
info.is_float = IsFloat<T>();
|
||||
info.is_signed = IsSigned<T>();
|
||||
info.is_bf16 = IsSame<T, bfloat16_t>();
|
||||
return info;
|
||||
}
|
||||
|
||||
HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
|
||||
HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
|
||||
char* string100);
|
||||
|
||||
HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
|
||||
const void* array_void, size_t N,
|
||||
size_t lane_u = 0, size_t max_lanes = 7);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename T>
|
||||
HWY_NOINLINE void PrintValue(T value) {
|
||||
char str[100];
|
||||
detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
|
||||
fprintf(stderr, "%s,", str);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_NOINLINE void PrintArray(const T* value, size_t count) {
|
||||
detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
|
||||
count);
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_PRINT_H_
|
148
deps/highway/include/hwy/robust_statistics.h
vendored
Normal file
148
deps/highway/include/hwy/robust_statistics.h
vendored
Normal file
|
@ -0,0 +1,148 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_ROBUST_STATISTICS_H_
|
||||
#define HIGHWAY_HWY_ROBUST_STATISTICS_H_
|
||||
|
||||
#include <algorithm> // std::sort, std::find_if
|
||||
#include <limits>
|
||||
#include <utility> // std::pair
|
||||
#include <vector>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
namespace robust_statistics {
|
||||
|
||||
// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
|
||||
// than std::sort for input distributions with very few unique values.
|
||||
template <class T>
|
||||
void CountingSort(T* values, size_t num_values) {
|
||||
// Unique values and their frequency (similar to flat_map).
|
||||
using Unique = std::pair<T, int>;
|
||||
std::vector<Unique> unique;
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
const T value = values[i];
|
||||
const auto pos =
|
||||
std::find_if(unique.begin(), unique.end(),
|
||||
[value](const Unique u) { return u.first == value; });
|
||||
if (pos == unique.end()) {
|
||||
unique.push_back(std::make_pair(value, 1));
|
||||
} else {
|
||||
++pos->second;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort in ascending order of value (pair.first).
|
||||
std::sort(unique.begin(), unique.end());
|
||||
|
||||
// Write that many copies of each unique value to the array.
|
||||
T* HWY_RESTRICT p = values;
|
||||
for (const auto& value_count : unique) {
|
||||
std::fill(p, p + value_count.second, value_count.first);
|
||||
p += value_count.second;
|
||||
}
|
||||
HWY_ASSERT(p == values + num_values);
|
||||
}
|
||||
|
||||
// @return i in [idx_begin, idx_begin + half_count) that minimizes
|
||||
// sorted[i + half_count] - sorted[i].
|
||||
template <typename T>
|
||||
size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
|
||||
const size_t half_count) {
|
||||
T min_range = std::numeric_limits<T>::max();
|
||||
size_t min_idx = 0;
|
||||
|
||||
for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
|
||||
HWY_ASSERT(sorted[idx] <= sorted[idx + half_count]);
|
||||
const T range = sorted[idx + half_count] - sorted[idx];
|
||||
if (range < min_range) {
|
||||
min_range = range;
|
||||
min_idx = idx;
|
||||
}
|
||||
}
|
||||
|
||||
return min_idx;
|
||||
}
|
||||
|
||||
// Returns an estimate of the mode by calling MinRange on successively
|
||||
// halved intervals. "sorted" must be in ascending order. This is the
|
||||
// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
|
||||
// estimator of the mode", with complexity O(N log N). The mode is less
|
||||
// affected by outliers in highly-skewed distributions than the median.
|
||||
// The averaging operation below assumes "T" is an unsigned integer type.
|
||||
template <typename T>
|
||||
T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
|
||||
size_t idx_begin = 0;
|
||||
size_t half_count = num_values / 2;
|
||||
while (half_count > 1) {
|
||||
idx_begin = MinRange(sorted, idx_begin, half_count);
|
||||
half_count >>= 1;
|
||||
}
|
||||
|
||||
const T x = sorted[idx_begin + 0];
|
||||
if (half_count == 0) {
|
||||
return x;
|
||||
}
|
||||
HWY_ASSERT(half_count == 1);
|
||||
const T average = (x + sorted[idx_begin + 1] + 1) / 2;
|
||||
return average;
|
||||
}
|
||||
|
||||
// Returns the mode estimate of `values[0, num_values)` as computed by
// ModeOfSorted. Side effect: "values" is sorted in ascending order.
template <typename T>
T Mode(T* values, const size_t num_values) {
  // ModeOfSorted requires ascending input.
  CountingSort(values, num_values);
  const T mode = ModeOfSorted(values, num_values);
  return mode;
}
|
||||
|
||||
// Overload for built-in arrays: deduces the length N and forwards to the
// pointer overload. Side effect: sorts "values".
template <typename T, size_t N>
T Mode(T (&values)[N]) {
  T* const first = &values[0];
  return Mode(first, N);
}
|
||||
|
||||
// Returns the median value. Side effect: sorts "values".
|
||||
template <typename T>
|
||||
T Median(T* values, const size_t num_values) {
|
||||
HWY_ASSERT(num_values != 0);
|
||||
std::sort(values, values + num_values);
|
||||
const size_t half = num_values / 2;
|
||||
// Odd count: return middle
|
||||
if (num_values % 2) {
|
||||
return values[half];
|
||||
}
|
||||
// Even count: return average of middle two.
|
||||
return (values[half] + values[half - 1] + 1) / 2;
|
||||
}
|
||||
|
||||
// Returns a robust measure of variability.
|
||||
template <typename T>
|
||||
T MedianAbsoluteDeviation(const T* values, const size_t num_values,
|
||||
const T median) {
|
||||
HWY_ASSERT(num_values != 0);
|
||||
std::vector<T> abs_deviations;
|
||||
abs_deviations.reserve(num_values);
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
|
||||
static_cast<int64_t>(median));
|
||||
abs_deviations.push_back(static_cast<T>(abs));
|
||||
}
|
||||
return Median(abs_deviations.data(), num_values);
|
||||
}
|
||||
|
||||
} // namespace robust_statistics
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_ROBUST_STATISTICS_H_
|
338
deps/highway/include/hwy/targets.h
vendored
Normal file
338
deps/highway/include/hwy/targets.h
vendored
Normal file
|
@ -0,0 +1,338 @@
|
|||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_TARGETS_H_
|
||||
#define HIGHWAY_HWY_TARGETS_H_
|
||||
|
||||
// Allows opting out of C++ standard library usage, which is not available in
|
||||
// some Compiler Explorer environments.
|
||||
#ifndef HWY_NO_LIBCXX
|
||||
#include <vector>
|
||||
#endif
|
||||
|
||||
// For SIMD module implementations and their callers. Defines which targets to
|
||||
// generate and call.
|
||||
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/detect_targets.h"
|
||||
#include "hwy/highway_export.h"
|
||||
|
||||
#if !HWY_ARCH_RVV && !defined(HWY_NO_LIBCXX)
|
||||
#include <atomic>
|
||||
#endif
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Returns bitfield of enabled targets that are supported on this CPU; there is
|
||||
// always at least one such target, hence the return value is never 0. The
|
||||
// targets returned may change after calling DisableTargets. This function is
|
||||
// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
|
||||
// calls to it if there is only a single target enabled.
|
||||
HWY_DLLEXPORT int64_t SupportedTargets();
|
||||
|
||||
// Evaluates to a function call, or literal if there is a single target.
|
||||
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
|
||||
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
|
||||
#else
|
||||
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
|
||||
#endif
|
||||
|
||||
// Subsequent SupportedTargets will not return targets whose bit(s) are set in
|
||||
// `disabled_targets`. Exception: if SupportedTargets would return 0, it will
|
||||
// instead return HWY_STATIC_TARGET (there must always be one target to call).
|
||||
//
|
||||
// This function is useful for disabling targets known to be buggy, or if the
|
||||
// best available target is undesirable (perhaps due to throttling or memory
|
||||
// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
|
||||
// function for iteratively enabling specific targets for testing.
|
||||
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
|
||||
|
||||
// Subsequent SupportedTargets will return the given set of targets, except
|
||||
// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
|
||||
// and return to the normal SupportedTargets behavior. Used to run tests for
|
||||
// all targets.
|
||||
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
|
||||
|
||||
#ifndef HWY_NO_LIBCXX
|
||||
|
||||
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
|
||||
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
|
||||
// is affected by the current SetSupportedTargetsForTest() mock if any.
|
||||
HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
  std::vector<int64_t> result;
  // Bits that are set both in SupportedTargets() and in the compiled-in
  // HWY_TARGETS mask.
  int64_t remaining = SupportedTargets() & HWY_TARGETS;
  while (remaining != 0) {
    // `remaining - 1` flips the lowest set bit and all bits below it, so
    // masking with its complement isolates exactly that lowest set bit.
    const int64_t lowest_bit = remaining & ~(remaining - 1);
    result.push_back(lowest_bit);
    // Clear the bit that was just emitted; the loop ends when none are left.
    remaining &= remaining - 1;
  }
  return result;
}
|
||||
|
||||
#endif // HWY_NO_LIBCXX
|
||||
|
||||
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
  // Fallback for any value not listed below (including targets of other
  // architectures); must satisfy gtest IsValidParamName().
  const char* name = "Unknown";

  // Only the targets of the architecture being compiled for are listed, plus
  // the EMU128/SCALAR entries which are listed unconditionally.
  switch (target) {
#if HWY_ARCH_X86
    case HWY_SSE2:
      name = "SSE2";
      break;
    case HWY_SSSE3:
      name = "SSSE3";
      break;
    case HWY_SSE4:
      name = "SSE4";
      break;
    case HWY_AVX2:
      name = "AVX2";
      break;
    case HWY_AVX3:
      name = "AVX3";
      break;
    case HWY_AVX3_DL:
      name = "AVX3_DL";
      break;
    case HWY_AVX3_ZEN4:
      name = "AVX3_ZEN4";
      break;
    case HWY_AVX3_SPR:
      name = "AVX3_SPR";
      break;
#endif  // HWY_ARCH_X86

#if HWY_ARCH_ARM
    case HWY_SVE2_128:
      name = "SVE2_128";
      break;
    case HWY_SVE_256:
      name = "SVE_256";
      break;
    case HWY_SVE2:
      name = "SVE2";
      break;
    case HWY_SVE:
      name = "SVE";
      break;
    case HWY_NEON:
      name = "NEON";
      break;
    case HWY_NEON_WITHOUT_AES:
      name = "NEON_WITHOUT_AES";
      break;
#endif  // HWY_ARCH_ARM

#if HWY_ARCH_PPC
    case HWY_PPC8:
      name = "PPC8";
      break;
    case HWY_PPC9:
      name = "PPC9";
      break;
    case HWY_PPC10:
      name = "PPC10";
      break;
#endif  // HWY_ARCH_PPC

#if HWY_ARCH_WASM
    case HWY_WASM:
      name = "WASM";
      break;
    case HWY_WASM_EMU256:
      name = "WASM_EMU256";
      break;
#endif  // HWY_ARCH_WASM

#if HWY_ARCH_RVV
    case HWY_RVV:
      name = "RVV";
      break;
#endif  // HWY_ARCH_RVV

    case HWY_EMU128:
      name = "EMU128";
      break;
    case HWY_SCALAR:
      name = "SCALAR";
      break;

    default:
      break;  // keep "Unknown"
  }

  return name;
}
|
||||
|
||||
// The maximum number of dynamic targets on any architecture is defined by
|
||||
// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
|
||||
|
||||
// For the ChosenTarget mask and index we use a different bit arrangement than
|
||||
// in the HWY_TARGETS mask. Only the targets involved in the current
|
||||
// architecture are used in this mask, and therefore only the least significant
|
||||
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
|
||||
// significant bit is set when the mask is not initialized, the next
|
||||
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
|
||||
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
|
||||
// that position and the next more significant bit is used for HWY_SCALAR (if
|
||||
// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
|
||||
// define equivalent values for HWY_TARGETS in this representation.
|
||||
// This mask representation allows using ctz() on this mask to obtain a small
|
||||
// number that's used as an index of the table for dynamic dispatch. In this
|
||||
// way the first entry is used when the mask is uninitialized, the following
|
||||
// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
|
||||
// scalar.
|
||||
|
||||
// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
|
||||
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
|
||||
|
||||
// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
|
||||
// current architecture.
|
||||
#define HWY_CHOSEN_TARGET_SHIFT(X) \
|
||||
((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
|
||||
((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
|
||||
<< 1)
|
||||
|
||||
// The HWY_TARGETS mask in the ChosenTarget mask format.
|
||||
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
|
||||
(HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
// Maximum number of dynamic targets, changing this value is an ABI incompatible
|
||||
// change
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 15
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
|
||||
// These must match the order in which the HWY_TARGETS are defined
|
||||
// starting with the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
|
||||
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
|
||||
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
|
||||
// corresponds to the best target. Don't include a "," at the end of the list.
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_AVX3_SPR(func_name), /* AVX3_SPR */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_AVX3_ZEN4(func_name), /* AVX3_ZEN4 */ \
|
||||
HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
|
||||
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
|
||||
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
|
||||
nullptr, /* AVX */ \
|
||||
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
|
||||
HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
|
||||
nullptr, /* reserved - SSE3? */ \
|
||||
HWY_CHOOSE_SSE2(func_name) /* SSE2 */
|
||||
|
||||
#elif HWY_ARCH_ARM
|
||||
// See HWY_ARCH_X86 above for details.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 15
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
|
||||
HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
|
||||
HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
|
||||
HWY_CHOOSE_SVE(func_name), /* SVE */ \
|
||||
HWY_CHOOSE_NEON(func_name), /* NEON */ \
|
||||
HWY_CHOOSE_NEON_WITHOUT_AES(func_name) /* NEON without AES */
|
||||
|
||||
#elif HWY_ARCH_RVV
|
||||
// See HWY_ARCH_X86 above for details.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 9
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_RVV(func_name), /* RVV */ \
|
||||
nullptr /* reserved */
|
||||
|
||||
#elif HWY_ARCH_PPC
|
||||
// See HWY_ARCH_X86 above for details.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 9
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_PPC10(func_name), /* PPC10 */ \
|
||||
HWY_CHOOSE_PPC9(func_name), /* PPC9 */ \
|
||||
HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
|
||||
nullptr, /* reserved (VSX or AltiVec) */ \
|
||||
nullptr /* reserved (VSX or AltiVec) */
|
||||
|
||||
#elif HWY_ARCH_WASM
|
||||
// See HWY_ARCH_X86 above for details.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 9
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
|
||||
HWY_CHOOSE_WASM(func_name), /* WASM */ \
|
||||
nullptr /* reserved */
|
||||
|
||||
#else
|
||||
// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
|
||||
// still creating single-entry tables in HWY_EXPORT to ensure portability.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 1
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
|
||||
#endif
|
||||
|
||||
// Bitfield of supported and enabled targets. The format differs from that of
|
||||
// HWY_TARGETS; the lowest bit governs the first function pointer (which is
|
||||
// special in that it calls FunctionCache, then Update, then dispatches to the
|
||||
// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
|
||||
// GetChosenTarget), thread-safe except on RVV or if HWY_NO_LIBCXX is defined.
|
||||
struct ChosenTarget {
|
||||
public:
|
||||
// Reset bits according to `targets` (typically the return value of
|
||||
// SupportedTargets()). Postcondition: IsInitialized() == true.
|
||||
void Update(int64_t targets) {
|
||||
// These are `targets` shifted downwards, see above. Also include SCALAR
|
||||
// (corresponds to the last entry in the function table) as fallback.
|
||||
StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
|
||||
}
|
||||
|
||||
// Reset to the uninitialized state, so that FunctionCache will call Update
|
||||
// during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
|
||||
void DeInit() { StoreMask(1); }
|
||||
|
||||
// Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
|
||||
// function was called, which we check in tests.
|
||||
bool IsInitialized() const { return LoadMask() != 1; }
|
||||
|
||||
// Return the index in the dynamic dispatch table to be used by the current
|
||||
// CPU. Note that this method must be in the header file so it uses the value
|
||||
// of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
|
||||
// calls it, which may be different from others. This means we only enable
|
||||
// those targets that were actually compiled in this module.
|
||||
size_t HWY_INLINE GetIndex() const {
|
||||
return hwy::Num0BitsBelowLS1Bit_Nonzero64(
|
||||
static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
|
||||
}
|
||||
|
||||
private:
|
||||
// TODO(janwas): remove RVV once <atomic> is available
|
||||
#if HWY_ARCH_RVV || defined(HWY_NO_LIBCXX)
|
||||
int64_t LoadMask() const { return mask_; }
|
||||
void StoreMask(int64_t mask) { mask_ = mask; }
|
||||
|
||||
int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0.
|
||||
#else
|
||||
int64_t LoadMask() const { return mask_.load(); }
|
||||
void StoreMask(int64_t mask) { mask_.store(mask); }
|
||||
|
||||
std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0.
|
||||
#endif // HWY_ARCH_RVV
|
||||
};
|
||||
|
||||
// For internal use (e.g. by FunctionCache and DisableTargets).
|
||||
HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_TARGETS_H_
|
200
deps/highway/include/hwy/timer-inl.h
vendored
Normal file
200
deps/highway/include/hwy/timer-inl.h
vendored
Normal file
|
@ -0,0 +1,200 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// High-resolution and high-precision timer
|
||||
|
||||
// Per-target include guard
|
||||
#if defined(HIGHWAY_HWY_TIMER_INL_H_) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_TIMER_INL_H_
|
||||
#undef HIGHWAY_HWY_TIMER_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_TIMER_INL_H_
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/timer.h"
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif // NOMINMAX
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <mach/mach.h>
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
|
||||
#if defined(__HAIKU__)
|
||||
#include <OS.h>
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
|
||||
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include <time.h> // clock_gettime
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace timer {
|
||||
|
||||
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
|
||||
// unsigned to guarantee wraparound on overflow.
|
||||
using Ticks = uint64_t;
|
||||
|
||||
// Start/Stop return absolute timestamps and must be placed immediately before
|
||||
// and after the region to measure. We provide separate Start/Stop functions
|
||||
// because they use different fences.
|
||||
//
|
||||
// Background: RDTSC is not 'serializing'; earlier instructions may complete
|
||||
// after it, and/or later instructions may complete before it. 'Fences' ensure
|
||||
// regions' elapsed times are independent of such reordering. The only
|
||||
// documented unprivileged serializing instruction is CPUID, which acts as a
|
||||
// full fence (no reordering across it in either direction). Unfortunately
|
||||
// the latency of CPUID varies wildly (perhaps made worse by not initializing
|
||||
// its EAX input). Because it cannot reliably be deducted from the region's
|
||||
// elapsed time, it must not be included in the region to measure (i.e.
|
||||
// between the two RDTSC).
|
||||
//
|
||||
// The newer RDTSCP is sometimes described as serializing, but it actually
|
||||
// only serves as a half-fence with release semantics. Although all
|
||||
// instructions in the region will complete before the final timestamp is
|
||||
// captured, subsequent instructions may leak into the region and increase the
|
||||
// elapsed time. Inserting another fence after the final RDTSCP would prevent
|
||||
// such reordering without affecting the measured region.
|
||||
//
|
||||
// Fortunately, such a fence exists. The LFENCE instruction is only documented
|
||||
// to delay later loads until earlier loads are visible. However, Intel's
|
||||
// reference manual says it acts as a full fence (waiting until all earlier
|
||||
// instructions have completed, and delaying later instructions until it
|
||||
// completes). AMD assigns the same behavior to MFENCE.
|
||||
//
|
||||
// We need a fence before the initial RDTSC to prevent earlier instructions
|
||||
// from leaking into the region, and arguably another after RDTSC to avoid
|
||||
// region instructions from completing before the timestamp is recorded.
|
||||
// When surrounded by fences, the additional RDTSCP half-fence provides no
|
||||
// benefit, so the initial timestamp can be recorded via RDTSC, which has
|
||||
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
|
||||
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
|
||||
//
|
||||
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
|
||||
// However, Stop+Stop includes an LFENCE in the region measurements, which
|
||||
// adds a delay dependent on earlier loads. The combination of Start+Stop
|
||||
// is faster than Start+Start and more consistent than Stop+Stop because
|
||||
// the first LFENCE already delayed subsequent loads before the measured
|
||||
// region. This combination seems not to have been considered in prior work:
|
||||
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
|
||||
//
|
||||
// Note: performance counters can measure 'exact' instructions-retired or
|
||||
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
|
||||
// requires fences. Unfortunately, it is not accessible on all OSes and we
|
||||
// prefer to avoid kernel-mode drivers. Performance counters are also affected
|
||||
// by several under/over-count errata, so we use the TSC instead.
|
||||
|
||||
// Returns a 64-bit timestamp in units of 'ticks'; to convert to seconds,
|
||||
// divide by InvariantTicksPerSecond.
|
||||
inline Ticks Start() {
  Ticks ticks;
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
  // Reads special-purpose register 268 into `ticks`.
  asm volatile("mfspr %0, %1" : "=r"(ticks) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
  asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  // LFENCE / RDTSC / LFENCE, each separated by a compiler barrier so that the
  // compiler does not move memory accesses across the fences.
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
  ticks = __rdtsc();
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // LFENCE / RDTSC / LFENCE. RDTSC returns the low half in rax (operand 0) and
  // the high half in rdx; the two are merged into operand 0.
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(ticks)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
  asm volatile("rdtime %0" : "=r"(ticks));
#elif defined(_WIN32) || defined(_WIN64)
  LARGE_INTEGER counter;
  (void)QueryPerformanceCounter(&counter);
  // Explicit form of the signed -> unsigned conversion, as in the POSIX path.
  ticks = static_cast<Ticks>(counter.QuadPart);
#elif defined(__APPLE__)
  ticks = mach_absolute_time();
#elif defined(__HAIKU__)
  ticks = system_time_nsecs();  // since boot
#else  // POSIX
  timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  // Combine seconds and nanoseconds into a single nanosecond count.
  const long long nanoseconds = ts.tv_sec * 1000000000LL + ts.tv_nsec;
  ticks = static_cast<Ticks>(nanoseconds);
#endif
  return ticks;
}
|
||||
|
||||
// WARNING: on x86, caller must check HaveTimerStop before using this!
|
||||
inline Ticks Stop() {
  Ticks ticks;
#if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__)
  // Reads special-purpose register 268 into `ticks`, same as Start().
  asm volatile("mfspr %0, %1" : "=r"(ticks) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
  asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  // RDTSCP / LFENCE, separated by compiler barriers. The TSC_AUX value
  // written to `aux` is not used.
  _ReadWriteBarrier();
  unsigned aux;
  ticks = __rdtscp(&aux);
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(ticks)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
#else
  // No dedicated end-of-region sequence on this platform: reuse Start().
  ticks = Start();
#endif
  return ticks;
}
|
||||
|
||||
} // namespace timer
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // per-target include guard
|
55
deps/highway/include/hwy/timer.h
vendored
Normal file
55
deps/highway/include/hwy/timer.h
vendored
Normal file
|
@ -0,0 +1,55 @@
|
|||
// Copyright 2023 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_TIMER_H_
|
||||
#define HIGHWAY_HWY_TIMER_H_
|
||||
|
||||
// Platform-specific timer functions. Provides Now() and functions for
|
||||
// interpreting and converting the timer-inl.h Ticks.
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hwy/highway_export.h"
|
||||
|
||||
namespace hwy {
|
||||
namespace platform {
|
||||
|
||||
// Returns current timestamp [in seconds] relative to an unspecified origin.
|
||||
// Features: monotonic (no negative elapsed time), steady (unaffected by system
|
||||
// time changes), high-resolution (on the order of microseconds).
|
||||
// Uses InvariantTicksPerSecond and the baseline version of timer::Start().
|
||||
HWY_DLLEXPORT double Now();
|
||||
|
||||
// Functions for use with timer-inl.h:
|
||||
|
||||
// Returns whether it is safe to call timer::Stop without executing an illegal
|
||||
// instruction; if false, fills cpu100 (a pointer to a 100 character buffer)
|
||||
// with the CPU brand string or an empty string if unknown.
|
||||
HWY_DLLEXPORT bool HaveTimerStop(char* cpu100);
|
||||
|
||||
// Returns tick rate, useful for converting timer::Ticks to seconds. Invariant
|
||||
// means the tick counter frequency is independent of CPU throttling or sleep.
|
||||
// This call may be expensive, callers should cache the result.
|
||||
HWY_DLLEXPORT double InvariantTicksPerSecond();
|
||||
|
||||
// Returns ticks elapsed in back to back timer calls, i.e. a function of the
|
||||
// timer resolution (minimum measurable difference) and overhead.
|
||||
// This call is expensive, callers should cache the result.
|
||||
HWY_DLLEXPORT uint64_t TimerResolution();
|
||||
|
||||
} // namespace platform
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_TIMER_H_
|
19
deps/highway/lib/cmake/hwy/hwy-config-release.cmake
vendored
Normal file
19
deps/highway/lib/cmake/hwy/hwy-config-release.cmake
vendored
Normal file
|
@ -0,0 +1,19 @@
|
|||
#----------------------------------------------------------------
|
||||
# Generated CMake target import file for configuration "Release".
|
||||
#----------------------------------------------------------------
|
||||
|
||||
# Commands may need to know the format version.
|
||||
set(CMAKE_IMPORT_FILE_VERSION 1)
|
||||
|
||||
# Import target "hwy::hwy" for configuration "Release"
|
||||
set_property(TARGET hwy::hwy APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
|
||||
set_target_properties(hwy::hwy PROPERTIES
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX"
|
||||
IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libhwy.a"
|
||||
)
|
||||
|
||||
list(APPEND _cmake_import_check_targets hwy::hwy )
|
||||
list(APPEND _cmake_import_check_files_for_hwy::hwy "${_IMPORT_PREFIX}/lib/libhwy.a" )
|
||||
|
||||
# Commands beyond this point should not need to know the version.
|
||||
set(CMAKE_IMPORT_FILE_VERSION)
|
70
deps/highway/lib/cmake/hwy/hwy-config-version.cmake
vendored
Normal file
70
deps/highway/lib/cmake/hwy/hwy-config-version.cmake
vendored
Normal file
|
@ -0,0 +1,70 @@
|
|||
# This is a basic version file for the Config-mode of find_package().
|
||||
# It is used by write_basic_package_version_file() as input file for configure_file()
|
||||
# to create a version-file which can be installed along a config.cmake file.
|
||||
#
|
||||
# The created file sets PACKAGE_VERSION_EXACT if the current version string and
|
||||
# the requested version string are exactly the same and it sets
|
||||
# PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version,
|
||||
# but only if the requested major version is the same as the current one.
|
||||
# The variable CVF_VERSION must be set before calling configure_file().
|
||||
|
||||
|
||||
set(PACKAGE_VERSION "1.0.6")
|
||||
|
||||
if(PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
else()
|
||||
|
||||
if("1.0.6" MATCHES "^([0-9]+)\\.")
|
||||
set(CVF_VERSION_MAJOR "${CMAKE_MATCH_1}")
|
||||
if(NOT CVF_VERSION_MAJOR VERSION_EQUAL 0)
|
||||
string(REGEX REPLACE "^0+" "" CVF_VERSION_MAJOR "${CVF_VERSION_MAJOR}")
|
||||
endif()
|
||||
else()
|
||||
set(CVF_VERSION_MAJOR "1.0.6")
|
||||
endif()
|
||||
|
||||
if(PACKAGE_FIND_VERSION_RANGE)
|
||||
# both endpoints of the range must have the expected major version
|
||||
math (EXPR CVF_VERSION_MAJOR_NEXT "${CVF_VERSION_MAJOR} + 1")
|
||||
if (NOT PACKAGE_FIND_VERSION_MIN_MAJOR STREQUAL CVF_VERSION_MAJOR
|
||||
OR ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND NOT PACKAGE_FIND_VERSION_MAX_MAJOR STREQUAL CVF_VERSION_MAJOR)
|
||||
OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND NOT PACKAGE_FIND_VERSION_MAX VERSION_LESS_EQUAL CVF_VERSION_MAJOR_NEXT)))
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
elseif(PACKAGE_FIND_VERSION_MIN_MAJOR STREQUAL CVF_VERSION_MAJOR
|
||||
AND ((PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE" AND PACKAGE_VERSION VERSION_LESS_EQUAL PACKAGE_FIND_VERSION_MAX)
|
||||
OR (PACKAGE_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE" AND PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION_MAX)))
|
||||
set(PACKAGE_VERSION_COMPATIBLE TRUE)
|
||||
else()
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
endif()
|
||||
else()
|
||||
if(PACKAGE_FIND_VERSION_MAJOR STREQUAL CVF_VERSION_MAJOR)
|
||||
set(PACKAGE_VERSION_COMPATIBLE TRUE)
|
||||
else()
|
||||
set(PACKAGE_VERSION_COMPATIBLE FALSE)
|
||||
endif()
|
||||
|
||||
if(PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
|
||||
set(PACKAGE_VERSION_EXACT TRUE)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
# if the installed project requested no architecture check, don't perform the check
|
||||
if("FALSE")
|
||||
return()
|
||||
endif()
|
||||
|
||||
# if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it:
|
||||
if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "8" STREQUAL "")
|
||||
return()
|
||||
endif()
|
||||
|
||||
# check that the installed version has the same 32/64bit-ness as the one which is currently searching:
|
||||
if(NOT CMAKE_SIZEOF_VOID_P STREQUAL "8")
|
||||
math(EXPR installedBits "8 * 8")
|
||||
set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)")
|
||||
set(PACKAGE_VERSION_UNSUITABLE TRUE)
|
||||
endif()
|
104
deps/highway/lib/cmake/hwy/hwy-config.cmake
vendored
Normal file
104
deps/highway/lib/cmake/hwy/hwy-config.cmake
vendored
Normal file
|
@ -0,0 +1,104 @@
|
|||
# Generated by CMake
|
||||
|
||||
if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.8)
|
||||
message(FATAL_ERROR "CMake >= 2.8.0 required")
|
||||
endif()
|
||||
if(CMAKE_VERSION VERSION_LESS "2.8.3")
|
||||
message(FATAL_ERROR "CMake >= 2.8.3 required")
|
||||
endif()
|
||||
cmake_policy(PUSH)
|
||||
cmake_policy(VERSION 2.8.3...3.23)
|
||||
#----------------------------------------------------------------
|
||||
# Generated CMake target import file.
|
||||
#----------------------------------------------------------------
|
||||
|
||||
# Commands may need to know the format version.
|
||||
set(CMAKE_IMPORT_FILE_VERSION 1)
|
||||
|
||||
# Protect against multiple inclusion, which would fail when already imported targets are added once more.
|
||||
set(_cmake_targets_defined "")
|
||||
set(_cmake_targets_not_defined "")
|
||||
set(_cmake_expected_targets "")
|
||||
foreach(_cmake_expected_target IN ITEMS hwy::hwy)
|
||||
list(APPEND _cmake_expected_targets "${_cmake_expected_target}")
|
||||
if(TARGET "${_cmake_expected_target}")
|
||||
list(APPEND _cmake_targets_defined "${_cmake_expected_target}")
|
||||
else()
|
||||
list(APPEND _cmake_targets_not_defined "${_cmake_expected_target}")
|
||||
endif()
|
||||
endforeach()
|
||||
unset(_cmake_expected_target)
|
||||
if(_cmake_targets_defined STREQUAL _cmake_expected_targets)
|
||||
unset(_cmake_targets_defined)
|
||||
unset(_cmake_targets_not_defined)
|
||||
unset(_cmake_expected_targets)
|
||||
unset(CMAKE_IMPORT_FILE_VERSION)
|
||||
cmake_policy(POP)
|
||||
return()
|
||||
endif()
|
||||
if(NOT _cmake_targets_defined STREQUAL "")
|
||||
string(REPLACE ";" ", " _cmake_targets_defined_text "${_cmake_targets_defined}")
|
||||
string(REPLACE ";" ", " _cmake_targets_not_defined_text "${_cmake_targets_not_defined}")
|
||||
message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_cmake_targets_defined_text}\nTargets not yet defined: ${_cmake_targets_not_defined_text}\n")
|
||||
endif()
|
||||
unset(_cmake_targets_defined)
|
||||
unset(_cmake_targets_not_defined)
|
||||
unset(_cmake_expected_targets)
|
||||
|
||||
|
||||
# Compute the installation prefix relative to this file.
|
||||
get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
|
||||
get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
|
||||
get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
|
||||
get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
|
||||
if(_IMPORT_PREFIX STREQUAL "/")
|
||||
set(_IMPORT_PREFIX "")
|
||||
endif()
|
||||
|
||||
# Create imported target hwy::hwy
|
||||
add_library(hwy::hwy STATIC IMPORTED)
|
||||
|
||||
set_target_properties(hwy::hwy PROPERTIES
|
||||
INTERFACE_COMPILE_DEFINITIONS "TOOLCHAIN_MISS_SYS_AUXV_H;TOOLCHAIN_MISS_ASM_HWCAP_H;HWY_STATIC_DEFINE"
|
||||
INTERFACE_COMPILE_FEATURES "cxx_std_11"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
|
||||
)
|
||||
|
||||
# Load information for each installed configuration.
|
||||
file(GLOB _cmake_config_files "${CMAKE_CURRENT_LIST_DIR}/hwy-config-*.cmake")
|
||||
foreach(_cmake_config_file IN LISTS _cmake_config_files)
|
||||
include("${_cmake_config_file}")
|
||||
endforeach()
|
||||
unset(_cmake_config_file)
|
||||
unset(_cmake_config_files)
|
||||
|
||||
# Cleanup temporary variables.
|
||||
set(_IMPORT_PREFIX)
|
||||
|
||||
# Loop over all imported files and verify that they actually exist
|
||||
foreach(_cmake_target IN LISTS _cmake_import_check_targets)
|
||||
foreach(_cmake_file IN LISTS "_cmake_import_check_files_for_${_cmake_target}")
|
||||
if(NOT EXISTS "${_cmake_file}")
|
||||
message(FATAL_ERROR "The imported target \"${_cmake_target}\" references the file
|
||||
\"${_cmake_file}\"
|
||||
but this file does not exist. Possible reasons include:
|
||||
* The file was deleted, renamed, or moved to another location.
|
||||
* An install or uninstall procedure did not complete successfully.
|
||||
* The installation package was faulty and contained
|
||||
\"${CMAKE_CURRENT_LIST_FILE}\"
|
||||
but not all the files it references.
|
||||
")
|
||||
endif()
|
||||
endforeach()
|
||||
unset(_cmake_file)
|
||||
unset("_cmake_import_check_files_for_${_cmake_target}")
|
||||
endforeach()
|
||||
unset(_cmake_target)
|
||||
unset(_cmake_import_check_targets)
|
||||
|
||||
# This file does not depend on other imported targets which have
|
||||
# been exported from the same project but in a separate export set.
|
||||
|
||||
# Commands beyond this point should not need to know the version.
|
||||
set(CMAKE_IMPORT_FILE_VERSION)
|
||||
cmake_policy(POP)
|
BIN
deps/highway/lib/libhwy.a
vendored
Normal file
BIN
deps/highway/lib/libhwy.a
vendored
Normal file
Binary file not shown.
10
deps/highway/lib/pkgconfig/libhwy.pc
vendored
Normal file
10
deps/highway/lib/pkgconfig/libhwy.pc
vendored
Normal file
|
@ -0,0 +1,10 @@
|
|||
prefix=
|
||||
exec_prefix=${prefix}
|
||||
libdir=${exec_prefix}/lib
|
||||
includedir=${prefix}/include
|
||||
|
||||
Name: libhwy
|
||||
Description: Efficient and performance-portable SIMD wrapper
|
||||
Version: 1.0.6
|
||||
Libs: -L${libdir} -lhwy
|
||||
Cflags: -I${includedir} -DHWY_STATIC_DEFINE
|
Loading…
Reference in a new issue