[simd] Improve portability for aarch64

No need for compiler flags for 64-bit arm, and the few intrinsics that
are actually needed are in a different header.
This commit is contained in:
Bill Currie 2023-03-25 16:29:46 +09:00
parent 98fb9e0942
commit 56344e478d
10 changed files with 80 additions and 80 deletions

View file

@ -98,37 +98,41 @@ AC_ARG_ENABLE(optimize,
optimize=yes optimize=yes
) )
AC_ARG_ENABLE(simd, if test "x$host_cpu" = xaarch64; then
AS_HELP_STRING([--enable-simd@<:@=arg@:.@], simd=neon
[enable SIMD support (default auto)]), else
[], AC_ARG_ENABLE(simd,
[enable_simd=yes] AS_HELP_STRING([--enable-simd@<:@=arg@:.@],
) [enable SIMD support (default auto)]),
[],
[enable_simd=yes]
)
case "$enable_simd" in case "$enable_simd" in
no) no)
simd=no simd=no
;; ;;
sse|sse2|avx|avx2) sse|sse2|avx|avx2)
QF_CC_OPTION(-m$enable_simd) QF_CC_OPTION(-m$enable_simd)
simd=$enable_simd simd=$enable_simd
;; ;;
yes) yes)
for simd in avx2 avx sse2 sse; do for simd in avx2 avx sse2 sse; do
if lscpu | grep -q -w $simd; then if lscpu | grep -q -w $simd; then
QF_CC_OPTION(-m$simd) QF_CC_OPTION(-m$simd)
break break
fi fi
done done
;; ;;
esac esac
case "$simd" in case "$simd" in
avx*) avx*)
;; ;;
*) *)
QF_CC_OPTION(-Wno-psabi) QF_CC_OPTION(-Wno-psabi)
;; ;;
esac esac
fi
AC_MSG_CHECKING(for optimization) AC_MSG_CHECKING(for optimization)
if test "x$optimize" = xyes -a "x$leave_cflags_alone" != "xyes"; then if test "x$optimize" = xyes -a "x$leave_cflags_alone" != "xyes"; then

View file

@ -28,8 +28,6 @@
#ifndef __QF_simd_mat4f_h #ifndef __QF_simd_mat4f_h
#define __QF_simd_mat4f_h #define __QF_simd_mat4f_h
#include <immintrin.h>
#include "QF/simd/types.h" #include "QF/simd/types.h"
GNU89INLINE inline void maddf (mat4f_t c, const mat4f_t a, const mat4f_t b); GNU89INLINE inline void maddf (mat4f_t c, const mat4f_t a, const mat4f_t b);
@ -172,27 +170,27 @@ mat4fquat (mat4f_t m, vec4f_t q)
#undef m #undef m
{ {
vec4f_t a = xq; vec4f_t a = xq;
vec4f_t b = _mm_xor_ps (shuff103 (yq), (__m128) mpm); vec4f_t b = (vec4f_t) ((vec4i_t) shuff103 (yq) ^ mpm);
vec4f_t c = _mm_xor_ps (shuff230 (zq), (__m128) pmm); vec4f_t c = (vec4f_t) ((vec4i_t) shuff230 (zq) ^ pmm);
vec4f_t d = _mm_xor_ps (shuff321 (wq), (__m128) mmp); vec4f_t d = (vec4f_t) ((vec4i_t) shuff321 (wq) ^ mmp);
// column: ww + xx - yy - zz // 2xy + 2wz // 2zx - 2wy // 0 // column: ww + xx - yy - zz // 2xy + 2wz // 2zx - 2wy // 0
m[0] = _mm_and_ps (a + b - c - d, (__m128) mask); m[0] = (vec4f_t) ((vec4i_t) (a + b - c - d) & mask);
} }
{ {
vec4f_t a = _mm_xor_ps (shuff103 (xq), (__m128) mpm); vec4f_t a = (vec4f_t) ((vec4i_t) shuff103 (xq) ^ mpm);
vec4f_t b = yq; vec4f_t b = yq;
vec4f_t c = _mm_xor_ps (shuff321 (zq), (__m128) mmp); vec4f_t c = (vec4f_t) ((vec4i_t) shuff321 (zq) ^ mmp);
vec4f_t d = _mm_xor_ps (shuff230 (wq), (__m128) pmm); vec4f_t d = (vec4f_t) ((vec4i_t) shuff230 (wq) ^ pmm);
// column: 2xy - 2wz // ww - xx + yy - zz // 2yz + 2wx // 0 // column: 2xy - 2wz // ww - xx + yy - zz // 2yz + 2wx // 0
m[1] = _mm_and_ps (b + c - a - d, (__m128) mask); m[1] = (vec4f_t) ((vec4i_t) (b + c - a - d) & mask);
} }
{ {
vec4f_t a = _mm_xor_ps (shuff230 (xq), (__m128) pmm); vec4f_t a = (vec4f_t) ((vec4i_t) shuff230 (xq) ^ pmm);
vec4f_t b = _mm_xor_ps (shuff321 (yq), (__m128) mmp); vec4f_t b = (vec4f_t) ((vec4i_t) shuff321 (yq) ^ mmp);
vec4f_t c = zq; vec4f_t c = zq;
vec4f_t d = _mm_xor_ps (shuff103 (wq), (__m128) mpm); vec4f_t d = (vec4f_t) ((vec4i_t) shuff103 (wq) ^ mpm);
// column: 2xz + 2wy // 2yz - 2wx // ww - xx - yy + zz // 0 // column: 2xz + 2wy // 2yz - 2wx // ww - xx - yy + zz // 0
m[2] = _mm_and_ps (a - b + c - d, (__m128) mask); m[2] = (vec4f_t) ((vec4i_t) (a - b + c - d) & mask);
} }
m[3] = (vec4f_t) { 0, 0, 0, 1 }; m[3] = (vec4f_t) { 0, 0, 0, 1 };
} }

View file

@ -105,30 +105,4 @@ typedef struct vspheref_s {
float radius; float radius;
} vspheref_t; } vspheref_t;
#include <immintrin.h>
#ifndef __SSE__
#define _mm_xor_ps __qf_mm_xor_ps
#define _mm_and_ps __qf_mm_and_ps
GNU89INLINE inline __m128 _mm_xor_ps (__m128 a, __m128 b);
GNU89INLINE inline __m128 _mm_and_ps (__m128 a, __m128 b);
#ifndef IMPLEMENT_MAT4F_Funcs
GNU89INLINE inline
#else
VISIBLE
#endif
__m128 _mm_xor_ps (__m128 a, __m128 b)
{
return (__m128) ((vec4i_t) a ^ (vec4i_t) b);
}
#ifndef IMPLEMENT_MAT4F_Funcs
GNU89INLINE inline
#else
VISIBLE
#endif
__m128 _mm_and_ps (__m128 a, __m128 b)
{
return (__m128) ((vec4i_t) a & (vec4i_t) b);
}
#endif
#endif//__QF_simd_types_h #endif//__QF_simd_types_h

View file

@ -29,7 +29,11 @@
#ifndef __QF_simd_vec2d_h #ifndef __QF_simd_vec2d_h
#define __QF_simd_vec2d_h #define __QF_simd_vec2d_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h> #include <immintrin.h>
#endif
#include "QF/simd/types.h" #include "QF/simd/types.h"
@ -50,7 +54,11 @@ VISIBLE
vec2d_t vec2d_t
vsqrt2d (vec2d_t v) vsqrt2d (vec2d_t v)
{ {
#ifdef __aarch64__
return vsqrtq_f64 (v);
#else
return _mm_sqrt_pd (v); return _mm_sqrt_pd (v);
#endif
} }
#ifndef IMPLEMENT_VEC2D_Funcs #ifndef IMPLEMENT_VEC2D_Funcs

View file

@ -29,7 +29,11 @@
#ifndef __QF_simd_vec2f_h #ifndef __QF_simd_vec2f_h
#define __QF_simd_vec2f_h #define __QF_simd_vec2f_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h> #include <immintrin.h>
#endif
#include <math.h> #include <math.h>
#include "QF/simd/types.h" #include "QF/simd/types.h"
@ -67,9 +71,13 @@ VISIBLE
vec2f_t vec2f_t
vsqrt2f (vec2f_t v) vsqrt2f (vec2f_t v)
{ {
#ifdef __aarch64__
return vsqrt_f32 (v);
#else
vec4f_t t = { v[0], v[1], 0, 0 }; vec4f_t t = { v[0], v[1], 0, 0 };
t = _mm_sqrt_ps (t); t = _mm_sqrt_ps (t);
return (vec2f_t) { t[0], t[1] }; return (vec2f_t) { t[0], t[1] };
#endif
} }
#ifndef IMPLEMENT_VEC2F_Funcs #ifndef IMPLEMENT_VEC2F_Funcs

View file

@ -28,8 +28,6 @@
#ifndef __QF_simd_vec2i_h #ifndef __QF_simd_vec2i_h
#define __QF_simd_vec2i_h #define __QF_simd_vec2i_h
#include <immintrin.h>
#include <math.h>
#include "QF/simd/types.h" #include "QF/simd/types.h"
@ -59,7 +57,7 @@ VISIBLE
int int
any2i (vec2i_t v) any2i (vec2i_t v)
{ {
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0}); vec2i_t t = v == (vec2i_t) {0, 0};
#ifndef __SSSE3__ #ifndef __SSSE3__
return (t[0] + t[1]) > -2; return (t[0] + t[1]) > -2;
#else #else
@ -75,7 +73,7 @@ VISIBLE
int int
all2i (vec2i_t v) all2i (vec2i_t v)
{ {
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0}); vec2i_t t = v == (vec2i_t) {0, 0};
#ifndef __SSSE3__ #ifndef __SSSE3__
return (t[0] + t[1]) == 0; return (t[0] + t[1]) == 0;
#else #else
@ -91,7 +89,7 @@ VISIBLE
int int
none2i (vec2i_t v) none2i (vec2i_t v)
{ {
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0}); vec2i_t t = v == (vec2i_t) {0, 0};
#ifndef __SSSE3__ #ifndef __SSSE3__
return (t[0] + t[1]) == -2; return (t[0] + t[1]) == -2;
#else #else

View file

@ -28,7 +28,11 @@
#ifndef __QF_simd_vec4d_h #ifndef __QF_simd_vec4d_h
#define __QF_simd_vec4d_h #define __QF_simd_vec4d_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h> #include <immintrin.h>
#endif
#include "QF/simd/types.h" #include "QF/simd/types.h"
#include "QF/simd/vec2d.h" #include "QF/simd/vec2d.h"

View file

@ -28,7 +28,11 @@
#ifndef __QF_simd_vec4f_h #ifndef __QF_simd_vec4f_h
#define __QF_simd_vec4f_h #define __QF_simd_vec4f_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h> #include <immintrin.h>
#endif
#include <math.h> #include <math.h>
#include "QF/simd/types.h" #include "QF/simd/types.h"
@ -121,12 +125,16 @@ VISIBLE
vec4f_t vec4f_t
vsqrt4f (vec4f_t v) vsqrt4f (vec4f_t v)
{ {
#ifdef __aarch64__
return vsqrtq_f32 (v);
#else
#ifndef __SSE__ #ifndef __SSE__
vec4f_t r = { sqrtf (v[0]), sqrtf (v[1]), sqrtf (v[2]), sqrtf (v[3]) }; vec4f_t r = { sqrtf (v[0]), sqrtf (v[1]), sqrtf (v[2]), sqrtf (v[3]) };
return r; return r;
#else #else
return _mm_sqrt_ps (v); return _mm_sqrt_ps (v);
#endif #endif
#endif
} }
#ifndef IMPLEMENT_VEC4F_Funcs #ifndef IMPLEMENT_VEC4F_Funcs

View file

@ -28,7 +28,6 @@
#ifndef __QF_simd_vec4i_h #ifndef __QF_simd_vec4i_h
#define __QF_simd_vec4i_h #define __QF_simd_vec4i_h
#include <immintrin.h>
#include <math.h> #include <math.h>
#include "QF/simd/types.h" #include "QF/simd/types.h"

View file

@ -242,14 +242,13 @@ split_edge (const vec4f_t *points, const vec4f_t *dists,
// "nan" because 0x7fffffff is nan when viewed as a float // "nan" because 0x7fffffff is nan when viewed as a float
static const vec4i_t onenan = {0x3f800000,0x3f800000,0x3f800000,~0u >> 1}; static const vec4i_t onenan = {0x3f800000,0x3f800000,0x3f800000,~0u >> 1};
static const vec4i_t nan = { ~0u >> 1, ~0u >> 1, ~0u >> 1, ~0u >> 1}; static const vec4i_t nan = { ~0u >> 1, ~0u >> 1, ~0u >> 1, ~0u >> 1};
vec4i_t x = _mm_and_ps (split, (__m128) nan) == (__m128) onenan; vec4i_t x = ((vec4i_t) split & nan) == onenan;
// plane vector has -dist in w // plane vector has -dist in w
vec4f_t y = _mm_and_ps (split, (__m128) x) * -split[3]; vec4f_t y = (vec4f_t) ((vec4i_t) split & x) * -split[3];
#ifdef __SSE3__ #ifdef __SSE3__
mid = _mm_blendv_ps (mid, y, (__m128) x); mid = _mm_blendv_ps (mid, y, (__m128) x);
#else #else
mid = (vec4f_t) ((vec4i_t) _mm_and_ps (y, (__m128) x) | mid = (vec4f_t) (((vec4i_t) y & x) | ((vec4i_t) mid & ~x));
(vec4i_t) _mm_and_ps (mid, (__m128) ~x));
#endif #endif
// if (isnan (mid[0])) *(int *) 0 = 0; // if (isnan (mid[0])) *(int *) 0 = 0;
return mid; return mid;