[simd] Improve portability for aarch64

No need for compiler flags for 64-bit arm, and the few intrinsics that
are actually needed are in a different header.
This commit is contained in:
Bill Currie 2023-03-25 16:29:46 +09:00
parent 98fb9e0942
commit 56344e478d
10 changed files with 80 additions and 80 deletions

View file

@ -98,14 +98,17 @@ AC_ARG_ENABLE(optimize,
optimize=yes
)
AC_ARG_ENABLE(simd,
if test "x$host_cpu" = xaarch64; then
simd=neon
else
AC_ARG_ENABLE(simd,
AS_HELP_STRING([--enable-simd@<:@=arg@:.@],
[enable SIMD support (default auto)]),
[],
[enable_simd=yes]
)
)
case "$enable_simd" in
case "$enable_simd" in
no)
simd=no
;;
@ -121,14 +124,15 @@ case "$enable_simd" in
fi
done
;;
esac
case "$simd" in
esac
case "$simd" in
avx*)
;;
*)
QF_CC_OPTION(-Wno-psabi)
;;
esac
esac
fi
AC_MSG_CHECKING(for optimization)
if test "x$optimize" = xyes -a "x$leave_cflags_alone" != "xyes"; then

View file

@ -28,8 +28,6 @@
#ifndef __QF_simd_mat4f_h
#define __QF_simd_mat4f_h
#include <immintrin.h>
#include "QF/simd/types.h"
GNU89INLINE inline void maddf (mat4f_t c, const mat4f_t a, const mat4f_t b);
@ -172,27 +170,27 @@ mat4fquat (mat4f_t m, vec4f_t q)
#undef m
{
vec4f_t a = xq;
vec4f_t b = _mm_xor_ps (shuff103 (yq), (__m128) mpm);
vec4f_t c = _mm_xor_ps (shuff230 (zq), (__m128) pmm);
vec4f_t d = _mm_xor_ps (shuff321 (wq), (__m128) mmp);
vec4f_t b = (vec4f_t) ((vec4i_t) shuff103 (yq) ^ mpm);
vec4f_t c = (vec4f_t) ((vec4i_t) shuff230 (zq) ^ pmm);
vec4f_t d = (vec4f_t) ((vec4i_t) shuff321 (wq) ^ mmp);
// column: ww + xx - yy - zz // 2xy + 2wz // 2zx - 2wy // 0
m[0] = _mm_and_ps (a + b - c - d, (__m128) mask);
m[0] = (vec4f_t) ((vec4i_t) (a + b - c - d) & mask);
}
{
vec4f_t a = _mm_xor_ps (shuff103 (xq), (__m128) mpm);
vec4f_t a = (vec4f_t) ((vec4i_t) shuff103 (xq) ^ mpm);
vec4f_t b = yq;
vec4f_t c = _mm_xor_ps (shuff321 (zq), (__m128) mmp);
vec4f_t d = _mm_xor_ps (shuff230 (wq), (__m128) pmm);
vec4f_t c = (vec4f_t) ((vec4i_t) shuff321 (zq) ^ mmp);
vec4f_t d = (vec4f_t) ((vec4i_t) shuff230 (wq) ^ pmm);
// column: 2xy - 2wz // ww - xx + yy - zz // 2yz + 2wx // 0
m[1] = _mm_and_ps (b + c - a - d, (__m128) mask);
m[1] = (vec4f_t) ((vec4i_t) (b + c - a - d) & mask);
}
{
vec4f_t a = _mm_xor_ps (shuff230 (xq), (__m128) pmm);
vec4f_t b = _mm_xor_ps (shuff321 (yq), (__m128) mmp);
vec4f_t a = (vec4f_t) ((vec4i_t) shuff230 (xq) ^ pmm);
vec4f_t b = (vec4f_t) ((vec4i_t) shuff321 (yq) ^ mmp);
vec4f_t c = zq;
vec4f_t d = _mm_xor_ps (shuff103 (wq), (__m128) mpm);
vec4f_t d = (vec4f_t) ((vec4i_t) shuff103 (wq) ^ mpm);
// column: 2xz + 2wy // 2yz - 2wx // ww - xx - yy + zz // 0
m[2] = _mm_and_ps (a - b + c - d, (__m128) mask);
m[2] = (vec4f_t) ((vec4i_t) (a - b + c - d) & mask);
}
m[3] = (vec4f_t) { 0, 0, 0, 1 };
}

View file

@ -105,30 +105,4 @@ typedef struct vspheref_s {
float radius;
} vspheref_t;
#include <immintrin.h>
#ifndef __SSE__
#define _mm_xor_ps __qf_mm_xor_ps
#define _mm_and_ps __qf_mm_and_ps
GNU89INLINE inline __m128 _mm_xor_ps (__m128 a, __m128 b);
GNU89INLINE inline __m128 _mm_and_ps (__m128 a, __m128 b);
#ifndef IMPLEMENT_MAT4F_Funcs
GNU89INLINE inline
#else
VISIBLE
#endif
__m128 _mm_xor_ps (__m128 a, __m128 b)
{
return (__m128) ((vec4i_t) a ^ (vec4i_t) b);
}
#ifndef IMPLEMENT_MAT4F_Funcs
GNU89INLINE inline
#else
VISIBLE
#endif
__m128 _mm_and_ps (__m128 a, __m128 b)
{
return (__m128) ((vec4i_t) a & (vec4i_t) b);
}
#endif
#endif//__QF_simd_types_h

View file

@ -29,7 +29,11 @@
#ifndef __QF_simd_vec2d_h
#define __QF_simd_vec2d_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h>
#endif
#include "QF/simd/types.h"
@ -50,7 +54,11 @@ VISIBLE
vec2d_t
vsqrt2d (vec2d_t v)
{
#ifdef __aarch64__
return vsqrtq_f64 (v);
#else
return _mm_sqrt_pd (v);
#endif
}
#ifndef IMPLEMENT_VEC2D_Funcs

View file

@ -29,7 +29,11 @@
#ifndef __QF_simd_vec2f_h
#define __QF_simd_vec2f_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h>
#endif
#include <math.h>
#include "QF/simd/types.h"
@ -67,9 +71,13 @@ VISIBLE
vec2f_t
vsqrt2f (vec2f_t v)
{
#ifdef __aarch64__
return vsqrt_f32 (v);
#else
vec4f_t t = { v[0], v[1], 0, 0 };
t = _mm_sqrt_ps (t);
return (vec2f_t) { t[0], t[1] };
#endif
}
#ifndef IMPLEMENT_VEC2F_Funcs

View file

@ -28,8 +28,6 @@
#ifndef __QF_simd_vec2i_h
#define __QF_simd_vec2i_h
#include <immintrin.h>
#include <math.h>
#include "QF/simd/types.h"
@ -59,7 +57,7 @@ VISIBLE
int
any2i (vec2i_t v)
{
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0});
vec2i_t t = v == (vec2i_t) {0, 0};
#ifndef __SSSE3__
return (t[0] + t[1]) > -2;
#else
@ -75,7 +73,7 @@ VISIBLE
int
all2i (vec2i_t v)
{
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0});
vec2i_t t = v == (vec2i_t) {0, 0};
#ifndef __SSSE3__
return (t[0] + t[1]) == 0;
#else
@ -91,7 +89,7 @@ VISIBLE
int
none2i (vec2i_t v)
{
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0});
vec2i_t t = v == (vec2i_t) {0, 0};
#ifndef __SSSE3__
return (t[0] + t[1]) == -2;
#else

View file

@ -28,7 +28,11 @@
#ifndef __QF_simd_vec4d_h
#define __QF_simd_vec4d_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h>
#endif
#include "QF/simd/types.h"
#include "QF/simd/vec2d.h"

View file

@ -28,7 +28,11 @@
#ifndef __QF_simd_vec4f_h
#define __QF_simd_vec4f_h
#ifdef __aarch64__
#include <arm_neon.h>
#else
#include <immintrin.h>
#endif
#include <math.h>
#include "QF/simd/types.h"
@ -121,12 +125,16 @@ VISIBLE
vec4f_t
vsqrt4f (vec4f_t v)
{
#ifdef __aarch64__
return vsqrtq_f32 (v);
#else
#ifndef __SSE__
vec4f_t r = { sqrtf (v[0]), sqrtf (v[1]), sqrtf (v[2]), sqrtf (v[3]) };
return r;
#else
return _mm_sqrt_ps (v);
#endif
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs

View file

@ -28,7 +28,6 @@
#ifndef __QF_simd_vec4i_h
#define __QF_simd_vec4i_h
#include <immintrin.h>
#include <math.h>
#include "QF/simd/types.h"

View file

@ -242,14 +242,13 @@ split_edge (const vec4f_t *points, const vec4f_t *dists,
// "nan" because 0x7fffffff is nan when viewed as a float
static const vec4i_t onenan = {0x3f800000,0x3f800000,0x3f800000,~0u >> 1};
static const vec4i_t nan = { ~0u >> 1, ~0u >> 1, ~0u >> 1, ~0u >> 1};
vec4i_t x = _mm_and_ps (split, (__m128) nan) == (__m128) onenan;
vec4i_t x = ((vec4i_t) split & nan) == onenan;
// plane vector has -dist in w
vec4f_t y = _mm_and_ps (split, (__m128) x) * -split[3];
vec4f_t y = (vec4f_t) ((vec4i_t) split & x) * -split[3];
#ifdef __SSE3__
mid = _mm_blendv_ps (mid, y, (__m128) x);
#else
mid = (vec4f_t) ((vec4i_t) _mm_and_ps (y, (__m128) x) |
(vec4i_t) _mm_and_ps (mid, (__m128) ~x));
mid = (vec4f_t) (((vec4i_t) y & x) | ((vec4i_t) mid & ~x));
#endif
// if (isnan (mid[0])) *(int *) 0 = 0;
return mid;