[simd] Get the new functions working on older hardware

In some cases, gcc-11 does a good enough job translating normal looking
C expressions so use just those, but other times need to dig around for
an appropriate intrinsic.

Also, now need to disable psapi warnings when compiling for anything
less than avx.
This commit is contained in:
Bill Currie 2022-01-07 11:46:45 +09:00
parent aee31a8be5
commit 23613ca985
6 changed files with 127 additions and 11 deletions

View file

@ -92,7 +92,6 @@ AC_ARG_ENABLE(simd,
case "$enable_simd" in
no)
QF_CC_OPTION(-Wno-psabi)
simd=no
;;
sse|sse2|avx|avx2)
@ -108,6 +107,13 @@ case "$enable_simd" in
done
;;
esac
case "$simd" in
avx*)
;;
*)
QF_CC_OPTION(-Wno-psabi)
;;
esac
AC_MSG_CHECKING(for optimization)
if test "x$optimize" = xyes -a "x$leave_cflags_alone" != "xyes"; then

View file

@ -61,7 +61,14 @@ VISIBLE
vec2d_t
vceil2d (vec2d_t v)
{
#ifndef __SSE4_1__
return (vec2d_t) {
ceil (v[0]),
ceil (v[1]),
};
#else
return _mm_ceil_pd (v);
#endif
}
#ifndef IMPLEMENT_VEC2D_Funcs
@ -72,7 +79,14 @@ VISIBLE
vec2d_t
vfloor2d (vec2d_t v)
{
#ifndef __SSE4_1__
return (vec2d_t) {
floor (v[0]),
floor (v[1]),
};
#else
return _mm_floor_pd (v);
#endif
}
#ifndef IMPLEMENT_VEC2D_Funcs
@ -83,7 +97,14 @@ VISIBLE
vec2d_t
vtrunc2d (vec2d_t v)
{
#ifndef __SSE4_1__
return (vec2d_t) {
trunc (v[0]),
trunc (v[1]),
};
#else
return _mm_round_pd (v, _MM_FROUND_TRUNC);
#endif
}
#ifndef IMPLEMENT_VEC2D_Funcs
@ -95,7 +116,8 @@ vec2d_t
dot2d (vec2d_t a, vec2d_t b)
{
vec2d_t c = a * b;
c = _mm_hadd_pd (c, c);
// gcc-11 does a good job with hadd
c = (vec2d_t) { c[0] + c[1], c[0] + c[1] };
return c;
}
@ -109,7 +131,11 @@ cmuld (vec2d_t a, vec2d_t b)
{
vec2d_t c1 = a * b[0];
vec2d_t c2 = a * b[1];
#ifndef __SSE3__
return (vec2d_t) { c1[0] - c2[1], c1[1] + c2[0] };
#else
return _mm_addsub_pd (c1, (vec2d_t) { c2[1], c2[0] });
#endif
}
#endif//__QF_simd_vec2d_h

View file

@ -80,9 +80,16 @@ VISIBLE
vec2f_t
vceil2f (vec2f_t v)
{
#ifndef __SSE4_1__
return (vec2f_t) {
ceilf (v[0]),
ceilf (v[1]),
};
#else
vec4f_t t = { v[0], v[1], 0, 0 };
t = _mm_ceil_ps (t);
return (vec2f_t) { t[0], t[1] };
#endif
}
#ifndef IMPLEMENT_VEC2F_Funcs
@ -93,9 +100,16 @@ VISIBLE
vec2f_t
vfloor2f (vec2f_t v)
{
#ifndef __SSE4_1__
return (vec2f_t) {
floorf (v[0]),
floorf (v[1]),
};
#else
vec4f_t t = { v[0], v[1], 0, 0 };
t = _mm_floor_ps (t);
return (vec2f_t) { t[0], t[1] };
#endif
}
#ifndef IMPLEMENT_VEC2F_Funcs
@ -106,9 +120,16 @@ VISIBLE
vec2f_t
vtrunc2f (vec2f_t v)
{
#ifndef __SSE4_1__
return (vec2f_t) {
truncf (v[0]),
truncf (v[1]),
};
#else
vec4f_t t = { v[0], v[1], 0, 0 };
t = _mm_round_ps (t, _MM_FROUND_TRUNC);
return (vec2f_t) { t[0], t[1] };
#endif
}
#ifndef IMPLEMENT_VEC2F_Funcs

View file

@ -60,7 +60,11 @@ int
any2i (vec2i_t v)
{
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0});
#ifndef __SSSE3__
return (t[0] + t[1]) > -2;
#else
return _mm_hadd_pi32 (t, t)[0] > -2;
#endif
}
#ifndef IMPLEMENT_VEC2I_Funcs
@ -72,7 +76,11 @@ int
all2i (vec2i_t v)
{
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0});
#ifndef __SSSE3__
return (t[0] + t[1]) == 0;
#else
return _mm_hadd_pi32 (t, t)[0] == 0;
#endif
}
#ifndef IMPLEMENT_VEC2I_Funcs
@ -84,7 +92,11 @@ int
none2i (vec2i_t v)
{
vec2i_t t = _m_pcmpeqd (v, (vec2i_t) {0, 0});
#ifndef __SSSE3__
return (t[0] + t[1]) == -2;
#else
return _mm_hadd_pi32 (t, t)[0] == -2;
#endif
}
#endif//__QF_simd_vec2i_h

View file

@ -31,6 +31,7 @@
#include <immintrin.h>
#include "QF/simd/types.h"
#include "QF/simd/vec2d.h"
GNU89INLINE inline vec4d_t vsqrt4d (vec4d_t v) __attribute__((const));
GNU89INLINE inline vec4d_t vceil4d (vec4d_t v) __attribute__((const));
@ -107,7 +108,15 @@ VISIBLE
vec4d_t
vsqrt4d (vec4d_t v)
{
#ifndef __AVX__
vec2d_t xy = { v[0], v[1] };
vec2d_t zw = { v[2], v[3] };
xy = vsqrt2d (xy);
zw = vsqrt2d (zw);
return (vec4d_t) { xy[0], xy[1], zw[0], zw[1] };
#else
return _mm256_sqrt_pd (v);
#endif
}
#ifndef IMPLEMENT_VEC4D_Funcs
@ -118,7 +127,15 @@ VISIBLE
vec4d_t
vceil4d (vec4d_t v)
{
#ifndef __AVX__
vec2d_t xy = { v[0], v[1] };
vec2d_t zw = { v[2], v[3] };
xy = vceil2d (xy);
zw = vceil2d (zw);
return (vec4d_t) { xy[0], xy[1], zw[0], zw[1] };
#else
return _mm256_ceil_pd (v);
#endif
}
#ifndef IMPLEMENT_VEC4D_Funcs
@ -129,7 +146,15 @@ VISIBLE
vec4d_t
vfloor4d (vec4d_t v)
{
#ifndef __AVX__
vec2d_t xy = { v[0], v[1] };
vec2d_t zw = { v[2], v[3] };
xy = vfloor2d (xy);
zw = vfloor2d (zw);
return (vec4d_t) { xy[0], xy[1], zw[0], zw[1] };
#else
return _mm256_floor_pd (v);
#endif
}
#ifndef IMPLEMENT_VEC4D_Funcs
@ -140,7 +165,15 @@ VISIBLE
vec4d_t
vtrunc4d (vec4d_t v)
{
#ifndef __AVX__
vec2d_t xy = { v[0], v[1] };
vec2d_t zw = { v[2], v[3] };
xy = vtrunc2d (xy);
zw = vtrunc2d (zw);
return (vec4d_t) { xy[0], xy[1], zw[0], zw[1] };
#else
return _mm256_round_pd (v, _MM_FROUND_TRUNC);
#endif
}
#ifndef IMPLEMENT_VEC4D_Funcs
@ -167,7 +200,11 @@ vec4d_t
dotd (vec4d_t a, vec4d_t b)
{
vec4d_t c = a * b;
#ifndef __AVX__
c = (vec4d_t) { c[0] + c[1], c[0] + c[1], c[2] + c[3], c[2] + c[3] };
#else
c = _mm256_hadd_pd (c, c);
#endif
static const vec4l_t A = {2, 3, 0, 1};
c += __builtin_shuffle(c, A);
return c;
@ -202,8 +239,12 @@ qvmuld (vec4d_t q, vec4d_t v)
double s = q[3];
// zero the scalar of the quaternion. Results in an extra operation, but
// avoids adding precision issues.
#ifndef __AVX__
q = (vec4d_t) { q[0], q[1], q[2], 0 };
#else
vec4d_t z = {};
q = _mm256_blend_pd (q, z, 0x08);
#endif
vec4d_t c = crossd (q, v);
vec4d_t qv = dotd (q, v); // q.w is 0 so v.w is irrelevant
vec4d_t qq = dotd (q, q);
@ -224,8 +265,12 @@ vqmuld (vec4d_t v, vec4d_t q)
double s = q[3];
// zero the scalar of the quaternion. Results in an extra operation, but
// avoids adding precision issues.
#ifndef __AVX__
q = (vec4d_t) { q[0], q[1], q[2], 0 };
#else
vec4d_t z = {};
q = _mm256_blend_pd (q, z, 0x08);
#endif
vec4d_t c = crossd (q, v);
vec4d_t qv = dotd (q, v); // q.w is 0 so v.w is irrelevant
vec4d_t qq = dotd (q, q);
@ -262,7 +307,7 @@ qconjd (vec4d_t q)
{
const uint64_t sign = UINT64_C(1) << 63;
const vec4l_t neg = { sign, sign, sign, 0 };
return _mm256_xor_pd (q, (__m256d) neg);
return (vec4d_t) ((vec4l_t) q ^ neg);
}
#ifndef IMPLEMENT_VEC4D_Funcs

View file

@ -62,10 +62,12 @@ VISIBLE
int
any4i (vec4i_t v)
{
#ifndef __SSE4_1__
vec4i_t t = (v != (vec4i_t) {});
return (t[0] + t[1] + t[2] + t[3]) != 0;
#else
return !__builtin_ia32_ptestz128 ((__v2di)v, (__v2di)v);
/*vec4i_t t = (v != (vec4i_t) {});
t = __builtin_ia32_phaddd128 (t, t);
return __builtin_ia32_phaddd128 (t, t)[0] != 0;*/
#endif
}
#ifndef IMPLEMENT_VEC2I_Funcs
@ -77,9 +79,11 @@ int
all4i (vec4i_t v)
{
vec4i_t t = (v == (vec4i_t) {});
#ifndef __SSE4_1__
return (t[0] + t[1] + t[2] + t[3]) == 0;
#else
return __builtin_ia32_ptestz128 ((__v2di)t, (__v2di)t);
/*t = __builtin_ia32_phaddd128 (t, t);
return __builtin_ia32_phaddd128 (t, t)[0] == 0;*/
#endif
}
#ifndef IMPLEMENT_VEC2I_Funcs
@ -90,10 +94,12 @@ VISIBLE
int
none4i (vec4i_t v)
{
#ifndef __SSE4_1__
vec4i_t t = (v != (vec4i_t) {});
return (t[0] + t[1] + t[2] + t[3]) == 0;
#else
return __builtin_ia32_ptestz128 ((__v2di)v, (__v2di)v);
/*vec4i_t t = (v != (vec4i_t) {});
t = __builtin_ia32_phaddd128 (t, t);
return __builtin_ia32_phaddd128 (t, t)[0] == 0;*/
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs