[util] Get vectors working for non-SSE archs

GCC does a fairly nice job of producing code for vector types when the
hardware doesn't support SIMD, but it seems to break certain math
optimization rules due to excess precision (?). Still, it works well
enough for the core engine, but may not be well suited to the tools.
However, so far, only qfvis uses vector types (and it's not tested yet),
and tools should probably be used on suitable machines anyway (not
forced, of course).
This commit is contained in:
Bill Currie 2021-05-24 15:02:18 +09:00
parent a461c09586
commit 778c07e91f
7 changed files with 383 additions and 224 deletions

View file

@ -81,7 +81,10 @@ AC_ARG_ENABLE(optimize,
optimize=yes
)
QF_CC_OPTION(-mavx2)
QF_CC_OPTION(-Wno-psabi)
dnl QF_CC_OPTION(-msse2)
dnl QF_CC_OPTION(-Wno-psabi)
dnl QF_CC_OPTION(-mavx2)
dnl fma is not used as it is the equivalent of turning on
dnl -funsafe-math-optimizations
dnl QF_CC_OPTION(-mfma)

View file

@ -79,7 +79,11 @@ VEC_TYPE (float, vec4f_t);
VEC_TYPE (int, vec4i_t);
#define VEC4D_FMT "[%.17g, %.17g, %.17g, %.17g]"
#if __WORDSIZE == 64
#define VEC4L_FMT "[%ld, %ld, %ld, %ld]"
#else
#define VEC4L_FMT "[%lld, %lld, %lld, %lld]"
#endif
#define VEC4F_FMT "[%.9g, %.9g, %.9g, %.9g]"
#define VEC4I_FMT "[%d, %d, %d, %d]"
#define VEC4_EXP(v) (v)[0], (v)[1], (v)[2], (v)[3]
@ -94,4 +98,30 @@ typedef struct vspheref_s {
float radius;
} vspheref_t;
#include <immintrin.h>
#ifndef __SSE__
#define _mm_xor_ps __qf_mm_xor_ps
#define _mm_and_ps __qf_mm_and_ps
GNU89INLINE inline __m128 _mm_xor_ps (__m128 a, __m128 b);
GNU89INLINE inline __m128 _mm_and_ps (__m128 a, __m128 b);
#ifndef IMPLEMENT_MAT4F_Funcs
GNU89INLINE inline
#else
VISIBLE
#endif
__m128 _mm_xor_ps (__m128 a, __m128 b)
{
return (__m128) ((vec4i_t) a ^ (vec4i_t) b);
}
#ifndef IMPLEMENT_MAT4F_Funcs
GNU89INLINE inline
#else
VISIBLE
#endif
__m128 _mm_and_ps (__m128 a, __m128 b)
{
return (__m128) ((vec4i_t) a & (vec4i_t) b);
}
#endif
#endif//__QF_simd_types_h

View file

@ -28,6 +28,7 @@
#ifndef __QF_simd_vec4d_h
#define __QF_simd_vec4d_h
#ifdef __AVX__
#include <immintrin.h>
#include "QF/simd/types.h"
@ -292,4 +293,6 @@ storevec3d (double v3[3], vec4d_t v4)
v3[2] = v4[2];
}
#endif
#endif//__QF_simd_vec4d_h

View file

@ -110,7 +110,11 @@ vabsf (vec4f_t v)
{
const uint32_t nan = ~0u >> 1;
const vec4i_t abs = { nan, nan, nan, nan };
#ifndef __SSE__
return (vec4f_t) ((vec4i_t) v & abs);
#else
return _mm_and_ps (v, (__m128) abs);
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs
@ -121,7 +125,12 @@ VISIBLE
vec4f_t
vsqrtf (vec4f_t v)
{
    // Per-lane square root of v.
#ifdef __SSE__
    return _mm_sqrt_ps (v);
#else
    // No SSE: take the square root of each lane with libm.
    return (vec4f_t) { sqrtf (v[0]), sqrtf (v[1]), sqrtf (v[2]), sqrtf (v[3]) };
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs
@ -132,7 +141,16 @@ VISIBLE
vec4f_t
vceilf (vec4f_t v)
{
    // Per-lane ceiling of v.
#ifdef __SSE4_1__
    return _mm_ceil_ps (v);
#else
    // Without SSE4.1, round each lane individually with libm.
    vec4f_t     rounded = { ceilf (v[0]), ceilf (v[1]), ceilf (v[2]), ceilf (v[3]) };
    return rounded;
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs
@ -143,7 +161,16 @@ VISIBLE
vec4f_t
vfloorf (vec4f_t v)
{
    // Per-lane floor of v.
#ifdef __SSE4_1__
    return _mm_floor_ps (v);
#else
    // Without SSE4.1, round each lane individually with libm.
    vec4f_t     rounded = { floorf (v[0]), floorf (v[1]), floorf (v[2]), floorf (v[3]) };
    return rounded;
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs
@ -154,7 +181,16 @@ VISIBLE
vec4f_t
vtruncf (vec4f_t v)
{
    // Per-lane truncation of v.
#ifdef __SSE4_1__
    return _mm_round_ps (v, _MM_FROUND_TRUNC);
#else
    // Without SSE4.1, truncate each lane individually with libm.
    vec4f_t     rounded = { truncf (v[0]), truncf (v[1]), truncf (v[2]), truncf (v[3]) };
    return rounded;
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs
@ -179,8 +215,13 @@ vec4f_t
dotf (vec4f_t a, vec4f_t b)
{
vec4f_t c = a * b;
#ifndef __SSE3__
float x = c[0] + c[1] + c[2] + c[3];
c = (vec4f_t) { x, x, x, x };
#else
c = _mm_hadd_ps (c, c);
c = _mm_hadd_ps (c, c);
#endif
return c;
}
@ -197,7 +238,11 @@ qmulf (vec4f_t a, vec4f_t b)
vec4f_t c = crossf (a, b) + a * b[3] + a[3] * b;
vec4f_t d = dotf (a, b);
// zero out the vector component of dot product so only the scalar remains
#ifndef __SSE4_1__
d = (vec4f_t) { 0, 0, 0, d[3] };
#else
d = _mm_insert_ps (d, d, 0xf7);
#endif
return c - d;
}
@ -212,7 +257,11 @@ qvmulf (vec4f_t q, vec4f_t v)
float s = q[3];
// zero the scalar of the quaternion. Results in an extra operation, but
// avoids adding precision issues.
#ifndef __SSE4_1__
q[3] = 0;
#else
q = _mm_insert_ps (q, q, 0xf8);
#endif
vec4f_t c = crossf (q, v);
vec4f_t qv = dotf (q, v); // q.w is 0 so v.w is irrelevant
vec4f_t qq = dotf (q, q);
@ -231,7 +280,11 @@ vqmulf (vec4f_t v, vec4f_t q)
float s = q[3];
// zero the scalar of the quaternion. Results in an extra operation, but
// avoids adding precision issues.
#ifndef __SSE4_1__
q[3] = 0;
#else
q = _mm_insert_ps (q, q, 0xf8);
#endif
vec4f_t c = crossf (q, v);
vec4f_t qv = dotf (q, v); // q.w is 0 so v.w is irrelevant
vec4f_t qq = dotf (q, q);
@ -266,7 +319,11 @@ vec4f_t
qconjf (vec4f_t q)
{
const vec4i_t neg = { 1u << 31, 1u << 31, 1u << 31, 0 };
#ifndef __SSE__
return (vec4f_t) ((vec4i_t) q ^ neg);
#else
return _mm_xor_ps (q, (__m128) neg);
#endif
}
#ifndef IMPLEMENT_VEC4F_Funcs
@ -299,6 +356,9 @@ loadvec3f (const float v3[3])
{
vec4f_t v4;
#ifndef __SSE4_1__
v4 = (vec4f_t) { v3[0], v3[1], v3[2], 0 };
#else
// this had to be in asm otherwise gcc thinks v4 is only partially
// initialized, and gcc 10 does not use the zero flags when generating
// the code, resulting in a memory access to load a 0 into v4[3]
@ -311,6 +371,7 @@ loadvec3f (const float v3[3])
"
: "=v"(v4)
: "m"(v3[0]), "m"(v3[1]), "m"(v3[2]));
#endif
return v4;
}

View file

@ -48,15 +48,19 @@
#define s05 0.70710678118654757
#ifdef __AVX__
// One row of the vec4d_tests table: a binary vec4d_t operation, its two
// operands and the result it is expected to produce.
typedef struct {
// Source line of the table row (the T() macro stores __LINE__ here);
// run_vec4d_tests prints it when the row fails.
int line;
// Function under test.
vec4d_t (*op) (vec4d_t a, vec4d_t b);
// First operand.
vec4d_t a;
// Second operand.
vec4d_t b;
// Nominal expected result.
vec4d_t expect;
// Per-lane correction for known rounding error; presumably added to
// expect before comparing, as run_vec4f_tests does -- the vec4d runner's
// setup is not visible here, so confirm.
vec4d_t ulp_errors;
} vec4d_test_t;
#endif
typedef struct {
int line;
vec4f_t (*op) (vec4f_t a, vec4f_t b);
vec4f_t a;
vec4f_t b;
@ -65,6 +69,7 @@ typedef struct {
} vec4f_test_t;
typedef struct {
int line;
void (*op) (mat4f_t c, const mat4f_t a, const mat4f_t b);
mat4f_t a;
mat4f_t b;
@ -73,6 +78,7 @@ typedef struct {
} mat4f_test_t;
typedef struct {
int line;
vec4f_t (*op) (const mat4f_t a, vec4f_t b);
mat4f_t a;
vec4f_t b;
@ -81,12 +87,14 @@ typedef struct {
} mv4f_test_t;
// One row of the mq4f_tests table: a function taking a matrix and a
// quaternion, plus the matrix it is expected to yield.
typedef struct {
// Source line of the table row (the T() macro stores __LINE__ here);
// run_mq4f_tests prints it when the row fails.
int line;
// Function under test; presumably writes its result into m -- confirm
// against run_mq4f_tests.
void (*op) (mat4f_t m, vec4f_t q);
// Input quaternion.
vec4f_t q;
// Expected matrix; left empty in the table initializers and, per the
// comment on mq4f_tests, filled in using the non-simd QuatToMatrix.
mat4f_t expect;
// NOTE(review): looks like a per-element rounding allowance applied to
// expect; its use is not visible in this view -- confirm.
mat4f_t ulp_errors;
} mq4f_test_t;
#ifdef __AVX__
// Adapter that gives the one-operand vtruncd the two-operand signature
// required by vec4d_test_t.op; the second argument is not used.
static vec4d_t tvtruncd (vec4d_t v, vec4d_t ignore)
{
return vtruncd (v);
}
{
return qconjd (v);
}
#endif
static vec4f_t tvtruncf (vec4f_t v, vec4f_t ignore)
{
@ -147,290 +156,320 @@ static vec4f_t tmagnitude3f (vec4f_t v, vec4f_t ignore)
return magnitude3f (v);
}
#define T(t...) { __LINE__, t }
#ifdef __AVX__
static vec4d_test_t vec4d_tests[] = {
// 3D dot products
{ dotd, right, right, one },
{ dotd, right, forward, zero },
{ dotd, right, up, zero },
{ dotd, forward, right, zero },
{ dotd, forward, forward, one },
{ dotd, forward, up, zero },
{ dotd, up, right, zero },
{ dotd, up, forward, zero },
{ dotd, up, up, one },
T(dotd, right, right, one ),
T(dotd, right, forward, zero ),
T(dotd, right, up, zero ),
T(dotd, forward, right, zero ),
T(dotd, forward, forward, one ),
T(dotd, forward, up, zero ),
T(dotd, up, right, zero ),
T(dotd, up, forward, zero ),
T(dotd, up, up, one ),
// one is 4D, so its self dot product is 4
{ dotd, one, one, { 4, 4, 4, 4} },
{ dotd, one, none, {-4, -4, -4, -4} },
T(dotd, one, one, { 4, 4, 4, 4} ),
T(dotd, one, none, {-4, -4, -4, -4} ),
// 3D cross products
{ crossd, right, right, zero },
{ crossd, right, forward, up },
{ crossd, right, up, nforward },
{ crossd, forward, right, nup },
{ crossd, forward, forward, zero },
{ crossd, forward, up, right },
{ crossd, up, right, forward },
{ crossd, up, forward, nright },
{ crossd, up, up, zero },
T(crossd, right, right, zero ),
T(crossd, right, forward, up ),
T(crossd, right, up, nforward ),
T(crossd, forward, right, nup ),
T(crossd, forward, forward, zero ),
T(crossd, forward, up, right ),
T(crossd, up, right, forward ),
T(crossd, up, forward, nright ),
T(crossd, up, up, zero ),
// double whammy tests: cross product with an angled vector and
// ensuring that a 4d vector (non-zero w component) does not affect
// the result, including the result's w component remaining zero.
{ crossd, right, one, { 0, -1, 1} },
{ crossd, forward, one, { 1, 0, -1} },
{ crossd, up, one, {-1, 1, 0} },
{ crossd, one, right, { 0, 1, -1} },
{ crossd, one, forward, {-1, 0, 1} },
{ crossd, one, up, { 1, -1, 0} },
T(crossd, right, one, { 0, -1, 1} ),
T(crossd, forward, one, { 1, 0, -1} ),
T(crossd, up, one, {-1, 1, 0} ),
T(crossd, one, right, { 0, 1, -1} ),
T(crossd, one, forward, {-1, 0, 1} ),
T(crossd, one, up, { 1, -1, 0} ),
// This one fails when optimizing with -mfma (which is why fma is not
// used): ulp errors in z and w
{ crossd, qtest, qtest, {0, 0, 0, 0} },
T(crossd, qtest, qtest, {0, 0, 0, 0} ),
{ qmuld, qident, qident, qident },
{ qmuld, qident, right, right },
{ qmuld, qident, forward, forward },
{ qmuld, qident, up, up },
{ qmuld, right, qident, right },
{ qmuld, forward, qident, forward },
{ qmuld, up, qident, up },
{ qmuld, right, right, nqident },
{ qmuld, right, forward, up },
{ qmuld, right, up, nforward },
{ qmuld, forward, right, nup },
{ qmuld, forward, forward, nqident },
{ qmuld, forward, up, right },
{ qmuld, up, right, forward },
{ qmuld, up, forward, nright },
{ qmuld, up, up, nqident },
{ qmuld, one, one, { 2, 2, 2, -2 } },
{ qmuld, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } },
T(qmuld, qident, qident, qident ),
T(qmuld, qident, right, right ),
T(qmuld, qident, forward, forward ),
T(qmuld, qident, up, up ),
T(qmuld, right, qident, right ),
T(qmuld, forward, qident, forward ),
T(qmuld, up, qident, up ),
T(qmuld, right, right, nqident ),
T(qmuld, right, forward, up ),
T(qmuld, right, up, nforward ),
T(qmuld, forward, right, nup ),
T(qmuld, forward, forward, nqident ),
T(qmuld, forward, up, right ),
T(qmuld, up, right, forward ),
T(qmuld, up, forward, nright ),
T(qmuld, up, up, nqident ),
T(qmuld, one, one, { 2, 2, 2, -2 } ),
T(qmuld, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } ),
// This one fails when optimizing with -mfma (which is why fma is not
// used): ulp error in z
{ qmuld, qtest, qtest, {0.768, 0.576, 0, -0.28} },
T(qmuld, qtest, qtest, {0.768, 0.576, 0, -0.28} ),
// The one vector is not unit (magnitude 2), so using it as a rotation
// quaternion results in scaling by 4. However, it still has the effect
// of rotating 120 degrees around the axis equidistant from the three
// orthogonal axes such that x->y->z->x
{ qvmuld, one, right, { 0, 4, 0, 0 } },
{ qvmuld, one, forward, { 0, 0, 4, 0 } },
{ qvmuld, one, up, { 4, 0, 0, 0 } },
{ qvmuld, one, {1,1,1,0}, { 4, 4, 4, 0 } },
{ qvmuld, one, one, { 4, 4, 4, -2 } },
T(qvmuld, one, right, { 0, 4, 0, 0 } ),
T(qvmuld, one, forward, { 0, 0, 4, 0 } ),
T(qvmuld, one, up, { 4, 0, 0, 0 } ),
T(qvmuld, one, {1,1,1,0}, { 4, 4, 4, 0 } ),
T(qvmuld, one, one, { 4, 4, 4, -2 } ),
// inverse rotation, so x->z->y->x
{ vqmuld, right, one, { 0, 0, 4, 0 } },
{ vqmuld, forward, one, { 4, 0, 0, 0 } },
{ vqmuld, up, one, { 0, 4, 0, 0 } },
{ vqmuld, {1,1,1,0}, one, { 4, 4, 4, 0 } },
{ vqmuld, one, one, { 4, 4, 4, -2 } },
T(vqmuld, right, one, { 0, 0, 4, 0 } ),
T(vqmuld, forward, one, { 4, 0, 0, 0 } ),
T(vqmuld, up, one, { 0, 4, 0, 0 } ),
T(vqmuld, {1,1,1,0}, one, { 4, 4, 4, 0 } ),
T(vqmuld, one, one, { 4, 4, 4, -2 } ),
// The half vector is unit.
{ qvmuld, half, right, forward },
{ qvmuld, half, forward, up },
{ qvmuld, half, up, right },
{ qvmuld, half, {1,1,1,0}, { 1, 1, 1, 0 } },
T(qvmuld, half, right, forward ),
T(qvmuld, half, forward, up ),
T(qvmuld, half, up, right ),
T(qvmuld, half, {1,1,1,0}, { 1, 1, 1, 0 } ),
// inverse
{ vqmuld, right, half, up },
{ vqmuld, forward, half, right },
{ vqmuld, up, half, forward },
{ vqmuld, {1,1,1,0}, half, { 1, 1, 1, 0 } },
T(vqmuld, right, half, up ),
T(vqmuld, forward, half, right ),
T(vqmuld, up, half, forward ),
T(vqmuld, {1,1,1,0}, half, { 1, 1, 1, 0 } ),
// one is a 4D vector and qvmuld is meant for 3D vectors. However, it
// seems that the vector's w has no effect on the 3d portion of the
// result, but the result's w is cosine of the full rotation angle
// scaled by quaternion magnitude and vector w
{ qvmuld, half, one, { 1, 1, 1, -0.5 } },
{ qvmuld, half, {2,2,2,2}, { 2, 2, 2, -1 } },
{ qvmuld, qtest, right, {0.5392, 0.6144, -0.576, 0} },
{ qvmuld, qtest, forward, {0.6144, 0.1808, 0.768, 0},
{0, -2.7e-17, 0, 0} },
{ qvmuld, qtest, up, {0.576, -0.768, -0.28, 0} },
T(qvmuld, half, one, { 1, 1, 1, -0.5 } ),
T(qvmuld, half, {2,2,2,2}, { 2, 2, 2, -1 } ),
T(qvmuld, qtest, right, {0.5392, 0.6144, -0.576, 0} ),
T(qvmuld, qtest, forward, {0.6144, 0.1808, 0.768, 0},
{0, -2.7e-17, 0, 0} ),
T(qvmuld, qtest, up, {0.576, -0.768, -0.28, 0} ),
// inverse
{ vqmuld, one, half, { 1, 1, 1, -0.5 } },
{ vqmuld, {2,2,2,2}, half, { 2, 2, 2, -1 } },
{ vqmuld, right, qtest, {0.5392, 0.6144, 0.576, 0} },
{ vqmuld, forward, qtest, {0.6144, 0.1808, -0.768, 0},
{0, -2.7e-17, 0, 0} },
{ vqmuld, up, qtest, {-0.576, 0.768, -0.28, 0} },
T(vqmuld, one, half, { 1, 1, 1, -0.5 } ),
T(vqmuld, {2,2,2,2}, half, { 2, 2, 2, -1 } ),
T(vqmuld, right, qtest, {0.5392, 0.6144, 0.576, 0} ),
T(vqmuld, forward, qtest, {0.6144, 0.1808, -0.768, 0},
{0, -2.7e-17, 0, 0} ),
T(vqmuld, up, qtest, {-0.576, 0.768, -0.28, 0} ),
{ qrotd, right, right, qident },
{ qrotd, right, forward, { 0, 0, s05, s05 },
{0, 0, -1.1e-16, 0} },
{ qrotd, right, up, { 0, -s05, 0, s05 },
{0, 1.1e-16, 0, 0} },
{ qrotd, forward, right, { 0, 0, -s05, s05 },
{0, 0, 1.1e-16, 0} },
{ qrotd, forward, forward, qident },
{ qrotd, forward, up, { s05, 0, 0, s05 },
{-1.1e-16, 0, 0, 0} },
{ qrotd, up, right, { 0, s05, 0, s05 },
{0, -1.1e-16, 0, 0} },
{ qrotd, up, forward, { -s05, 0, 0, s05 },
{ 1.1e-16, 0, 0, 0} },
{ qrotd, up, up, qident },
T(qrotd, right, right, qident ),
T(qrotd, right, forward, { 0, 0, s05, s05 },
{0, 0, -1.1e-16, 0} ),
T(qrotd, right, up, { 0, -s05, 0, s05 },
{0, 1.1e-16, 0, 0} ),
T(qrotd, forward, right, { 0, 0, -s05, s05 },
{0, 0, 1.1e-16, 0} ),
T(qrotd, forward, forward, qident ),
T(qrotd, forward, up, { s05, 0, 0, s05 },
{-1.1e-16, 0, 0, 0} ),
T(qrotd, up, right, { 0, s05, 0, s05 },
{0, -1.1e-16, 0, 0} ),
T(qrotd, up, forward, { -s05, 0, 0, s05 },
{ 1.1e-16, 0, 0, 0} ),
T(qrotd, up, up, qident ),
{ tvtruncd, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } },
{ tvceild, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } },
{ tvfloord, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } },
{ tqconjd, one, {}, { -1, -1, -1, 1 } },
T(tvtruncd, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } ),
T(tvceild, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } ),
T(tvfloord, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } ),
T(tqconjd, one, {}, { -1, -1, -1, 1 } ),
};
#define num_vec4d_tests (sizeof (vec4d_tests) / (sizeof (vec4d_tests[0])))
#endif
static vec4f_test_t vec4f_tests[] = {
// 3D dot products
{ dotf, right, right, one },
{ dotf, right, forward, zero },
{ dotf, right, up, zero },
{ dotf, forward, right, zero },
{ dotf, forward, forward, one },
{ dotf, forward, up, zero },
{ dotf, up, right, zero },
{ dotf, up, forward, zero },
{ dotf, up, up, one },
T(dotf, right, right, one ),
T(dotf, right, forward, zero ),
T(dotf, right, up, zero ),
T(dotf, forward, right, zero ),
T(dotf, forward, forward, one ),
T(dotf, forward, up, zero ),
T(dotf, up, right, zero ),
T(dotf, up, forward, zero ),
T(dotf, up, up, one ),
// one is 4D, so its self dot product is 4
{ dotf, one, one, { 4, 4, 4, 4} },
{ dotf, one, none, {-4, -4, -4, -4} },
T(dotf, one, one, { 4, 4, 4, 4} ),
T(dotf, one, none, {-4, -4, -4, -4} ),
// 3D cross products
{ crossf, right, right, zero },
{ crossf, right, forward, up },
{ crossf, right, up, nforward },
{ crossf, forward, right, nup },
{ crossf, forward, forward, zero },
{ crossf, forward, up, right },
{ crossf, up, right, forward },
{ crossf, up, forward, nright },
{ crossf, up, up, zero },
T(crossf, right, right, zero ),
T(crossf, right, forward, up ),
T(crossf, right, up, nforward ),
T(crossf, forward, right, nup ),
T(crossf, forward, forward, zero ),
T(crossf, forward, up, right ),
T(crossf, up, right, forward ),
T(crossf, up, forward, nright ),
T(crossf, up, up, zero ),
// double whammy tests: cross product with an angled vector and
// ensuring that a 4d vector (non-zero w component) does not affect
// the result, including the result's w component remaining zero.
{ crossf, right, one, { 0, -1, 1} },
{ crossf, forward, one, { 1, 0, -1} },
{ crossf, up, one, {-1, 1, 0} },
{ crossf, one, right, { 0, 1, -1} },
{ crossf, one, forward, {-1, 0, 1} },
{ crossf, one, up, { 1, -1, 0} },
{ crossf, qtest, qtest, {0, 0, 0, 0} },
T(crossf, right, one, { 0, -1, 1} ),
T(crossf, forward, one, { 1, 0, -1} ),
T(crossf, up, one, {-1, 1, 0} ),
T(crossf, one, right, { 0, 1, -1} ),
T(crossf, one, forward, {-1, 0, 1} ),
T(crossf, one, up, { 1, -1, 0} ),
T(crossf, qtest, qtest, {0, 0, 0, 0} ),
{ qmulf, qident, qident, qident },
{ qmulf, qident, right, right },
{ qmulf, qident, forward, forward },
{ qmulf, qident, up, up },
{ qmulf, right, qident, right },
{ qmulf, forward, qident, forward },
{ qmulf, up, qident, up },
{ qmulf, right, right, nqident },
{ qmulf, right, forward, up },
{ qmulf, right, up, nforward },
{ qmulf, forward, right, nup },
{ qmulf, forward, forward, nqident },
{ qmulf, forward, up, right },
{ qmulf, up, right, forward },
{ qmulf, up, forward, nright },
{ qmulf, up, up, nqident },
{ qmulf, one, one, { 2, 2, 2, -2 } },
{ qmulf, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } },
{ qmulf, qtest, qtest, {0.768, 0.576, 0, -0.28},
{0, 6e-8, 0, 3e-8} },
T(qmulf, qident, qident, qident ),
T(qmulf, qident, right, right ),
T(qmulf, qident, forward, forward ),
T(qmulf, qident, up, up ),
T(qmulf, right, qident, right ),
T(qmulf, forward, qident, forward ),
T(qmulf, up, qident, up ),
T(qmulf, right, right, nqident ),
T(qmulf, right, forward, up ),
T(qmulf, right, up, nforward ),
T(qmulf, forward, right, nup ),
T(qmulf, forward, forward, nqident ),
T(qmulf, forward, up, right ),
T(qmulf, up, right, forward ),
T(qmulf, up, forward, nright ),
T(qmulf, up, up, nqident ),
T(qmulf, one, one, { 2, 2, 2, -2 } ),
T(qmulf, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } ),
T(qmulf, qtest, qtest, {0.768, 0.576, 0, -0.28},
#ifndef __SSE__
{0, 6e-8, 0, 6e-8}
#else
{0, 6e-8, 0, 3e-8}
#endif
),
// The one vector is not unit (magnitude 2), so using it as a rotation
// quaternion results in scaling by 4. However, it still has the effect
// of rotating 120 degrees around the axis equidistant from the three
// orthogonal axes such that x->y->z->x
{ qvmulf, one, right, { 0, 4, 0, 0 } },
{ qvmulf, one, forward, { 0, 0, 4, 0 } },
{ qvmulf, one, up, { 4, 0, 0, 0 } },
{ qvmulf, one, {1,1,1,0}, { 4, 4, 4, 0 } },
{ qvmulf, one, one, { 4, 4, 4, -2 } },
T(qvmulf, one, right, { 0, 4, 0, 0 } ),
T(qvmulf, one, forward, { 0, 0, 4, 0 } ),
T(qvmulf, one, up, { 4, 0, 0, 0 } ),
T(qvmulf, one, {1,1,1,0}, { 4, 4, 4, 0 } ),
T(qvmulf, one, one, { 4, 4, 4, -2 } ),
// inverse rotation, so x->z->y->x
{ vqmulf, right, one, { 0, 0, 4, 0 } },
{ vqmulf, forward, one, { 4, 0, 0, 0 } },
{ vqmulf, up, one, { 0, 4, 0, 0 } },
{ vqmulf, {1,1,1,0}, one, { 4, 4, 4, 0 } },
{ vqmulf, one, one, { 4, 4, 4, -2 } },
T(vqmulf, right, one, { 0, 0, 4, 0 } ),
T(vqmulf, forward, one, { 4, 0, 0, 0 } ),
T(vqmulf, up, one, { 0, 4, 0, 0 } ),
T(vqmulf, {1,1,1,0}, one, { 4, 4, 4, 0 } ),
T(vqmulf, one, one, { 4, 4, 4, -2 } ),
//
{ qvmulf, qtest, right, {0.5392, 0.6144, -0.576, 0},
{0, -5.9e-8, -6e-8, 0} },
{ qvmulf, qtest, forward, {0.6144, 0.1808, 0.768, 0},
{-5.9e-8, 1.5e-8, 0, 0} },
{ qvmulf, qtest, up, {0.576, -0.768, -0.28, 0},
{6e-8, 0, 3e-8, 0} },
{ vqmulf, right, qtest, {0.5392, 0.6144, 0.576, 0},
{0, -5.9e-8, 5.9e-8, 0} },
{ vqmulf, forward, qtest, {0.6144, 0.1808, -0.768, 0},
{-5.9e-8, 1.5e-8, 0, 0} },
{ vqmulf, up, qtest, {-0.576, 0.768, -0.28, 0},
{-5.9e-8, 0, 3e-8, 0} },
T(qvmulf, qtest, right, {0.5392, 0.6144, -0.576, 0},
{0, -5.9e-8, -6e-8, 0} ),
T(qvmulf, qtest, forward, {0.6144, 0.1808, 0.768, 0},
#ifndef __SSE__
{-5.9e-8, 3e-8, 0, 0}
#else
{-5.9e-8, 1.5e-8, 0, 0}
#endif
),
T(qvmulf, qtest, up, {0.576, -0.768, -0.28, 0},
#ifndef __SSE__
{6e-8, 0, 6e-8, 0}
#else
{6e-8, 0, 3e-8, 0}
#endif
),
T(vqmulf, right, qtest, {0.5392, 0.6144, 0.576, 0},
{0, -5.9e-8, 5.9e-8, 0} ),
T(vqmulf, forward, qtest, {0.6144, 0.1808, -0.768, 0},
#ifndef __SSE__
{-5.9e-8, 3e-8, 0, 0}
#else
{-5.9e-8, 1.5e-8, 0, 0}
#endif
),
T(vqmulf, up, qtest, {-0.576, 0.768, -0.28, 0},
#ifndef __SSE__
{-5.9e-8, 0, 6e-8, 0}
#else
{-5.9e-8, 0, 3e-8, 0}
#endif
),
{ qrotf, right, right, qident },
{ qrotf, right, forward, { 0, 0, s05, s05 } },
{ qrotf, right, up, { 0, -s05, 0, s05 } },
{ qrotf, forward, right, { 0, 0, -s05, s05 } },
{ qrotf, forward, forward, qident },
{ qrotf, forward, up, { s05, 0, 0, s05 } },
{ qrotf, up, right, { 0, s05, 0, s05 } },
{ qrotf, up, forward, { -s05, 0, 0, s05 } },
{ qrotf, up, up, qident },
T(qrotf, right, right, qident ),
T(qrotf, right, forward, { 0, 0, s05, s05 } ),
T(qrotf, right, up, { 0, -s05, 0, s05 } ),
T(qrotf, forward, right, { 0, 0, -s05, s05 } ),
T(qrotf, forward, forward, qident ),
T(qrotf, forward, up, { s05, 0, 0, s05 } ),
T(qrotf, up, right, { 0, s05, 0, s05 } ),
T(qrotf, up, forward, { -s05, 0, 0, s05 } ),
T(qrotf, up, up, qident ),
{ tvabsf, pmpi, {}, pi },
{ tvsqrtf, { 1, 4, 9, 16}, {}, {1, 2, 3, 4} },
{ tvtruncf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } },
{ tvceilf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } },
{ tvfloorf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } },
{ tqconjf, one, {}, { -1, -1, -1, 1 } },
{ tmagnitudef, { 3, 4, 12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { 3, 4, 12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { 3, 4, -12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { 3, 4, -12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { 3, -4, 12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { 3, -4, 12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { 3, -4, -12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { 3, -4, -12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, 4, 12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, 4, 12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, 4, -12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, 4, -12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, -4, 12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, -4, 12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, -4, -12, 84}, {}, {85, 85, 85, 85} },
{ tmagnitudef, { -3, -4, -12, -84}, {}, {85, 85, 85, 85} },
{ tmagnitude3f, { -3, -4, -12, -84}, {}, {13, 13, 13, 13} },
T(tvabsf, pmpi, {}, pi ),
T(tvsqrtf, { 1, 4, 9, 16}, {}, {1, 2, 3, 4} ),
T(tvtruncf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } ),
T(tvceilf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } ),
T(tvfloorf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } ),
T(tqconjf, one, {}, { -1, -1, -1, 1 } ),
T(tmagnitudef, { 3, 4, 12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { 3, 4, 12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { 3, 4, -12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { 3, 4, -12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { 3, -4, 12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { 3, -4, 12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { 3, -4, -12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { 3, -4, -12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, 4, 12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, 4, 12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, 4, -12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, 4, -12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, -4, 12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, -4, 12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, -4, -12, 84}, {}, {85, 85, 85, 85} ),
T(tmagnitudef, { -3, -4, -12, -84}, {}, {85, 85, 85, 85} ),
T(tmagnitude3f, { -3, -4, -12, -84}, {}, {13, 13, 13, 13} ),
};
#define num_vec4f_tests (sizeof (vec4f_tests) / (sizeof (vec4f_tests[0])))
static mat4f_test_t mat4f_tests[] = {
{ mmulf, identity, identity, identity },
{ mmulf, rotate120, identity, rotate120 },
{ mmulf, identity, rotate120, rotate120 },
{ mmulf, rotate120, rotate120, rotate240 },
{ mmulf, rotate120, rotate240, identity },
{ mmulf, rotate240, rotate120, identity },
T(mmulf, identity, identity, identity ),
T(mmulf, rotate120, identity, rotate120 ),
T(mmulf, identity, rotate120, rotate120 ),
T(mmulf, rotate120, rotate120, rotate240 ),
T(mmulf, rotate120, rotate240, identity ),
T(mmulf, rotate240, rotate120, identity ),
};
#define num_mat4f_tests (sizeof (mat4f_tests) / (sizeof (mat4f_tests[0])))
static mv4f_test_t mv4f_tests[] = {
{ mvmulf, identity, { 1, 0, 0, 0 }, { 1, 0, 0, 0 } },
{ mvmulf, identity, { 0, 1, 0, 0 }, { 0, 1, 0, 0 } },
{ mvmulf, identity, { 0, 0, 1, 0 }, { 0, 0, 1, 0 } },
{ mvmulf, identity, { 0, 0, 0, 1 }, { 0, 0, 0, 1 } },
{ mvmulf, rotate120, { 1, 2, 3, 4 }, { 3, 1, 2, 4 } },
{ mvmulf, rotate240, { 1, 2, 3, 4 }, { 2, 3, 1, 4 } },
T(mvmulf, identity, { 1, 0, 0, 0 }, { 1, 0, 0, 0 } ),
T(mvmulf, identity, { 0, 1, 0, 0 }, { 0, 1, 0, 0 } ),
T(mvmulf, identity, { 0, 0, 1, 0 }, { 0, 0, 1, 0 } ),
T(mvmulf, identity, { 0, 0, 0, 1 }, { 0, 0, 0, 1 } ),
T(mvmulf, rotate120, { 1, 2, 3, 4 }, { 3, 1, 2, 4 } ),
T(mvmulf, rotate240, { 1, 2, 3, 4 }, { 2, 3, 1, 4 } ),
};
#define num_mv4f_tests (sizeof (mv4f_tests) / (sizeof (mv4f_tests[0])))
// expect filled in using non-simd QuatToMatrix (has its own tests)
static mq4f_test_t mq4f_tests[] = {
{ mat4fquat, { 0, 0, 0, 1 } },
{ mat4fquat, { 0.5, 0.5, 0.5, 0.5 } },
{ mat4fquat, { 0.5, 0.5, -0.5, 0.5 } },
{ mat4fquat, { 0.5, -0.5, 0.5, 0.5 } },
{ mat4fquat, { 0.5, -0.5, -0.5, 0.5 } },
{ mat4fquat, { -0.5, 0.5, 0.5, 0.5 } },
{ mat4fquat, { -0.5, 0.5, -0.5, 0.5 } },
{ mat4fquat, { -0.5, -0.5, 0.5, 0.5 } },
{ mat4fquat, { -0.5, -0.5, -0.5, 0.5 } },
T(mat4fquat, { 0, 0, 0, 1 } ),
T(mat4fquat, { 0.5, 0.5, 0.5, 0.5 } ),
T(mat4fquat, { 0.5, 0.5, -0.5, 0.5 } ),
T(mat4fquat, { 0.5, -0.5, 0.5, 0.5 } ),
T(mat4fquat, { 0.5, -0.5, -0.5, 0.5 } ),
T(mat4fquat, { -0.5, 0.5, 0.5, 0.5 } ),
T(mat4fquat, { -0.5, 0.5, -0.5, 0.5 } ),
T(mat4fquat, { -0.5, -0.5, 0.5, 0.5 } ),
T(mat4fquat, { -0.5, -0.5, -0.5, 0.5 } ),
};
#define num_mq4f_tests (sizeof (mq4f_tests) / (sizeof (mq4f_tests[0])))
#ifdef __AVX__
static int
run_vec4d_tests (void)
{
@ -443,7 +482,7 @@ run_vec4d_tests (void)
vec4l_t res = result != expect;
if (res[0] || res[1] || res[2] || res[3]) {
ret |= 1;
printf ("\nrun_vec4d_tests %zd\n", i);
printf ("\nrun_vec4d_tests %zd, line %d\n", i, test->line);
printf ("a: " VEC4D_FMT "\n", VEC4_EXP(test->a));
printf ("b: " VEC4D_FMT "\n", VEC4_EXP(test->b));
printf ("r: " VEC4D_FMT "\n", VEC4_EXP(result));
@ -455,6 +494,7 @@ run_vec4d_tests (void)
}
return ret;
}
#endif
static int
run_vec4f_tests (void)
@ -465,10 +505,10 @@ run_vec4f_tests (void)
__auto_type test = &vec4f_tests[i];
vec4f_t result = test->op (test->a, test->b);
vec4f_t expect = test->expect + test->ulp_errors;
vec4i_t res = result != expect;
vec4i_t res = (vec4i_t) result != (vec4i_t) expect;
if (res[0] || res[1] || res[2] || res[3]) {
ret |= 1;
printf ("\nrun_vec4f_tests %zd\n", i);
printf ("\nrun_vec4f_tests %zd, line %d\n", i, test->line);
printf ("a: " VEC4F_FMT "\n", VEC4_EXP(test->a));
printf ("b: " VEC4F_FMT "\n", VEC4_EXP(test->b));
printf ("r: " VEC4F_FMT "\n", VEC4_EXP(result));
@ -502,7 +542,7 @@ run_mat4f_tests (void)
}
if (fail) {
ret |= 1;
printf ("\nrun_mat4f_tests %zd\n", i);
printf ("\nrun_mat4f_tests %zd, line %d\n", i, test->line);
printf ("a: " VEC4F_FMT "\n", MAT4_ROW(test->a, 0));
printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 1));
printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 2));
@ -549,7 +589,7 @@ run_mv4f_tests (void)
if (res[0] || res[1] || res[2] || res[3]) {
ret |= 1;
printf ("\nrun_mv4f_tests %zd\n", i);
printf ("\nrun_mv4f_tests %zd, line %d\n", i, test->line);
printf ("a: " VEC4F_FMT "\n", MAT4_ROW(test->a, 0));
printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 1));
printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 2));
@ -595,7 +635,7 @@ run_mq4f_tests (void)
}
if (fail) {
ret |= 1;
printf ("\nrun_mq4f_tests %zd\n", i);
printf ("\nrun_mq4f_tests %zd, line %d\n", i, test->line);
printf ("q: " VEC4F_FMT "\n", VEC4_EXP(test->q));
printf ("r: " VEC4F_FMT "\n", MAT4_ROW(result, 0));
printf (" " VEC4F_FMT "\n", MAT4_ROW(result, 1));
@ -626,7 +666,9 @@ int
main (void)
{
int ret = 0;
#ifdef __AVX__
ret |= run_vec4d_tests ();
#endif
ret |= run_vec4f_tests ();
ret |= run_mat4f_tests ();
ret |= run_mv4f_tests ();

View file

@ -86,6 +86,7 @@ SimpleFlood (basethread_t *thread, portal_t *srcportal, int clusternum)
static inline int
test_sphere (const vspheref_t *sphere, vec4f_t plane)
{
#ifdef __SSE3__
const vec4f_t zero = {};
float r = sphere->radius;
vec4f_t eps = { r, r, r, r };
@ -94,6 +95,12 @@ test_sphere (const vspheref_t *sphere, vec4f_t plane)
c = (vec4i_t) _mm_hsub_epi32 ((__m128i) c, (__m128i) c);
return c[0];
#else
float d = DotProduct (sphere->center, plane) + plane[3];
int front = (d >= sphere->radius);
int back = (d <= -sphere->radius);
return front - back;
#endif
}
void

View file

@ -214,12 +214,20 @@ NewFlippedWinding (threaddata_t *thread, const winding_t *w)
/** Classify a signed distance against the ON_EPSILON dead zone.

    Yields +1 when the distance is more than ON_EPSILON in front, -1 when
    it is more than ON_EPSILON behind, and 0 when it lies within the
    epsilon band. Both code paths use strict comparisons so they agree when
    the distance is exactly +/-ON_EPSILON.

    \param dist     The distance to classify. The SIMD path reads lanes 0
                    and 1 for element 0 of the result, while the scalar path
                    reads only lane 0, so callers presumably pass the same
                    distance in every lane (TODO confirm against callers).
    \return         The classification. The scalar path replicates it into
                    all four lanes; the SIMD path guarantees it only for
                    lanes that were built from equal input lanes.
*/
static vec4i_t
signeps (vec4f_t dist)
{
    // _mm_addsub_ps only needs SSE3, but _mm_hsub_epi32 is an SSSE3
    // instruction, so the whole SIMD path must be gated on SSSE3 (which
    // implies SSE3) or SSE3-only builds fail to compile.
#ifdef __SSSE3__
    const vec4f_t zero = {};
    const vec4f_t eps = { ON_EPSILON, ON_EPSILON, ON_EPSILON, ON_EPSILON };
    // d = { -dist[0], dist[1], -dist[2], dist[3] }
    vec4f_t d = _mm_addsub_ps (zero, dist);
    // vector compare: a lane is -1 where true, 0 where false, so
    // c = { back, front, back, front } as 0/-1 masks
    vec4i_t c = (d - eps) > 0;
    // horizontal subtract: c[0] - c[1] = (-back) - (-front) = front - back
    c = (vec4i_t) _mm_hsub_epi32 ((__m128i) c, (__m128i) c);
    return c;
#else
    // Compare against a float epsilon, as the SIMD path does, and use
    // strict inequalities to match its (d - eps) > 0 test.
    const float eps = ON_EPSILON;
    float d = dist[0];
    int front = (d > eps);
    int back = (d < -eps);
    int i = front - back;
    return (vec4i_t) { i, i, i, i };
#endif
}
static vec4f_t
@ -246,7 +254,12 @@ split_edge (const vec4f_t *points, const vec4f_t *dists,
vec4i_t x = _mm_and_ps (split, (__m128) nan) == onenan;
// plane vector has -dist in w
vec4f_t y = _mm_and_ps (split, (__m128) x) * -split[3];
#ifdef __SSE3__
mid = _mm_blendv_ps (mid, y, (__m128) x);
#else
mid = (vec4f_t) ((vec4i_t) _mm_and_ps (y, (__m128) x) |
(vec4i_t) _mm_and_ps (mid, (__m128) ~x));
#endif
if (isnan (mid[0])) *(int *) 0 = 0;
return mid;
}