From 778c07e91fb975645116a0e10829722a55459f62 Mon Sep 17 00:00:00 2001 From: Bill Currie Date: Mon, 24 May 2021 15:02:18 +0900 Subject: [PATCH] [util] Get vectors working for non-SSE archs GCC does a fairly nice job of producing code for vector types when the hardware doesn't support SIMD, but it seems to break certain math optimization rules due to excess precision (?). Still, it works well enough for the core engine, but may not be well suited to the tools. However, so far, only qfvis uses vector types (and it's not tested yet), and tools should probably be used on suitable machines anyway (not forced, of course). --- config.d/compiling.m4 | 5 +- include/QF/simd/types.h | 30 +++ include/QF/simd/vec4d.h | 3 + include/QF/simd/vec4f.h | 61 +++++ libs/util/test/test-simd.c | 488 ++++++++++++++++++---------------- tools/qfvis/source/base-vis.c | 7 + tools/qfvis/source/qfvis.c | 13 + 7 files changed, 383 insertions(+), 224 deletions(-) diff --git a/config.d/compiling.m4 b/config.d/compiling.m4 index b432c1709..9f8dcf1e0 100644 --- a/config.d/compiling.m4 +++ b/config.d/compiling.m4 @@ -81,7 +81,10 @@ AC_ARG_ENABLE(optimize, optimize=yes ) -QF_CC_OPTION(-mavx2) +QF_CC_OPTION(-Wno-psabi) +dnl QF_CC_OPTION(-msse2) +dnl QF_CC_OPTION(-Wno-psabi) +dnl QF_CC_OPTION(-mavx2) dnl fma is not used as it is the equivalent of turning on dnl -funsafe-math-optimizations dnl QF_CC_OPTION(-mfma) diff --git a/include/QF/simd/types.h b/include/QF/simd/types.h index c5499f5bb..bfe484ace 100644 --- a/include/QF/simd/types.h +++ b/include/QF/simd/types.h @@ -79,7 +79,11 @@ VEC_TYPE (float, vec4f_t); VEC_TYPE (int, vec4i_t); #define VEC4D_FMT "[%.17g, %.17g, %.17g, %.17g]" +#if __WORDSIZE == 64 #define VEC4L_FMT "[%ld, %ld, %ld, %ld]" +#else +#define VEC4L_FMT "[%lld, %lld, %lld, %lld]" +#endif #define VEC4F_FMT "[%.9g, %.9g, %.9g, %.9g]" #define VEC4I_FMT "[%d, %d, %d, %d]" #define VEC4_EXP(v) (v)[0], (v)[1], (v)[2], (v)[3] @@ -94,4 +98,30 @@ typedef struct vspheref_s { float radius; } 
vspheref_t; +#include +#ifndef __SSE__ +#define _mm_xor_ps __qf_mm_xor_ps +#define _mm_and_ps __qf_mm_and_ps +GNU89INLINE inline __m128 _mm_xor_ps (__m128 a, __m128 b); +GNU89INLINE inline __m128 _mm_and_ps (__m128 a, __m128 b); +#ifndef IMPLEMENT_MAT4F_Funcs +GNU89INLINE inline +#else +VISIBLE +#endif +__m128 _mm_xor_ps (__m128 a, __m128 b) +{ + return (__m128) ((vec4i_t) a ^ (vec4i_t) b); +} +#ifndef IMPLEMENT_MAT4F_Funcs +GNU89INLINE inline +#else +VISIBLE +#endif +__m128 _mm_and_ps (__m128 a, __m128 b) +{ + return (__m128) ((vec4i_t) a & (vec4i_t) b); +} +#endif + #endif//__QF_simd_types_h diff --git a/include/QF/simd/vec4d.h b/include/QF/simd/vec4d.h index 01fa42d88..f1c6d6a49 100644 --- a/include/QF/simd/vec4d.h +++ b/include/QF/simd/vec4d.h @@ -28,6 +28,7 @@ #ifndef __QF_simd_vec4d_h #define __QF_simd_vec4d_h +#ifdef __AVX__ #include #include "QF/simd/types.h" @@ -292,4 +293,6 @@ storevec3d (double v3[3], vec4d_t v4) v3[2] = v4[2]; } +#endif + #endif//__QF_simd_vec4d_h diff --git a/include/QF/simd/vec4f.h b/include/QF/simd/vec4f.h index 82addbfaf..07498d706 100644 --- a/include/QF/simd/vec4f.h +++ b/include/QF/simd/vec4f.h @@ -110,7 +110,11 @@ vabsf (vec4f_t v) { const uint32_t nan = ~0u >> 1; const vec4i_t abs = { nan, nan, nan, nan }; +#ifndef __SSE__ + return (vec4f_t) ((vec4i_t) v & abs); +#else return _mm_and_ps (v, (__m128) abs); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -121,7 +125,12 @@ VISIBLE vec4f_t vsqrtf (vec4f_t v) { +#ifndef __SSE__ + vec4f_t r = { sqrtf (v[0]), sqrtf (v[1]), sqrtf (v[2]), sqrtf (v[3]) }; + return r; +#else return _mm_sqrt_ps (v); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -132,7 +141,16 @@ VISIBLE vec4f_t vceilf (vec4f_t v) { +#ifndef __SSE4_1__ + return (vec4f_t) { + ceilf (v[0]), + ceilf (v[1]), + ceilf (v[2]), + ceilf (v[3]) + }; +#else return _mm_ceil_ps (v); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -143,7 +161,16 @@ VISIBLE vec4f_t vfloorf (vec4f_t v) { +#ifndef __SSE4_1__ + return (vec4f_t) { + floorf (v[0]), + 
floorf (v[1]), + floorf (v[2]), + floorf (v[3]) + }; +#else return _mm_floor_ps (v); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -154,7 +181,16 @@ VISIBLE vec4f_t vtruncf (vec4f_t v) { +#ifndef __SSE4_1__ + return (vec4f_t) { + truncf (v[0]), + truncf (v[1]), + truncf (v[2]), + truncf (v[3]) + }; +#else return _mm_round_ps (v, _MM_FROUND_TRUNC); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -179,8 +215,13 @@ vec4f_t dotf (vec4f_t a, vec4f_t b) { vec4f_t c = a * b; +#ifndef __SSE3__ + float x = c[0] + c[1] + c[2] + c[3]; + c = (vec4f_t) { x, x, x, x }; +#else c = _mm_hadd_ps (c, c); c = _mm_hadd_ps (c, c); +#endif return c; } @@ -197,7 +238,11 @@ qmulf (vec4f_t a, vec4f_t b) vec4f_t c = crossf (a, b) + a * b[3] + a[3] * b; vec4f_t d = dotf (a, b); // zero out the vector component of dot product so only the scalar remains +#ifndef __SSE4_1__ + d = (vec4f_t) { 0, 0, 0, d[3] }; +#else d = _mm_insert_ps (d, d, 0xf7); +#endif return c - d; } @@ -212,7 +257,11 @@ qvmulf (vec4f_t q, vec4f_t v) float s = q[3]; // zero the scalar of the quaternion. Results in an extra operation, but // avoids adding precision issues. +#ifndef __SSE4_1__ + q[3] = 0; +#else q = _mm_insert_ps (q, q, 0xf8); +#endif vec4f_t c = crossf (q, v); vec4f_t qv = dotf (q, v); // q.w is 0 so v.w is irrelevant vec4f_t qq = dotf (q, q); @@ -231,7 +280,11 @@ vqmulf (vec4f_t v, vec4f_t q) float s = q[3]; // zero the scalar of the quaternion. Results in an extra operation, but // avoids adding precision issues. 
+#ifndef __SSE4_1__ + q[3] = 0; +#else q = _mm_insert_ps (q, q, 0xf8); +#endif vec4f_t c = crossf (q, v); vec4f_t qv = dotf (q, v); // q.w is 0 so v.w is irrelevant vec4f_t qq = dotf (q, q); @@ -266,7 +319,11 @@ vec4f_t qconjf (vec4f_t q) { const vec4i_t neg = { 1u << 31, 1u << 31, 1u << 31, 0 }; +#ifndef __SSE__ + return (vec4f_t) ((vec4i_t) q ^ neg); +#else return _mm_xor_ps (q, (__m128) neg); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -299,6 +356,9 @@ loadvec3f (const float v3[3]) { vec4f_t v4; +#ifndef __SSE4_1__ + v4 = (vec4f_t) { v3[0], v3[1], v3[2], 0 }; +#else // this had to be in asm otherwise gcc thinks v4 is only partially // initialized, and gcc 10 does not use the zero flags when generating // the code, resulting in a memory access to load a 0 into v4[3] @@ -311,6 +371,7 @@ loadvec3f (const float v3[3]) " : "=v"(v4) : "m"(v3[0]), "m"(v3[1]), "m"(v3[2])); +#endif return v4; } diff --git a/libs/util/test/test-simd.c b/libs/util/test/test-simd.c index ebaab413f..ceb9ad7d4 100644 --- a/libs/util/test/test-simd.c +++ b/libs/util/test/test-simd.c @@ -48,15 +48,19 @@ #define s05 0.70710678118654757 +#ifdef __AVX__ typedef struct { + int line; vec4d_t (*op) (vec4d_t a, vec4d_t b); vec4d_t a; vec4d_t b; vec4d_t expect; vec4d_t ulp_errors; } vec4d_test_t; +#endif typedef struct { + int line; vec4f_t (*op) (vec4f_t a, vec4f_t b); vec4f_t a; vec4f_t b; @@ -65,6 +69,7 @@ typedef struct { } vec4f_test_t; typedef struct { + int line; void (*op) (mat4f_t c, const mat4f_t a, const mat4f_t b); mat4f_t a; mat4f_t b; @@ -73,6 +78,7 @@ typedef struct { } mat4f_test_t; typedef struct { + int line; vec4f_t (*op) (const mat4f_t a, vec4f_t b); mat4f_t a; vec4f_t b; @@ -81,12 +87,14 @@ typedef struct { } mv4f_test_t; typedef struct { + int line; void (*op) (mat4f_t m, vec4f_t q); vec4f_t q; mat4f_t expect; mat4f_t ulp_errors; } mq4f_test_t; +#ifdef __AVX__ static vec4d_t tvtruncd (vec4d_t v, vec4d_t ignore) { return vtruncd (v); @@ -106,6 +114,7 @@ static vec4d_t tqconjd 
(vec4d_t v, vec4d_t ignore) { return qconjd (v); } +#endif static vec4f_t tvtruncf (vec4f_t v, vec4f_t ignore) { @@ -147,290 +156,320 @@ static vec4f_t tmagnitude3f (vec4f_t v, vec4f_t ignore) return magnitude3f (v); } +#define T(t...) { __LINE__, t } + +#ifdef __AVX__ static vec4d_test_t vec4d_tests[] = { // 3D dot products - { dotd, right, right, one }, - { dotd, right, forward, zero }, - { dotd, right, up, zero }, - { dotd, forward, right, zero }, - { dotd, forward, forward, one }, - { dotd, forward, up, zero }, - { dotd, up, right, zero }, - { dotd, up, forward, zero }, - { dotd, up, up, one }, + T(dotd, right, right, one ), + T(dotd, right, forward, zero ), + T(dotd, right, up, zero ), + T(dotd, forward, right, zero ), + T(dotd, forward, forward, one ), + T(dotd, forward, up, zero ), + T(dotd, up, right, zero ), + T(dotd, up, forward, zero ), + T(dotd, up, up, one ), // one is 4D, so its self dot product is 4 - { dotd, one, one, { 4, 4, 4, 4} }, - { dotd, one, none, {-4, -4, -4, -4} }, + T(dotd, one, one, { 4, 4, 4, 4} ), + T(dotd, one, none, {-4, -4, -4, -4} ), // 3D cross products - { crossd, right, right, zero }, - { crossd, right, forward, up }, - { crossd, right, up, nforward }, - { crossd, forward, right, nup }, - { crossd, forward, forward, zero }, - { crossd, forward, up, right }, - { crossd, up, right, forward }, - { crossd, up, forward, nright }, - { crossd, up, up, zero }, + T(crossd, right, right, zero ), + T(crossd, right, forward, up ), + T(crossd, right, up, nforward ), + T(crossd, forward, right, nup ), + T(crossd, forward, forward, zero ), + T(crossd, forward, up, right ), + T(crossd, up, right, forward ), + T(crossd, up, forward, nright ), + T(crossd, up, up, zero ), // double whammy tests: cross product with an angled vector and // ensuring that a 4d vector (non-zero w component) does not affect // the result, including the result's w component remaining zero. 
- { crossd, right, one, { 0, -1, 1} }, - { crossd, forward, one, { 1, 0, -1} }, - { crossd, up, one, {-1, 1, 0} }, - { crossd, one, right, { 0, 1, -1} }, - { crossd, one, forward, {-1, 0, 1} }, - { crossd, one, up, { 1, -1, 0} }, + T(crossd, right, one, { 0, -1, 1} ), + T(crossd, forward, one, { 1, 0, -1} ), + T(crossd, up, one, {-1, 1, 0} ), + T(crossd, one, right, { 0, 1, -1} ), + T(crossd, one, forward, {-1, 0, 1} ), + T(crossd, one, up, { 1, -1, 0} ), // This one fails when optimizing with -mfma (which is why fma is not // used): ulp errors in z and w - { crossd, qtest, qtest, {0, 0, 0, 0} }, + T(crossd, qtest, qtest, {0, 0, 0, 0} ), - { qmuld, qident, qident, qident }, - { qmuld, qident, right, right }, - { qmuld, qident, forward, forward }, - { qmuld, qident, up, up }, - { qmuld, right, qident, right }, - { qmuld, forward, qident, forward }, - { qmuld, up, qident, up }, - { qmuld, right, right, nqident }, - { qmuld, right, forward, up }, - { qmuld, right, up, nforward }, - { qmuld, forward, right, nup }, - { qmuld, forward, forward, nqident }, - { qmuld, forward, up, right }, - { qmuld, up, right, forward }, - { qmuld, up, forward, nright }, - { qmuld, up, up, nqident }, - { qmuld, one, one, { 2, 2, 2, -2 } }, - { qmuld, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } }, + T(qmuld, qident, qident, qident ), + T(qmuld, qident, right, right ), + T(qmuld, qident, forward, forward ), + T(qmuld, qident, up, up ), + T(qmuld, right, qident, right ), + T(qmuld, forward, qident, forward ), + T(qmuld, up, qident, up ), + T(qmuld, right, right, nqident ), + T(qmuld, right, forward, up ), + T(qmuld, right, up, nforward ), + T(qmuld, forward, right, nup ), + T(qmuld, forward, forward, nqident ), + T(qmuld, forward, up, right ), + T(qmuld, up, right, forward ), + T(qmuld, up, forward, nright ), + T(qmuld, up, up, nqident ), + T(qmuld, one, one, { 2, 2, 2, -2 } ), + T(qmuld, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } ), // This one fails when optimizing with -mfma (which is why fma is not 
// used): ulp error in z - { qmuld, qtest, qtest, {0.768, 0.576, 0, -0.28} }, + T(qmuld, qtest, qtest, {0.768, 0.576, 0, -0.28} ), // The one vector is not unit (magnitude 2), so using it as a rotation // quaternion results in scaling by 4. However, it still has the effect // of rotating 120 degrees around the axis equidistant from the three // orthogonal axes such that x->y->z->x - { qvmuld, one, right, { 0, 4, 0, 0 } }, - { qvmuld, one, forward, { 0, 0, 4, 0 } }, - { qvmuld, one, up, { 4, 0, 0, 0 } }, - { qvmuld, one, {1,1,1,0}, { 4, 4, 4, 0 } }, - { qvmuld, one, one, { 4, 4, 4, -2 } }, + T(qvmuld, one, right, { 0, 4, 0, 0 } ), + T(qvmuld, one, forward, { 0, 0, 4, 0 } ), + T(qvmuld, one, up, { 4, 0, 0, 0 } ), + T(qvmuld, one, {1,1,1,0}, { 4, 4, 4, 0 } ), + T(qvmuld, one, one, { 4, 4, 4, -2 } ), // inverse rotation, so x->z->y->x - { vqmuld, right, one, { 0, 0, 4, 0 } }, - { vqmuld, forward, one, { 4, 0, 0, 0 } }, - { vqmuld, up, one, { 0, 4, 0, 0 } }, - { vqmuld, {1,1,1,0}, one, { 4, 4, 4, 0 } }, - { vqmuld, one, one, { 4, 4, 4, -2 } }, + T(vqmuld, right, one, { 0, 0, 4, 0 } ), + T(vqmuld, forward, one, { 4, 0, 0, 0 } ), + T(vqmuld, up, one, { 0, 4, 0, 0 } ), + T(vqmuld, {1,1,1,0}, one, { 4, 4, 4, 0 } ), + T(vqmuld, one, one, { 4, 4, 4, -2 } ), // The half vector is unit. - { qvmuld, half, right, forward }, - { qvmuld, half, forward, up }, - { qvmuld, half, up, right }, - { qvmuld, half, {1,1,1,0}, { 1, 1, 1, 0 } }, + T(qvmuld, half, right, forward ), + T(qvmuld, half, forward, up ), + T(qvmuld, half, up, right ), + T(qvmuld, half, {1,1,1,0}, { 1, 1, 1, 0 } ), // inverse - { vqmuld, right, half, up }, - { vqmuld, forward, half, right }, - { vqmuld, up, half, forward }, - { vqmuld, {1,1,1,0}, half, { 1, 1, 1, 0 } }, + T(vqmuld, right, half, up ), + T(vqmuld, forward, half, right ), + T(vqmuld, up, half, forward ), + T(vqmuld, {1,1,1,0}, half, { 1, 1, 1, 0 } ), // one is a 4D vector and qvmuld is meant for 3D vectors. 
However, it // seems that the vector's w has no effect on the 3d portion of the // result, but the result's w is cosine of the full rotation angle // scaled by quaternion magnitude and vector w - { qvmuld, half, one, { 1, 1, 1, -0.5 } }, - { qvmuld, half, {2,2,2,2}, { 2, 2, 2, -1 } }, - { qvmuld, qtest, right, {0.5392, 0.6144, -0.576, 0} }, - { qvmuld, qtest, forward, {0.6144, 0.1808, 0.768, 0}, - {0, -2.7e-17, 0, 0} }, - { qvmuld, qtest, up, {0.576, -0.768, -0.28, 0} }, + T(qvmuld, half, one, { 1, 1, 1, -0.5 } ), + T(qvmuld, half, {2,2,2,2}, { 2, 2, 2, -1 } ), + T(qvmuld, qtest, right, {0.5392, 0.6144, -0.576, 0} ), + T(qvmuld, qtest, forward, {0.6144, 0.1808, 0.768, 0}, + {0, -2.7e-17, 0, 0} ), + T(qvmuld, qtest, up, {0.576, -0.768, -0.28, 0} ), // inverse - { vqmuld, one, half, { 1, 1, 1, -0.5 } }, - { vqmuld, {2,2,2,2}, half, { 2, 2, 2, -1 } }, - { vqmuld, right, qtest, {0.5392, 0.6144, 0.576, 0} }, - { vqmuld, forward, qtest, {0.6144, 0.1808, -0.768, 0}, - {0, -2.7e-17, 0, 0} }, - { vqmuld, up, qtest, {-0.576, 0.768, -0.28, 0} }, + T(vqmuld, one, half, { 1, 1, 1, -0.5 } ), + T(vqmuld, {2,2,2,2}, half, { 2, 2, 2, -1 } ), + T(vqmuld, right, qtest, {0.5392, 0.6144, 0.576, 0} ), + T(vqmuld, forward, qtest, {0.6144, 0.1808, -0.768, 0}, + {0, -2.7e-17, 0, 0} ), + T(vqmuld, up, qtest, {-0.576, 0.768, -0.28, 0} ), - { qrotd, right, right, qident }, - { qrotd, right, forward, { 0, 0, s05, s05 }, - {0, 0, -1.1e-16, 0} }, - { qrotd, right, up, { 0, -s05, 0, s05 }, - {0, 1.1e-16, 0, 0} }, - { qrotd, forward, right, { 0, 0, -s05, s05 }, - {0, 0, 1.1e-16, 0} }, - { qrotd, forward, forward, qident }, - { qrotd, forward, up, { s05, 0, 0, s05 }, - {-1.1e-16, 0, 0, 0} }, - { qrotd, up, right, { 0, s05, 0, s05 }, - {0, -1.1e-16, 0, 0} }, - { qrotd, up, forward, { -s05, 0, 0, s05 }, - { 1.1e-16, 0, 0, 0} }, - { qrotd, up, up, qident }, + T(qrotd, right, right, qident ), + T(qrotd, right, forward, { 0, 0, s05, s05 }, + {0, 0, -1.1e-16, 0} ), + T(qrotd, right, up, { 0, -s05, 0, s05 
}, + {0, 1.1e-16, 0, 0} ), + T(qrotd, forward, right, { 0, 0, -s05, s05 }, + {0, 0, 1.1e-16, 0} ), + T(qrotd, forward, forward, qident ), + T(qrotd, forward, up, { s05, 0, 0, s05 }, + {-1.1e-16, 0, 0, 0} ), + T(qrotd, up, right, { 0, s05, 0, s05 }, + {0, -1.1e-16, 0, 0} ), + T(qrotd, up, forward, { -s05, 0, 0, s05 }, + { 1.1e-16, 0, 0, 0} ), + T(qrotd, up, up, qident ), - { tvtruncd, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } }, - { tvceild, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } }, - { tvfloord, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } }, - { tqconjd, one, {}, { -1, -1, -1, 1 } }, + T(tvtruncd, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } ), + T(tvceild, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } ), + T(tvfloord, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } ), + T(tqconjd, one, {}, { -1, -1, -1, 1 } ), }; #define num_vec4d_tests (sizeof (vec4d_tests) / (sizeof (vec4d_tests[0]))) +#endif static vec4f_test_t vec4f_tests[] = { // 3D dot products - { dotf, right, right, one }, - { dotf, right, forward, zero }, - { dotf, right, up, zero }, - { dotf, forward, right, zero }, - { dotf, forward, forward, one }, - { dotf, forward, up, zero }, - { dotf, up, right, zero }, - { dotf, up, forward, zero }, - { dotf, up, up, one }, + T(dotf, right, right, one ), + T(dotf, right, forward, zero ), + T(dotf, right, up, zero ), + T(dotf, forward, right, zero ), + T(dotf, forward, forward, one ), + T(dotf, forward, up, zero ), + T(dotf, up, right, zero ), + T(dotf, up, forward, zero ), + T(dotf, up, up, one ), // one is 4D, so its self dot product is 4 - { dotf, one, one, { 4, 4, 4, 4} }, - { dotf, one, none, {-4, -4, -4, -4} }, + T(dotf, one, one, { 4, 4, 4, 4} ), + T(dotf, one, none, {-4, -4, -4, -4} ), // 3D cross products - { crossf, right, right, zero }, - { crossf, right, forward, up }, - { crossf, right, up, nforward }, - { crossf, forward, right, nup }, - { crossf, forward, forward, zero }, - { crossf, forward, up, right }, - { crossf, up, right, 
forward }, - { crossf, up, forward, nright }, - { crossf, up, up, zero }, + T(crossf, right, right, zero ), + T(crossf, right, forward, up ), + T(crossf, right, up, nforward ), + T(crossf, forward, right, nup ), + T(crossf, forward, forward, zero ), + T(crossf, forward, up, right ), + T(crossf, up, right, forward ), + T(crossf, up, forward, nright ), + T(crossf, up, up, zero ), // double whammy tests: cross product with an angled vector and // ensuring that a 4d vector (non-zero w component) does not affect // the result, including the result's w component remaining zero. - { crossf, right, one, { 0, -1, 1} }, - { crossf, forward, one, { 1, 0, -1} }, - { crossf, up, one, {-1, 1, 0} }, - { crossf, one, right, { 0, 1, -1} }, - { crossf, one, forward, {-1, 0, 1} }, - { crossf, one, up, { 1, -1, 0} }, - { crossf, qtest, qtest, {0, 0, 0, 0} }, + T(crossf, right, one, { 0, -1, 1} ), + T(crossf, forward, one, { 1, 0, -1} ), + T(crossf, up, one, {-1, 1, 0} ), + T(crossf, one, right, { 0, 1, -1} ), + T(crossf, one, forward, {-1, 0, 1} ), + T(crossf, one, up, { 1, -1, 0} ), + T(crossf, qtest, qtest, {0, 0, 0, 0} ), - { qmulf, qident, qident, qident }, - { qmulf, qident, right, right }, - { qmulf, qident, forward, forward }, - { qmulf, qident, up, up }, - { qmulf, right, qident, right }, - { qmulf, forward, qident, forward }, - { qmulf, up, qident, up }, - { qmulf, right, right, nqident }, - { qmulf, right, forward, up }, - { qmulf, right, up, nforward }, - { qmulf, forward, right, nup }, - { qmulf, forward, forward, nqident }, - { qmulf, forward, up, right }, - { qmulf, up, right, forward }, - { qmulf, up, forward, nright }, - { qmulf, up, up, nqident }, - { qmulf, one, one, { 2, 2, 2, -2 } }, - { qmulf, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } }, - { qmulf, qtest, qtest, {0.768, 0.576, 0, -0.28}, - {0, 6e-8, 0, 3e-8} }, + T(qmulf, qident, qident, qident ), + T(qmulf, qident, right, right ), + T(qmulf, qident, forward, forward ), + T(qmulf, qident, up, up ), + T(qmulf, right, 
qident, right ), + T(qmulf, forward, qident, forward ), + T(qmulf, up, qident, up ), + T(qmulf, right, right, nqident ), + T(qmulf, right, forward, up ), + T(qmulf, right, up, nforward ), + T(qmulf, forward, right, nup ), + T(qmulf, forward, forward, nqident ), + T(qmulf, forward, up, right ), + T(qmulf, up, right, forward ), + T(qmulf, up, forward, nright ), + T(qmulf, up, up, nqident ), + T(qmulf, one, one, { 2, 2, 2, -2 } ), + T(qmulf, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } ), + T(qmulf, qtest, qtest, {0.768, 0.576, 0, -0.28}, +#ifndef __SSE__ + {0, 6e-8, 0, 6e-8} +#else + {0, 6e-8, 0, 3e-8} +#endif + ), // The one vector is not unit (magnitude 2), so using it as a rotation // quaternion results in scaling by 4. However, it still has the effect // of rotating 120 degrees around the axis equidistant from the three // orthogonal axes such that x->y->z->x - { qvmulf, one, right, { 0, 4, 0, 0 } }, - { qvmulf, one, forward, { 0, 0, 4, 0 } }, - { qvmulf, one, up, { 4, 0, 0, 0 } }, - { qvmulf, one, {1,1,1,0}, { 4, 4, 4, 0 } }, - { qvmulf, one, one, { 4, 4, 4, -2 } }, + T(qvmulf, one, right, { 0, 4, 0, 0 } ), + T(qvmulf, one, forward, { 0, 0, 4, 0 } ), + T(qvmulf, one, up, { 4, 0, 0, 0 } ), + T(qvmulf, one, {1,1,1,0}, { 4, 4, 4, 0 } ), + T(qvmulf, one, one, { 4, 4, 4, -2 } ), // inverse rotation, so x->z->y->x - { vqmulf, right, one, { 0, 0, 4, 0 } }, - { vqmulf, forward, one, { 4, 0, 0, 0 } }, - { vqmulf, up, one, { 0, 4, 0, 0 } }, - { vqmulf, {1,1,1,0}, one, { 4, 4, 4, 0 } }, - { vqmulf, one, one, { 4, 4, 4, -2 } }, + T(vqmulf, right, one, { 0, 0, 4, 0 } ), + T(vqmulf, forward, one, { 4, 0, 0, 0 } ), + T(vqmulf, up, one, { 0, 4, 0, 0 } ), + T(vqmulf, {1,1,1,0}, one, { 4, 4, 4, 0 } ), + T(vqmulf, one, one, { 4, 4, 4, -2 } ), // - { qvmulf, qtest, right, {0.5392, 0.6144, -0.576, 0}, - {0, -5.9e-8, -6e-8, 0} }, - { qvmulf, qtest, forward, {0.6144, 0.1808, 0.768, 0}, - {-5.9e-8, 1.5e-8, 0, 0} }, - { qvmulf, qtest, up, {0.576, -0.768, -0.28, 0}, - {6e-8, 0, 3e-8, 0} }, - { 
vqmulf, right, qtest, {0.5392, 0.6144, 0.576, 0}, - {0, -5.9e-8, 5.9e-8, 0} }, - { vqmulf, forward, qtest, {0.6144, 0.1808, -0.768, 0}, - {-5.9e-8, 1.5e-8, 0, 0} }, - { vqmulf, up, qtest, {-0.576, 0.768, -0.28, 0}, - {-5.9e-8, 0, 3e-8, 0} }, + T(qvmulf, qtest, right, {0.5392, 0.6144, -0.576, 0}, + {0, -5.9e-8, -6e-8, 0} ), + T(qvmulf, qtest, forward, {0.6144, 0.1808, 0.768, 0}, +#ifndef __SSE__ + {-5.9e-8, 3e-8, 0, 0} +#else + {-5.9e-8, 1.5e-8, 0, 0} +#endif + ), + T(qvmulf, qtest, up, {0.576, -0.768, -0.28, 0}, +#ifndef __SSE__ + {6e-8, 0, 6e-8, 0} +#else + {6e-8, 0, 3e-8, 0} +#endif + ), + T(vqmulf, right, qtest, {0.5392, 0.6144, 0.576, 0}, + {0, -5.9e-8, 5.9e-8, 0} ), + T(vqmulf, forward, qtest, {0.6144, 0.1808, -0.768, 0}, +#ifndef __SSE__ + {-5.9e-8, 3e-8, 0, 0} +#else + {-5.9e-8, 1.5e-8, 0, 0} +#endif + ), + T(vqmulf, up, qtest, {-0.576, 0.768, -0.28, 0}, +#ifndef __SSE__ + {-5.9e-8, 0, 6e-8, 0} +#else + {-5.9e-8, 0, 3e-8, 0} +#endif + ), - { qrotf, right, right, qident }, - { qrotf, right, forward, { 0, 0, s05, s05 } }, - { qrotf, right, up, { 0, -s05, 0, s05 } }, - { qrotf, forward, right, { 0, 0, -s05, s05 } }, - { qrotf, forward, forward, qident }, - { qrotf, forward, up, { s05, 0, 0, s05 } }, - { qrotf, up, right, { 0, s05, 0, s05 } }, - { qrotf, up, forward, { -s05, 0, 0, s05 } }, - { qrotf, up, up, qident }, + T(qrotf, right, right, qident ), + T(qrotf, right, forward, { 0, 0, s05, s05 } ), + T(qrotf, right, up, { 0, -s05, 0, s05 } ), + T(qrotf, forward, right, { 0, 0, -s05, s05 } ), + T(qrotf, forward, forward, qident ), + T(qrotf, forward, up, { s05, 0, 0, s05 } ), + T(qrotf, up, right, { 0, s05, 0, s05 } ), + T(qrotf, up, forward, { -s05, 0, 0, s05 } ), + T(qrotf, up, up, qident ), - { tvabsf, pmpi, {}, pi }, - { tvsqrtf, { 1, 4, 9, 16}, {}, {1, 2, 3, 4} }, - { tvtruncf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } }, - { tvceilf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } }, - { tvfloorf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } }, - { 
tqconjf, one, {}, { -1, -1, -1, 1 } }, - { tmagnitudef, { 3, 4, 12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, 4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, 4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, 4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, 12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, 4, 12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, 4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, 4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, 4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, 12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitude3f, { -3, -4, -12, -84}, {}, {13, 13, 13, 13} }, + T(tvabsf, pmpi, {}, pi ), + T(tvsqrtf, { 1, 4, 9, 16}, {}, {1, 2, 3, 4} ), + T(tvtruncf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } ), + T(tvceilf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } ), + T(tvfloorf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } ), + T(tqconjf, one, {}, { -1, -1, -1, 1 } ), + T(tmagnitudef, { 3, 4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, 4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, 4, -12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, 4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, -12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, -12, 
84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, -12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitude3f, { -3, -4, -12, -84}, {}, {13, 13, 13, 13} ), }; #define num_vec4f_tests (sizeof (vec4f_tests) / (sizeof (vec4f_tests[0]))) static mat4f_test_t mat4f_tests[] = { - { mmulf, identity, identity, identity }, - { mmulf, rotate120, identity, rotate120 }, - { mmulf, identity, rotate120, rotate120 }, - { mmulf, rotate120, rotate120, rotate240 }, - { mmulf, rotate120, rotate240, identity }, - { mmulf, rotate240, rotate120, identity }, + T(mmulf, identity, identity, identity ), + T(mmulf, rotate120, identity, rotate120 ), + T(mmulf, identity, rotate120, rotate120 ), + T(mmulf, rotate120, rotate120, rotate240 ), + T(mmulf, rotate120, rotate240, identity ), + T(mmulf, rotate240, rotate120, identity ), }; #define num_mat4f_tests (sizeof (mat4f_tests) / (sizeof (mat4f_tests[0]))) static mv4f_test_t mv4f_tests[] = { - { mvmulf, identity, { 1, 0, 0, 0 }, { 1, 0, 0, 0 } }, - { mvmulf, identity, { 0, 1, 0, 0 }, { 0, 1, 0, 0 } }, - { mvmulf, identity, { 0, 0, 1, 0 }, { 0, 0, 1, 0 } }, - { mvmulf, identity, { 0, 0, 0, 1 }, { 0, 0, 0, 1 } }, - { mvmulf, rotate120, { 1, 2, 3, 4 }, { 3, 1, 2, 4 } }, - { mvmulf, rotate240, { 1, 2, 3, 4 }, { 2, 3, 1, 4 } }, + T(mvmulf, identity, { 1, 0, 0, 0 }, { 1, 0, 0, 0 } ), + T(mvmulf, identity, { 0, 1, 0, 0 }, { 0, 1, 0, 0 } ), + T(mvmulf, identity, { 0, 0, 1, 0 }, { 0, 0, 1, 0 } ), + T(mvmulf, identity, { 0, 0, 0, 1 }, { 0, 0, 0, 1 } ), + T(mvmulf, rotate120, { 1, 2, 3, 4 }, { 3, 1, 2, 4 } ), + T(mvmulf, rotate240, { 1, 2, 3, 4 }, { 2, 3, 1, 4 } ), }; #define num_mv4f_tests (sizeof (mv4f_tests) / (sizeof (mv4f_tests[0]))) // expect filled in using non-simd QuatToMatrix (has its own tests) static 
mq4f_test_t mq4f_tests[] = { - { mat4fquat, { 0, 0, 0, 1 } }, - { mat4fquat, { 0.5, 0.5, 0.5, 0.5 } }, - { mat4fquat, { 0.5, 0.5, -0.5, 0.5 } }, - { mat4fquat, { 0.5, -0.5, 0.5, 0.5 } }, - { mat4fquat, { 0.5, -0.5, -0.5, 0.5 } }, - { mat4fquat, { -0.5, 0.5, 0.5, 0.5 } }, - { mat4fquat, { -0.5, 0.5, -0.5, 0.5 } }, - { mat4fquat, { -0.5, -0.5, 0.5, 0.5 } }, - { mat4fquat, { -0.5, -0.5, -0.5, 0.5 } }, + T(mat4fquat, { 0, 0, 0, 1 } ), + T(mat4fquat, { 0.5, 0.5, 0.5, 0.5 } ), + T(mat4fquat, { 0.5, 0.5, -0.5, 0.5 } ), + T(mat4fquat, { 0.5, -0.5, 0.5, 0.5 } ), + T(mat4fquat, { 0.5, -0.5, -0.5, 0.5 } ), + T(mat4fquat, { -0.5, 0.5, 0.5, 0.5 } ), + T(mat4fquat, { -0.5, 0.5, -0.5, 0.5 } ), + T(mat4fquat, { -0.5, -0.5, 0.5, 0.5 } ), + T(mat4fquat, { -0.5, -0.5, -0.5, 0.5 } ), }; #define num_mq4f_tests (sizeof (mq4f_tests) / (sizeof (mq4f_tests[0]))) +#ifdef __AVX__ static int run_vec4d_tests (void) { @@ -443,7 +482,7 @@ run_vec4d_tests (void) vec4l_t res = result != expect; if (res[0] || res[1] || res[2] || res[3]) { ret |= 1; - printf ("\nrun_vec4d_tests %zd\n", i); + printf ("\nrun_vec4d_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4D_FMT "\n", VEC4_EXP(test->a)); printf ("b: " VEC4D_FMT "\n", VEC4_EXP(test->b)); printf ("r: " VEC4D_FMT "\n", VEC4_EXP(result)); @@ -455,6 +494,7 @@ run_vec4d_tests (void) } return ret; } +#endif static int run_vec4f_tests (void) @@ -465,10 +505,10 @@ run_vec4f_tests (void) __auto_type test = &vec4f_tests[i]; vec4f_t result = test->op (test->a, test->b); vec4f_t expect = test->expect + test->ulp_errors; - vec4i_t res = result != expect; + vec4i_t res = (vec4i_t) result != (vec4i_t) expect; if (res[0] || res[1] || res[2] || res[3]) { ret |= 1; - printf ("\nrun_vec4f_tests %zd\n", i); + printf ("\nrun_vec4f_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4F_FMT "\n", VEC4_EXP(test->a)); printf ("b: " VEC4F_FMT "\n", VEC4_EXP(test->b)); printf ("r: " VEC4F_FMT "\n", VEC4_EXP(result)); @@ -502,7 +542,7 @@ run_mat4f_tests (void) } 
if (fail) { ret |= 1; - printf ("\nrun_mat4f_tests %zd\n", i); + printf ("\nrun_mat4f_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4F_FMT "\n", MAT4_ROW(test->a, 0)); printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 1)); printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 2)); @@ -549,7 +589,7 @@ run_mv4f_tests (void) if (res[0] || res[1] || res[2] || res[3]) { ret |= 1; - printf ("\nrun_mv4f_tests %zd\n", i); + printf ("\nrun_mv4f_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4F_FMT "\n", MAT4_ROW(test->a, 0)); printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 1)); printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 2)); @@ -595,7 +635,7 @@ run_mq4f_tests (void) } if (fail) { ret |= 1; - printf ("\nrun_mq4f_tests %zd\n", i); + printf ("\nrun_mq4f_tests %zd, line %d\n", i, test->line); printf ("q: " VEC4F_FMT "\n", VEC4_EXP(test->q)); printf ("r: " VEC4F_FMT "\n", MAT4_ROW(result, 0)); printf (" " VEC4F_FMT "\n", MAT4_ROW(result, 1)); @@ -626,7 +666,9 @@ int main (void) { int ret = 0; +#ifdef __AVX__ ret |= run_vec4d_tests (); +#endif ret |= run_vec4f_tests (); ret |= run_mat4f_tests (); ret |= run_mv4f_tests (); diff --git a/tools/qfvis/source/base-vis.c b/tools/qfvis/source/base-vis.c index 089b196f9..877f3f97d 100644 --- a/tools/qfvis/source/base-vis.c +++ b/tools/qfvis/source/base-vis.c @@ -86,6 +86,7 @@ SimpleFlood (basethread_t *thread, portal_t *srcportal, int clusternum) static inline int test_sphere (const vspheref_t *sphere, vec4f_t plane) { +#ifdef __SSE3__ const vec4f_t zero = {}; float r = sphere->radius; vec4f_t eps = { r, r, r, r }; @@ -94,6 +95,12 @@ test_sphere (const vspheref_t *sphere, vec4f_t plane) c = (vec4i_t) _mm_hsub_epi32 ((__m128i) c, (__m128i) c); return c[0]; +#else + float d = DotProduct (sphere->center, plane) + plane[3]; + int front = (d >= sphere->radius); + int back = (d <= -sphere->radius); + return front - back; +#endif } void diff --git a/tools/qfvis/source/qfvis.c b/tools/qfvis/source/qfvis.c index a1dc5ce33..156d1650f 100644 
--- a/tools/qfvis/source/qfvis.c +++ b/tools/qfvis/source/qfvis.c @@ -214,12 +214,20 @@ NewFlippedWinding (threaddata_t *thread, const winding_t *w) static vec4i_t signeps (vec4f_t dist) { +#ifdef __SSE3__ const vec4f_t zero = {}; const vec4f_t eps = { ON_EPSILON, ON_EPSILON, ON_EPSILON, ON_EPSILON }; vec4f_t d = _mm_addsub_ps (zero, dist); vec4i_t c = (d - eps) > 0; c = (vec4i_t) _mm_hsub_epi32 ((__m128i) c, (__m128i) c); return c; +#else + float d = dist[0]; + int front = (d >= ON_EPSILON); + int back = (d <= -ON_EPSILON); + int i = front - back; + return (vec4i_t) { i, i, i, i }; +#endif } static vec4f_t @@ -246,7 +254,12 @@ split_edge (const vec4f_t *points, const vec4f_t *dists, vec4i_t x = _mm_and_ps (split, (__m128) nan) == onenan; // plane vector has -dist in w vec4f_t y = _mm_and_ps (split, (__m128) x) * -split[3]; +#ifdef __SSE3__ mid = _mm_blendv_ps (mid, y, (__m128) x); +#else + mid = (vec4f_t) ((vec4i_t) _mm_and_ps (y, (__m128) x) | + (vec4i_t) _mm_and_ps (mid, (__m128) ~x)); +#endif if (isnan (mid[0])) *(int *) 0 = 0; return mid; }