diff --git a/config.d/compiling.m4 b/config.d/compiling.m4
index b432c1709..9f8dcf1e0 100644
--- a/config.d/compiling.m4
+++ b/config.d/compiling.m4
@@ -81,7 +81,10 @@ AC_ARG_ENABLE(optimize,
 	optimize=yes
 )
 
-QF_CC_OPTION(-mavx2)
+QF_CC_OPTION(-Wno-psabi)
+dnl QF_CC_OPTION(-msse2)
+dnl QF_CC_OPTION(-Wno-psabi)
+dnl QF_CC_OPTION(-mavx2)
 dnl fma is not used as it is the equivalent of turning on
 dnl -funsafe-math-optimizations
 dnl QF_CC_OPTION(-mfma)
diff --git a/include/QF/simd/types.h b/include/QF/simd/types.h
index c5499f5bb..bfe484ace 100644
--- a/include/QF/simd/types.h
+++ b/include/QF/simd/types.h
@@ -79,7 +79,11 @@ VEC_TYPE (float, vec4f_t);
 VEC_TYPE (int, vec4i_t);
 
 #define VEC4D_FMT "[%.17g, %.17g, %.17g, %.17g]"
+#if __WORDSIZE == 64
 #define VEC4L_FMT "[%ld, %ld, %ld, %ld]"
+#else
+#define VEC4L_FMT "[%lld, %lld, %lld, %lld]"
+#endif
 #define VEC4F_FMT "[%.9g, %.9g, %.9g, %.9g]"
 #define VEC4I_FMT "[%d, %d, %d, %d]"
 #define VEC4_EXP(v) (v)[0], (v)[1], (v)[2], (v)[3]
@@ -94,4 +98,32 @@ typedef struct vspheref_s {
 	float radius;
 } vspheref_t;
 
+#include <immintrin.h>
+// Without SSE, supply generic-vector stand-ins for the two bitwise
+// intrinsics so that code can keep using the _mm_* names.
+#ifndef __SSE__
+#define _mm_xor_ps __qf_mm_xor_ps
+#define _mm_and_ps __qf_mm_and_ps
+GNU89INLINE inline __m128 _mm_xor_ps (__m128 a, __m128 b);
+GNU89INLINE inline __m128 _mm_and_ps (__m128 a, __m128 b);
+#ifndef IMPLEMENT_MAT4F_Funcs
+GNU89INLINE inline
+#else
+VISIBLE
+#endif
+__m128 _mm_xor_ps (__m128 a, __m128 b)
+{
+	return (__m128) ((vec4i_t) a ^ (vec4i_t) b);
+}
+#ifndef IMPLEMENT_MAT4F_Funcs
+GNU89INLINE inline
+#else
+VISIBLE
+#endif
+__m128 _mm_and_ps (__m128 a, __m128 b)
+{
+	return (__m128) ((vec4i_t) a & (vec4i_t) b);
+}
+#endif
+
 #endif//__QF_simd_types_h
diff --git a/include/QF/simd/vec4d.h b/include/QF/simd/vec4d.h
index 01fa42d88..f1c6d6a49 100644
--- a/include/QF/simd/vec4d.h
+++ b/include/QF/simd/vec4d.h
@@ -28,6 +28,7 @@
 #ifndef __QF_simd_vec4d_h
 #define __QF_simd_vec4d_h
 
+#ifdef __AVX__
 #include <immintrin.h>
 
 #include "QF/simd/types.h"
@@ -292,4 +293,6 @@ storevec3d (double v3[3], vec4d_t v4)
 	v3[2] = 
v4[2]; } +#endif + #endif//__QF_simd_vec4d_h diff --git a/include/QF/simd/vec4f.h b/include/QF/simd/vec4f.h index 82addbfaf..07498d706 100644 --- a/include/QF/simd/vec4f.h +++ b/include/QF/simd/vec4f.h @@ -110,7 +110,11 @@ vabsf (vec4f_t v) { const uint32_t nan = ~0u >> 1; const vec4i_t abs = { nan, nan, nan, nan }; +#ifndef __SSE__ + return (vec4f_t) ((vec4i_t) v & abs); +#else return _mm_and_ps (v, (__m128) abs); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -121,7 +125,12 @@ VISIBLE vec4f_t vsqrtf (vec4f_t v) { +#ifndef __SSE__ + vec4f_t r = { sqrtf (v[0]), sqrtf (v[1]), sqrtf (v[2]), sqrtf (v[3]) }; + return r; +#else return _mm_sqrt_ps (v); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -132,7 +141,16 @@ VISIBLE vec4f_t vceilf (vec4f_t v) { +#ifndef __SSE4_1__ + return (vec4f_t) { + ceilf (v[0]), + ceilf (v[1]), + ceilf (v[2]), + ceilf (v[3]) + }; +#else return _mm_ceil_ps (v); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -143,7 +161,16 @@ VISIBLE vec4f_t vfloorf (vec4f_t v) { +#ifndef __SSE4_1__ + return (vec4f_t) { + floorf (v[0]), + floorf (v[1]), + floorf (v[2]), + floorf (v[3]) + }; +#else return _mm_floor_ps (v); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -154,7 +181,16 @@ VISIBLE vec4f_t vtruncf (vec4f_t v) { +#ifndef __SSE4_1__ + return (vec4f_t) { + truncf (v[0]), + truncf (v[1]), + truncf (v[2]), + truncf (v[3]) + }; +#else return _mm_round_ps (v, _MM_FROUND_TRUNC); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -179,8 +215,13 @@ vec4f_t dotf (vec4f_t a, vec4f_t b) { vec4f_t c = a * b; +#ifndef __SSE3__ + float x = c[0] + c[1] + c[2] + c[3]; + c = (vec4f_t) { x, x, x, x }; +#else c = _mm_hadd_ps (c, c); c = _mm_hadd_ps (c, c); +#endif return c; } @@ -197,7 +238,11 @@ qmulf (vec4f_t a, vec4f_t b) vec4f_t c = crossf (a, b) + a * b[3] + a[3] * b; vec4f_t d = dotf (a, b); // zero out the vector component of dot product so only the scalar remains +#ifndef __SSE4_1__ + d = (vec4f_t) { 0, 0, 0, d[3] }; +#else d = _mm_insert_ps (d, d, 0xf7); +#endif return c - d; } @@ 
-212,7 +257,11 @@ qvmulf (vec4f_t q, vec4f_t v) float s = q[3]; // zero the scalar of the quaternion. Results in an extra operation, but // avoids adding precision issues. +#ifndef __SSE4_1__ + q[3] = 0; +#else q = _mm_insert_ps (q, q, 0xf8); +#endif vec4f_t c = crossf (q, v); vec4f_t qv = dotf (q, v); // q.w is 0 so v.w is irrelevant vec4f_t qq = dotf (q, q); @@ -231,7 +280,11 @@ vqmulf (vec4f_t v, vec4f_t q) float s = q[3]; // zero the scalar of the quaternion. Results in an extra operation, but // avoids adding precision issues. +#ifndef __SSE4_1__ + q[3] = 0; +#else q = _mm_insert_ps (q, q, 0xf8); +#endif vec4f_t c = crossf (q, v); vec4f_t qv = dotf (q, v); // q.w is 0 so v.w is irrelevant vec4f_t qq = dotf (q, q); @@ -266,7 +319,11 @@ vec4f_t qconjf (vec4f_t q) { const vec4i_t neg = { 1u << 31, 1u << 31, 1u << 31, 0 }; +#ifndef __SSE__ + return (vec4f_t) ((vec4i_t) q ^ neg); +#else return _mm_xor_ps (q, (__m128) neg); +#endif } #ifndef IMPLEMENT_VEC4F_Funcs @@ -299,6 +356,9 @@ loadvec3f (const float v3[3]) { vec4f_t v4; +#ifndef __SSE4_1__ + v4 = (vec4f_t) { v3[0], v3[1], v3[2], 0 }; +#else // this had to be in asm otherwise gcc thinks v4 is only partially // initialized, and gcc 10 does not use the zero flags when generating // the code, resulting in a memory access to load a 0 into v4[3] @@ -311,6 +371,7 @@ loadvec3f (const float v3[3]) " : "=v"(v4) : "m"(v3[0]), "m"(v3[1]), "m"(v3[2])); +#endif return v4; } diff --git a/libs/util/test/test-simd.c b/libs/util/test/test-simd.c index ebaab413f..ceb9ad7d4 100644 --- a/libs/util/test/test-simd.c +++ b/libs/util/test/test-simd.c @@ -48,15 +48,19 @@ #define s05 0.70710678118654757 +#ifdef __AVX__ typedef struct { + int line; vec4d_t (*op) (vec4d_t a, vec4d_t b); vec4d_t a; vec4d_t b; vec4d_t expect; vec4d_t ulp_errors; } vec4d_test_t; +#endif typedef struct { + int line; vec4f_t (*op) (vec4f_t a, vec4f_t b); vec4f_t a; vec4f_t b; @@ -65,6 +69,7 @@ typedef struct { } vec4f_test_t; typedef struct { + int line; void 
(*op) (mat4f_t c, const mat4f_t a, const mat4f_t b); mat4f_t a; mat4f_t b; @@ -73,6 +78,7 @@ typedef struct { } mat4f_test_t; typedef struct { + int line; vec4f_t (*op) (const mat4f_t a, vec4f_t b); mat4f_t a; vec4f_t b; @@ -81,12 +87,14 @@ typedef struct { } mv4f_test_t; typedef struct { + int line; void (*op) (mat4f_t m, vec4f_t q); vec4f_t q; mat4f_t expect; mat4f_t ulp_errors; } mq4f_test_t; +#ifdef __AVX__ static vec4d_t tvtruncd (vec4d_t v, vec4d_t ignore) { return vtruncd (v); @@ -106,6 +114,7 @@ static vec4d_t tqconjd (vec4d_t v, vec4d_t ignore) { return qconjd (v); } +#endif static vec4f_t tvtruncf (vec4f_t v, vec4f_t ignore) { @@ -147,290 +156,320 @@ static vec4f_t tmagnitude3f (vec4f_t v, vec4f_t ignore) return magnitude3f (v); } +#define T(t...) { __LINE__, t } + +#ifdef __AVX__ static vec4d_test_t vec4d_tests[] = { // 3D dot products - { dotd, right, right, one }, - { dotd, right, forward, zero }, - { dotd, right, up, zero }, - { dotd, forward, right, zero }, - { dotd, forward, forward, one }, - { dotd, forward, up, zero }, - { dotd, up, right, zero }, - { dotd, up, forward, zero }, - { dotd, up, up, one }, + T(dotd, right, right, one ), + T(dotd, right, forward, zero ), + T(dotd, right, up, zero ), + T(dotd, forward, right, zero ), + T(dotd, forward, forward, one ), + T(dotd, forward, up, zero ), + T(dotd, up, right, zero ), + T(dotd, up, forward, zero ), + T(dotd, up, up, one ), // one is 4D, so its self dot product is 4 - { dotd, one, one, { 4, 4, 4, 4} }, - { dotd, one, none, {-4, -4, -4, -4} }, + T(dotd, one, one, { 4, 4, 4, 4} ), + T(dotd, one, none, {-4, -4, -4, -4} ), // 3D cross products - { crossd, right, right, zero }, - { crossd, right, forward, up }, - { crossd, right, up, nforward }, - { crossd, forward, right, nup }, - { crossd, forward, forward, zero }, - { crossd, forward, up, right }, - { crossd, up, right, forward }, - { crossd, up, forward, nright }, - { crossd, up, up, zero }, + T(crossd, right, right, zero ), + T(crossd, right, 
forward, up ), + T(crossd, right, up, nforward ), + T(crossd, forward, right, nup ), + T(crossd, forward, forward, zero ), + T(crossd, forward, up, right ), + T(crossd, up, right, forward ), + T(crossd, up, forward, nright ), + T(crossd, up, up, zero ), // double whammy tests: cross product with an angled vector and // ensuring that a 4d vector (non-zero w component) does not affect // the result, including the result's w component remaining zero. - { crossd, right, one, { 0, -1, 1} }, - { crossd, forward, one, { 1, 0, -1} }, - { crossd, up, one, {-1, 1, 0} }, - { crossd, one, right, { 0, 1, -1} }, - { crossd, one, forward, {-1, 0, 1} }, - { crossd, one, up, { 1, -1, 0} }, + T(crossd, right, one, { 0, -1, 1} ), + T(crossd, forward, one, { 1, 0, -1} ), + T(crossd, up, one, {-1, 1, 0} ), + T(crossd, one, right, { 0, 1, -1} ), + T(crossd, one, forward, {-1, 0, 1} ), + T(crossd, one, up, { 1, -1, 0} ), // This one fails when optimizing with -mfma (which is why fma is not // used): ulp errors in z and w - { crossd, qtest, qtest, {0, 0, 0, 0} }, + T(crossd, qtest, qtest, {0, 0, 0, 0} ), - { qmuld, qident, qident, qident }, - { qmuld, qident, right, right }, - { qmuld, qident, forward, forward }, - { qmuld, qident, up, up }, - { qmuld, right, qident, right }, - { qmuld, forward, qident, forward }, - { qmuld, up, qident, up }, - { qmuld, right, right, nqident }, - { qmuld, right, forward, up }, - { qmuld, right, up, nforward }, - { qmuld, forward, right, nup }, - { qmuld, forward, forward, nqident }, - { qmuld, forward, up, right }, - { qmuld, up, right, forward }, - { qmuld, up, forward, nright }, - { qmuld, up, up, nqident }, - { qmuld, one, one, { 2, 2, 2, -2 } }, - { qmuld, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } }, + T(qmuld, qident, qident, qident ), + T(qmuld, qident, right, right ), + T(qmuld, qident, forward, forward ), + T(qmuld, qident, up, up ), + T(qmuld, right, qident, right ), + T(qmuld, forward, qident, forward ), + T(qmuld, up, qident, up ), + T(qmuld, 
right, right, nqident ), + T(qmuld, right, forward, up ), + T(qmuld, right, up, nforward ), + T(qmuld, forward, right, nup ), + T(qmuld, forward, forward, nqident ), + T(qmuld, forward, up, right ), + T(qmuld, up, right, forward ), + T(qmuld, up, forward, nright ), + T(qmuld, up, up, nqident ), + T(qmuld, one, one, { 2, 2, 2, -2 } ), + T(qmuld, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } ), // This one fails when optimizing with -mfma (which is why fma is not // used): ulp error in z - { qmuld, qtest, qtest, {0.768, 0.576, 0, -0.28} }, + T(qmuld, qtest, qtest, {0.768, 0.576, 0, -0.28} ), // The one vector is not unit (magnitude 2), so using it as a rotation // quaternion results in scaling by 4. However, it still has the effect // of rotating 120 degrees around the axis equidistant from the three // orthogonal axes such that x->y->z->x - { qvmuld, one, right, { 0, 4, 0, 0 } }, - { qvmuld, one, forward, { 0, 0, 4, 0 } }, - { qvmuld, one, up, { 4, 0, 0, 0 } }, - { qvmuld, one, {1,1,1,0}, { 4, 4, 4, 0 } }, - { qvmuld, one, one, { 4, 4, 4, -2 } }, + T(qvmuld, one, right, { 0, 4, 0, 0 } ), + T(qvmuld, one, forward, { 0, 0, 4, 0 } ), + T(qvmuld, one, up, { 4, 0, 0, 0 } ), + T(qvmuld, one, {1,1,1,0}, { 4, 4, 4, 0 } ), + T(qvmuld, one, one, { 4, 4, 4, -2 } ), // inverse rotation, so x->z->y->x - { vqmuld, right, one, { 0, 0, 4, 0 } }, - { vqmuld, forward, one, { 4, 0, 0, 0 } }, - { vqmuld, up, one, { 0, 4, 0, 0 } }, - { vqmuld, {1,1,1,0}, one, { 4, 4, 4, 0 } }, - { vqmuld, one, one, { 4, 4, 4, -2 } }, + T(vqmuld, right, one, { 0, 0, 4, 0 } ), + T(vqmuld, forward, one, { 4, 0, 0, 0 } ), + T(vqmuld, up, one, { 0, 4, 0, 0 } ), + T(vqmuld, {1,1,1,0}, one, { 4, 4, 4, 0 } ), + T(vqmuld, one, one, { 4, 4, 4, -2 } ), // The half vector is unit. 
- { qvmuld, half, right, forward }, - { qvmuld, half, forward, up }, - { qvmuld, half, up, right }, - { qvmuld, half, {1,1,1,0}, { 1, 1, 1, 0 } }, + T(qvmuld, half, right, forward ), + T(qvmuld, half, forward, up ), + T(qvmuld, half, up, right ), + T(qvmuld, half, {1,1,1,0}, { 1, 1, 1, 0 } ), // inverse - { vqmuld, right, half, up }, - { vqmuld, forward, half, right }, - { vqmuld, up, half, forward }, - { vqmuld, {1,1,1,0}, half, { 1, 1, 1, 0 } }, + T(vqmuld, right, half, up ), + T(vqmuld, forward, half, right ), + T(vqmuld, up, half, forward ), + T(vqmuld, {1,1,1,0}, half, { 1, 1, 1, 0 } ), // one is a 4D vector and qvmuld is meant for 3D vectors. However, it // seems that the vector's w has no effect on the 3d portion of the // result, but the result's w is cosine of the full rotation angle // scaled by quaternion magnitude and vector w - { qvmuld, half, one, { 1, 1, 1, -0.5 } }, - { qvmuld, half, {2,2,2,2}, { 2, 2, 2, -1 } }, - { qvmuld, qtest, right, {0.5392, 0.6144, -0.576, 0} }, - { qvmuld, qtest, forward, {0.6144, 0.1808, 0.768, 0}, - {0, -2.7e-17, 0, 0} }, - { qvmuld, qtest, up, {0.576, -0.768, -0.28, 0} }, + T(qvmuld, half, one, { 1, 1, 1, -0.5 } ), + T(qvmuld, half, {2,2,2,2}, { 2, 2, 2, -1 } ), + T(qvmuld, qtest, right, {0.5392, 0.6144, -0.576, 0} ), + T(qvmuld, qtest, forward, {0.6144, 0.1808, 0.768, 0}, + {0, -2.7e-17, 0, 0} ), + T(qvmuld, qtest, up, {0.576, -0.768, -0.28, 0} ), // inverse - { vqmuld, one, half, { 1, 1, 1, -0.5 } }, - { vqmuld, {2,2,2,2}, half, { 2, 2, 2, -1 } }, - { vqmuld, right, qtest, {0.5392, 0.6144, 0.576, 0} }, - { vqmuld, forward, qtest, {0.6144, 0.1808, -0.768, 0}, - {0, -2.7e-17, 0, 0} }, - { vqmuld, up, qtest, {-0.576, 0.768, -0.28, 0} }, + T(vqmuld, one, half, { 1, 1, 1, -0.5 } ), + T(vqmuld, {2,2,2,2}, half, { 2, 2, 2, -1 } ), + T(vqmuld, right, qtest, {0.5392, 0.6144, 0.576, 0} ), + T(vqmuld, forward, qtest, {0.6144, 0.1808, -0.768, 0}, + {0, -2.7e-17, 0, 0} ), + T(vqmuld, up, qtest, {-0.576, 0.768, -0.28, 0} ), - { 
qrotd, right, right, qident }, - { qrotd, right, forward, { 0, 0, s05, s05 }, - {0, 0, -1.1e-16, 0} }, - { qrotd, right, up, { 0, -s05, 0, s05 }, - {0, 1.1e-16, 0, 0} }, - { qrotd, forward, right, { 0, 0, -s05, s05 }, - {0, 0, 1.1e-16, 0} }, - { qrotd, forward, forward, qident }, - { qrotd, forward, up, { s05, 0, 0, s05 }, - {-1.1e-16, 0, 0, 0} }, - { qrotd, up, right, { 0, s05, 0, s05 }, - {0, -1.1e-16, 0, 0} }, - { qrotd, up, forward, { -s05, 0, 0, s05 }, - { 1.1e-16, 0, 0, 0} }, - { qrotd, up, up, qident }, + T(qrotd, right, right, qident ), + T(qrotd, right, forward, { 0, 0, s05, s05 }, + {0, 0, -1.1e-16, 0} ), + T(qrotd, right, up, { 0, -s05, 0, s05 }, + {0, 1.1e-16, 0, 0} ), + T(qrotd, forward, right, { 0, 0, -s05, s05 }, + {0, 0, 1.1e-16, 0} ), + T(qrotd, forward, forward, qident ), + T(qrotd, forward, up, { s05, 0, 0, s05 }, + {-1.1e-16, 0, 0, 0} ), + T(qrotd, up, right, { 0, s05, 0, s05 }, + {0, -1.1e-16, 0, 0} ), + T(qrotd, up, forward, { -s05, 0, 0, s05 }, + { 1.1e-16, 0, 0, 0} ), + T(qrotd, up, up, qident ), - { tvtruncd, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } }, - { tvceild, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } }, - { tvfloord, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } }, - { tqconjd, one, {}, { -1, -1, -1, 1 } }, + T(tvtruncd, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } ), + T(tvceild, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } ), + T(tvfloord, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } ), + T(tqconjd, one, {}, { -1, -1, -1, 1 } ), }; #define num_vec4d_tests (sizeof (vec4d_tests) / (sizeof (vec4d_tests[0]))) +#endif static vec4f_test_t vec4f_tests[] = { // 3D dot products - { dotf, right, right, one }, - { dotf, right, forward, zero }, - { dotf, right, up, zero }, - { dotf, forward, right, zero }, - { dotf, forward, forward, one }, - { dotf, forward, up, zero }, - { dotf, up, right, zero }, - { dotf, up, forward, zero }, - { dotf, up, up, one }, + T(dotf, right, right, one ), + T(dotf, right, forward, zero ), + 
T(dotf, right, up, zero ), + T(dotf, forward, right, zero ), + T(dotf, forward, forward, one ), + T(dotf, forward, up, zero ), + T(dotf, up, right, zero ), + T(dotf, up, forward, zero ), + T(dotf, up, up, one ), // one is 4D, so its self dot product is 4 - { dotf, one, one, { 4, 4, 4, 4} }, - { dotf, one, none, {-4, -4, -4, -4} }, + T(dotf, one, one, { 4, 4, 4, 4} ), + T(dotf, one, none, {-4, -4, -4, -4} ), // 3D cross products - { crossf, right, right, zero }, - { crossf, right, forward, up }, - { crossf, right, up, nforward }, - { crossf, forward, right, nup }, - { crossf, forward, forward, zero }, - { crossf, forward, up, right }, - { crossf, up, right, forward }, - { crossf, up, forward, nright }, - { crossf, up, up, zero }, + T(crossf, right, right, zero ), + T(crossf, right, forward, up ), + T(crossf, right, up, nforward ), + T(crossf, forward, right, nup ), + T(crossf, forward, forward, zero ), + T(crossf, forward, up, right ), + T(crossf, up, right, forward ), + T(crossf, up, forward, nright ), + T(crossf, up, up, zero ), // double whammy tests: cross product with an angled vector and // ensuring that a 4d vector (non-zero w component) does not affect // the result, including the result's w component remaining zero. 
- { crossf, right, one, { 0, -1, 1} }, - { crossf, forward, one, { 1, 0, -1} }, - { crossf, up, one, {-1, 1, 0} }, - { crossf, one, right, { 0, 1, -1} }, - { crossf, one, forward, {-1, 0, 1} }, - { crossf, one, up, { 1, -1, 0} }, - { crossf, qtest, qtest, {0, 0, 0, 0} }, + T(crossf, right, one, { 0, -1, 1} ), + T(crossf, forward, one, { 1, 0, -1} ), + T(crossf, up, one, {-1, 1, 0} ), + T(crossf, one, right, { 0, 1, -1} ), + T(crossf, one, forward, {-1, 0, 1} ), + T(crossf, one, up, { 1, -1, 0} ), + T(crossf, qtest, qtest, {0, 0, 0, 0} ), - { qmulf, qident, qident, qident }, - { qmulf, qident, right, right }, - { qmulf, qident, forward, forward }, - { qmulf, qident, up, up }, - { qmulf, right, qident, right }, - { qmulf, forward, qident, forward }, - { qmulf, up, qident, up }, - { qmulf, right, right, nqident }, - { qmulf, right, forward, up }, - { qmulf, right, up, nforward }, - { qmulf, forward, right, nup }, - { qmulf, forward, forward, nqident }, - { qmulf, forward, up, right }, - { qmulf, up, right, forward }, - { qmulf, up, forward, nright }, - { qmulf, up, up, nqident }, - { qmulf, one, one, { 2, 2, 2, -2 } }, - { qmulf, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } }, - { qmulf, qtest, qtest, {0.768, 0.576, 0, -0.28}, - {0, 6e-8, 0, 3e-8} }, + T(qmulf, qident, qident, qident ), + T(qmulf, qident, right, right ), + T(qmulf, qident, forward, forward ), + T(qmulf, qident, up, up ), + T(qmulf, right, qident, right ), + T(qmulf, forward, qident, forward ), + T(qmulf, up, qident, up ), + T(qmulf, right, right, nqident ), + T(qmulf, right, forward, up ), + T(qmulf, right, up, nforward ), + T(qmulf, forward, right, nup ), + T(qmulf, forward, forward, nqident ), + T(qmulf, forward, up, right ), + T(qmulf, up, right, forward ), + T(qmulf, up, forward, nright ), + T(qmulf, up, up, nqident ), + T(qmulf, one, one, { 2, 2, 2, -2 } ), + T(qmulf, one, { 2, 2, 2, -2 }, { 0, 0, 0, -8 } ), + T(qmulf, qtest, qtest, {0.768, 0.576, 0, -0.28}, +#ifndef __SSE__ + {0, 6e-8, 0, 6e-8} +#else 
+ {0, 6e-8, 0, 3e-8} +#endif + ), // The one vector is not unit (magnitude 2), so using it as a rotation // quaternion results in scaling by 4. However, it still has the effect // of rotating 120 degrees around the axis equidistant from the three // orthogonal axes such that x->y->z->x - { qvmulf, one, right, { 0, 4, 0, 0 } }, - { qvmulf, one, forward, { 0, 0, 4, 0 } }, - { qvmulf, one, up, { 4, 0, 0, 0 } }, - { qvmulf, one, {1,1,1,0}, { 4, 4, 4, 0 } }, - { qvmulf, one, one, { 4, 4, 4, -2 } }, + T(qvmulf, one, right, { 0, 4, 0, 0 } ), + T(qvmulf, one, forward, { 0, 0, 4, 0 } ), + T(qvmulf, one, up, { 4, 0, 0, 0 } ), + T(qvmulf, one, {1,1,1,0}, { 4, 4, 4, 0 } ), + T(qvmulf, one, one, { 4, 4, 4, -2 } ), // inverse rotation, so x->z->y->x - { vqmulf, right, one, { 0, 0, 4, 0 } }, - { vqmulf, forward, one, { 4, 0, 0, 0 } }, - { vqmulf, up, one, { 0, 4, 0, 0 } }, - { vqmulf, {1,1,1,0}, one, { 4, 4, 4, 0 } }, - { vqmulf, one, one, { 4, 4, 4, -2 } }, + T(vqmulf, right, one, { 0, 0, 4, 0 } ), + T(vqmulf, forward, one, { 4, 0, 0, 0 } ), + T(vqmulf, up, one, { 0, 4, 0, 0 } ), + T(vqmulf, {1,1,1,0}, one, { 4, 4, 4, 0 } ), + T(vqmulf, one, one, { 4, 4, 4, -2 } ), // - { qvmulf, qtest, right, {0.5392, 0.6144, -0.576, 0}, - {0, -5.9e-8, -6e-8, 0} }, - { qvmulf, qtest, forward, {0.6144, 0.1808, 0.768, 0}, - {-5.9e-8, 1.5e-8, 0, 0} }, - { qvmulf, qtest, up, {0.576, -0.768, -0.28, 0}, - {6e-8, 0, 3e-8, 0} }, - { vqmulf, right, qtest, {0.5392, 0.6144, 0.576, 0}, - {0, -5.9e-8, 5.9e-8, 0} }, - { vqmulf, forward, qtest, {0.6144, 0.1808, -0.768, 0}, - {-5.9e-8, 1.5e-8, 0, 0} }, - { vqmulf, up, qtest, {-0.576, 0.768, -0.28, 0}, - {-5.9e-8, 0, 3e-8, 0} }, + T(qvmulf, qtest, right, {0.5392, 0.6144, -0.576, 0}, + {0, -5.9e-8, -6e-8, 0} ), + T(qvmulf, qtest, forward, {0.6144, 0.1808, 0.768, 0}, +#ifndef __SSE__ + {-5.9e-8, 3e-8, 0, 0} +#else + {-5.9e-8, 1.5e-8, 0, 0} +#endif + ), + T(qvmulf, qtest, up, {0.576, -0.768, -0.28, 0}, +#ifndef __SSE__ + {6e-8, 0, 6e-8, 0} +#else + {6e-8, 0, 3e-8, 
0} +#endif + ), + T(vqmulf, right, qtest, {0.5392, 0.6144, 0.576, 0}, + {0, -5.9e-8, 5.9e-8, 0} ), + T(vqmulf, forward, qtest, {0.6144, 0.1808, -0.768, 0}, +#ifndef __SSE__ + {-5.9e-8, 3e-8, 0, 0} +#else + {-5.9e-8, 1.5e-8, 0, 0} +#endif + ), + T(vqmulf, up, qtest, {-0.576, 0.768, -0.28, 0}, +#ifndef __SSE__ + {-5.9e-8, 0, 6e-8, 0} +#else + {-5.9e-8, 0, 3e-8, 0} +#endif + ), - { qrotf, right, right, qident }, - { qrotf, right, forward, { 0, 0, s05, s05 } }, - { qrotf, right, up, { 0, -s05, 0, s05 } }, - { qrotf, forward, right, { 0, 0, -s05, s05 } }, - { qrotf, forward, forward, qident }, - { qrotf, forward, up, { s05, 0, 0, s05 } }, - { qrotf, up, right, { 0, s05, 0, s05 } }, - { qrotf, up, forward, { -s05, 0, 0, s05 } }, - { qrotf, up, up, qident }, + T(qrotf, right, right, qident ), + T(qrotf, right, forward, { 0, 0, s05, s05 } ), + T(qrotf, right, up, { 0, -s05, 0, s05 } ), + T(qrotf, forward, right, { 0, 0, -s05, s05 } ), + T(qrotf, forward, forward, qident ), + T(qrotf, forward, up, { s05, 0, 0, s05 } ), + T(qrotf, up, right, { 0, s05, 0, s05 } ), + T(qrotf, up, forward, { -s05, 0, 0, s05 } ), + T(qrotf, up, up, qident ), - { tvabsf, pmpi, {}, pi }, - { tvsqrtf, { 1, 4, 9, 16}, {}, {1, 2, 3, 4} }, - { tvtruncf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } }, - { tvceilf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } }, - { tvfloorf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } }, - { tqconjf, one, {}, { -1, -1, -1, 1 } }, - { tmagnitudef, { 3, 4, 12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, 4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, 4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, 4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, 12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { 3, -4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, 4, 12, 84}, {}, {85, 85, 85, 85} }, - { 
tmagnitudef, { -3, 4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, 4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, 4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, 12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, 12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, -12, 84}, {}, {85, 85, 85, 85} }, - { tmagnitudef, { -3, -4, -12, -84}, {}, {85, 85, 85, 85} }, - { tmagnitude3f, { -3, -4, -12, -84}, {}, {13, 13, 13, 13} }, + T(tvabsf, pmpi, {}, pi ), + T(tvsqrtf, { 1, 4, 9, 16}, {}, {1, 2, 3, 4} ), + T(tvtruncf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -1, -2 } ), + T(tvceilf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 2, 3, -1, -2 } ), + T(tvfloorf, { 1.1, 2.9, -1.1, -2.9 }, {}, { 1, 2, -2, -3 } ), + T(tqconjf, one, {}, { -1, -1, -1, 1 } ), + T(tmagnitudef, { 3, 4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, 4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, 4, -12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, 4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, -12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { 3, -4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, -12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, 4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, 12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, 12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, -12, 84}, {}, {85, 85, 85, 85} ), + T(tmagnitudef, { -3, -4, -12, -84}, {}, {85, 85, 85, 85} ), + T(tmagnitude3f, { -3, -4, -12, -84}, {}, {13, 13, 13, 13} ), }; #define num_vec4f_tests (sizeof (vec4f_tests) / (sizeof (vec4f_tests[0]))) static mat4f_test_t mat4f_tests[] = { - { mmulf, identity, identity, identity }, - { mmulf, rotate120, 
identity, rotate120 }, - { mmulf, identity, rotate120, rotate120 }, - { mmulf, rotate120, rotate120, rotate240 }, - { mmulf, rotate120, rotate240, identity }, - { mmulf, rotate240, rotate120, identity }, + T(mmulf, identity, identity, identity ), + T(mmulf, rotate120, identity, rotate120 ), + T(mmulf, identity, rotate120, rotate120 ), + T(mmulf, rotate120, rotate120, rotate240 ), + T(mmulf, rotate120, rotate240, identity ), + T(mmulf, rotate240, rotate120, identity ), }; #define num_mat4f_tests (sizeof (mat4f_tests) / (sizeof (mat4f_tests[0]))) static mv4f_test_t mv4f_tests[] = { - { mvmulf, identity, { 1, 0, 0, 0 }, { 1, 0, 0, 0 } }, - { mvmulf, identity, { 0, 1, 0, 0 }, { 0, 1, 0, 0 } }, - { mvmulf, identity, { 0, 0, 1, 0 }, { 0, 0, 1, 0 } }, - { mvmulf, identity, { 0, 0, 0, 1 }, { 0, 0, 0, 1 } }, - { mvmulf, rotate120, { 1, 2, 3, 4 }, { 3, 1, 2, 4 } }, - { mvmulf, rotate240, { 1, 2, 3, 4 }, { 2, 3, 1, 4 } }, + T(mvmulf, identity, { 1, 0, 0, 0 }, { 1, 0, 0, 0 } ), + T(mvmulf, identity, { 0, 1, 0, 0 }, { 0, 1, 0, 0 } ), + T(mvmulf, identity, { 0, 0, 1, 0 }, { 0, 0, 1, 0 } ), + T(mvmulf, identity, { 0, 0, 0, 1 }, { 0, 0, 0, 1 } ), + T(mvmulf, rotate120, { 1, 2, 3, 4 }, { 3, 1, 2, 4 } ), + T(mvmulf, rotate240, { 1, 2, 3, 4 }, { 2, 3, 1, 4 } ), }; #define num_mv4f_tests (sizeof (mv4f_tests) / (sizeof (mv4f_tests[0]))) // expect filled in using non-simd QuatToMatrix (has its own tests) static mq4f_test_t mq4f_tests[] = { - { mat4fquat, { 0, 0, 0, 1 } }, - { mat4fquat, { 0.5, 0.5, 0.5, 0.5 } }, - { mat4fquat, { 0.5, 0.5, -0.5, 0.5 } }, - { mat4fquat, { 0.5, -0.5, 0.5, 0.5 } }, - { mat4fquat, { 0.5, -0.5, -0.5, 0.5 } }, - { mat4fquat, { -0.5, 0.5, 0.5, 0.5 } }, - { mat4fquat, { -0.5, 0.5, -0.5, 0.5 } }, - { mat4fquat, { -0.5, -0.5, 0.5, 0.5 } }, - { mat4fquat, { -0.5, -0.5, -0.5, 0.5 } }, + T(mat4fquat, { 0, 0, 0, 1 } ), + T(mat4fquat, { 0.5, 0.5, 0.5, 0.5 } ), + T(mat4fquat, { 0.5, 0.5, -0.5, 0.5 } ), + T(mat4fquat, { 0.5, -0.5, 0.5, 0.5 } ), + T(mat4fquat, { 0.5, 
-0.5, -0.5, 0.5 } ), + T(mat4fquat, { -0.5, 0.5, 0.5, 0.5 } ), + T(mat4fquat, { -0.5, 0.5, -0.5, 0.5 } ), + T(mat4fquat, { -0.5, -0.5, 0.5, 0.5 } ), + T(mat4fquat, { -0.5, -0.5, -0.5, 0.5 } ), }; #define num_mq4f_tests (sizeof (mq4f_tests) / (sizeof (mq4f_tests[0]))) +#ifdef __AVX__ static int run_vec4d_tests (void) { @@ -443,7 +482,7 @@ run_vec4d_tests (void) vec4l_t res = result != expect; if (res[0] || res[1] || res[2] || res[3]) { ret |= 1; - printf ("\nrun_vec4d_tests %zd\n", i); + printf ("\nrun_vec4d_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4D_FMT "\n", VEC4_EXP(test->a)); printf ("b: " VEC4D_FMT "\n", VEC4_EXP(test->b)); printf ("r: " VEC4D_FMT "\n", VEC4_EXP(result)); @@ -455,6 +494,7 @@ run_vec4d_tests (void) } return ret; } +#endif static int run_vec4f_tests (void) @@ -465,10 +505,10 @@ run_vec4f_tests (void) __auto_type test = &vec4f_tests[i]; vec4f_t result = test->op (test->a, test->b); vec4f_t expect = test->expect + test->ulp_errors; - vec4i_t res = result != expect; + vec4i_t res = (vec4i_t) result != (vec4i_t) expect; if (res[0] || res[1] || res[2] || res[3]) { ret |= 1; - printf ("\nrun_vec4f_tests %zd\n", i); + printf ("\nrun_vec4f_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4F_FMT "\n", VEC4_EXP(test->a)); printf ("b: " VEC4F_FMT "\n", VEC4_EXP(test->b)); printf ("r: " VEC4F_FMT "\n", VEC4_EXP(result)); @@ -502,7 +542,7 @@ run_mat4f_tests (void) } if (fail) { ret |= 1; - printf ("\nrun_mat4f_tests %zd\n", i); + printf ("\nrun_mat4f_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4F_FMT "\n", MAT4_ROW(test->a, 0)); printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 1)); printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 2)); @@ -549,7 +589,7 @@ run_mv4f_tests (void) if (res[0] || res[1] || res[2] || res[3]) { ret |= 1; - printf ("\nrun_mv4f_tests %zd\n", i); + printf ("\nrun_mv4f_tests %zd, line %d\n", i, test->line); printf ("a: " VEC4F_FMT "\n", MAT4_ROW(test->a, 0)); printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 1)); 
printf (" " VEC4F_FMT "\n", MAT4_ROW(test->a, 2)); @@ -595,7 +635,7 @@ run_mq4f_tests (void) } if (fail) { ret |= 1; - printf ("\nrun_mq4f_tests %zd\n", i); + printf ("\nrun_mq4f_tests %zd, line %d\n", i, test->line); printf ("q: " VEC4F_FMT "\n", VEC4_EXP(test->q)); printf ("r: " VEC4F_FMT "\n", MAT4_ROW(result, 0)); printf (" " VEC4F_FMT "\n", MAT4_ROW(result, 1)); @@ -626,7 +666,9 @@ int main (void) { int ret = 0; +#ifdef __AVX__ ret |= run_vec4d_tests (); +#endif ret |= run_vec4f_tests (); ret |= run_mat4f_tests (); ret |= run_mv4f_tests (); diff --git a/tools/qfvis/source/base-vis.c b/tools/qfvis/source/base-vis.c index 089b196f9..877f3f97d 100644 --- a/tools/qfvis/source/base-vis.c +++ b/tools/qfvis/source/base-vis.c @@ -86,6 +86,7 @@ SimpleFlood (basethread_t *thread, portal_t *srcportal, int clusternum) static inline int test_sphere (const vspheref_t *sphere, vec4f_t plane) { +#ifdef __SSE3__ const vec4f_t zero = {}; float r = sphere->radius; vec4f_t eps = { r, r, r, r }; @@ -94,6 +95,12 @@ test_sphere (const vspheref_t *sphere, vec4f_t plane) c = (vec4i_t) _mm_hsub_epi32 ((__m128i) c, (__m128i) c); return c[0]; +#else + float d = DotProduct (sphere->center, plane) + plane[3]; + int front = (d >= sphere->radius); + int back = (d <= -sphere->radius); + return front - back; +#endif } void diff --git a/tools/qfvis/source/qfvis.c b/tools/qfvis/source/qfvis.c index a1dc5ce33..156d1650f 100644 --- a/tools/qfvis/source/qfvis.c +++ b/tools/qfvis/source/qfvis.c @@ -214,12 +214,20 @@ NewFlippedWinding (threaddata_t *thread, const winding_t *w) static vec4i_t signeps (vec4f_t dist) { +#ifdef __SSE3__ const vec4f_t zero = {}; const vec4f_t eps = { ON_EPSILON, ON_EPSILON, ON_EPSILON, ON_EPSILON }; vec4f_t d = _mm_addsub_ps (zero, dist); vec4i_t c = (d - eps) > 0; c = (vec4i_t) _mm_hsub_epi32 ((__m128i) c, (__m128i) c); return c; +#else + float d = dist[0]; + int front = (d >= ON_EPSILON); + int back = (d <= -ON_EPSILON); + int i = front - back; + return (vec4i_t) { i, 
i, i, i }; +#endif } static vec4f_t @@ -246,7 +254,12 @@ split_edge (const vec4f_t *points, const vec4f_t *dists, vec4i_t x = _mm_and_ps (split, (__m128) nan) == onenan; // plane vector has -dist in w vec4f_t y = _mm_and_ps (split, (__m128) x) * -split[3]; +#ifdef __SSE3__ mid = _mm_blendv_ps (mid, y, (__m128) x); +#else + mid = (vec4f_t) ((vec4i_t) _mm_and_ps (y, (__m128) x) | + (vec4i_t) _mm_and_ps (mid, (__m128) ~x)); +#endif if (isnan (mid[0])) *(int *) 0 = 0; return mid; }