[simd] Remove some intrinsics uses

GCC does a nice enough job compiling the more readable form (though
admittedly, hadd is possibly more readable than what's there for
dot[fd], but hadd is supposedly slower than the shuffles and adds, and
qfvis seems to support that).
This commit is contained in:
Bill Currie 2022-05-20 11:09:15 +09:00
parent 89f8dfce09
commit 87f3c206f1
2 changed files with 6 additions and 37 deletions

View file

@@ -198,12 +198,8 @@ vec4d_t
dotd (vec4d_t a, vec4d_t b)
{
vec4d_t c = a * b;
#ifndef __AVX__
c = (vec4d_t) { c[0] + c[1], c[0] + c[1], c[2] + c[3], c[2] + c[3] };
#else
c = _mm256_hadd_pd (c, c);
#endif
c += (vec4d_t) {c[2], c[3], c[0], c[1]};
c += (vec4d_t) { c[3], c[0], c[1], c[2] };
c += (vec4d_t) { c[2], c[3], c[0], c[1] };
return c;
}
@@ -302,9 +298,7 @@ VISIBLE
vec4d_t
qconjd (vec4d_t q)
{
const uint64_t sign = UINT64_C(1) << 63;
const vec4l_t neg = { sign, sign, sign, 0 };
return (vec4d_t) ((vec4l_t) q ^ neg);
return (vec4d_t) { -q[0], -q[1], -q[2], q[3] };
}
#ifndef IMPLEMENT_VEC4D_Funcs

View file

@@ -211,13 +211,8 @@ vec4f_t
dotf (vec4f_t a, vec4f_t b)
{
vec4f_t c = a * b;
#ifndef __SSE3__
float x = c[0] + c[1] + c[2] + c[3];
c = (vec4f_t) { x, x, x, x };
#else
c = _mm_hadd_ps (c, c);
c = _mm_hadd_ps (c, c);
#endif
c += (vec4f_t) { c[3], c[0], c[1], c[2] };
c += (vec4f_t) { c[2], c[3], c[0], c[1] };
return c;
}
@@ -234,11 +229,7 @@ qmulf (vec4f_t a, vec4f_t b)
vec4f_t c = crossf (a, b) + a * b[3] + a[3] * b;
vec4f_t d = dotf (a, b);
// zero out the vector component of dot product so only the scalar remains
#ifndef __SSE4_1__
d = (vec4f_t) { 0, 0, 0, d[3] };
#else
d = _mm_insert_ps (d, d, 0xf7);
#endif
return c - d;
}
@@ -314,8 +305,7 @@ VISIBLE
vec4f_t
qconjf (vec4f_t q)
{
const vec4i_t neg = { 1u << 31, 1u << 31, 1u << 31, 0 };
return (vec4f_t) ((vec4i_t) q ^ neg);
return (vec4f_t) { -q[0], -q[1], -q[2], q[3] };
}
#ifndef IMPLEMENT_VEC4F_Funcs
@@ -348,22 +338,7 @@ loadvec3f (const float *v3)
{
vec4f_t v4;
#ifndef __SSE4_1__
v4 = (vec4f_t) { v3[0], v3[1], v3[2], 0 };
#else
// this had to be in asm otherwise gcc thinks v4 is only partially
// initialized, and gcc 10 does not use the zero flags when generating
// the code, resulting in a memory access to load a 0 into v4[3]
//
// The first instruction zeros v4[3] while loading v4[0]
asm ("\n\
vinsertps $0x08, %1, %0, %0 \n\
vinsertps $0x10, %2, %0, %0 \n\
vinsertps $0x20, %3, %0, %0 \n\
"
: "=v"(v4)
: "m"(v3[0]), "m"(v3[1]), "m"(v3[2]));
#endif
return v4;
}