From 9084121ad28528ab2dff42fb26e7db2b11217409 Mon Sep 17 00:00:00 2001 From: Bill Currie Date: Mon, 3 Jan 2022 17:55:45 +0900 Subject: [PATCH] [simd] Correct result for dot2f It turns out gcc optimizes the obvious code nicely. It doesn't do so well for cmul, but I decided to use obvious code anyway (the instruction counts were the same, so maybe it doesn't get better for a single pair of operands). --- include/QF/simd/vec2f.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/QF/simd/vec2f.h b/include/QF/simd/vec2f.h index ffa403ad4..b75efcd8e 100644 --- a/include/QF/simd/vec2f.h +++ b/include/QF/simd/vec2f.h @@ -120,9 +120,7 @@ vec2f_t dot2f (vec2f_t a, vec2f_t b) { vec2f_t c = a * b; - vec4f_t t = { c[0], c[1], 0, 0 }; - t = _mm_hadd_ps (t, t); - return (vec2f_t) { t[0], t[1] }; + return (vec2f_t) { c[0] + c[1], c[0] + c[1] }; } #ifndef IMPLEMENT_VEC2F_Funcs @@ -135,10 +133,7 @@ cmulf (vec2f_t a, vec2f_t b) { vec2f_t c1 = a * b[0]; vec2f_t c2 = a * b[1]; - vec4f_t c14 ={ c1[0], c1[1], 0, 0 }; - vec4f_t c24 ={ c2[1], c2[0], 0, 0 }; - vec4f_t c = _mm_addsub_ps (c14, c24); - return (vec2f_t) { c[0], c[1] }; + return (vec2f_t) { c1[0] - c2[1], c1[1] + c2[0] }; } #ifndef IMPLEMENT_VEC2F_Funcs