From 09a10f80e1a5ec8111dd28745aabec459cd52f84 Mon Sep 17 00:00:00 2001
From: Bill Currie <bill@taniwha.org>
Date: Mon, 28 Dec 2020 12:29:04 +0900
Subject: [PATCH] [util] Add basic SIMD implemented vector functions

They take advantage of gcc's vector_size attribute and so only cross,
dot, qmul, qvmul and qrot (create rotation quaternion from two vectors)
are needed at this stage as basic (per-component) math is supported
natively by gcc.

The provided functions work on horizontal (array-of-structs) data, ie a
vec4d_t or vec4f_t represents a single vector, or traditional vector
layout. Vertical layout (struct-of-arrays) does not need any special
functions as the regular math can be used to operate on four vectors at
a time.

Functions are provided for loading a vec4 from a vec3 (4th element set
to 0) and storing a vec4 into a vec3 (discarding the 4th element).

With this, QF will require AVX2 support (needed for vec4d_t). Without
support for doubles, SSE is possible, but may not be worthwhile for
horizontal data.

Fused-multiply-add is NOT used because it alters the results between
unoptimized and optimized code, resulting in -mfma really meaning
-mfast-math-anyway. I really do not want to have to debug issues that
occur only in optimized code.
---
 config.d/compiling.m4        |   5 +
 include/QF/simd/types.h      |  85 +++++++++++
 include/QF/simd/vec4d.h      | 156 +++++++++++++++++++
 include/QF/simd/vec4f.h      | 159 +++++++++++++++++++
 libs/util/test/Makemodule.am |   5 +
 libs/util/test/test-simd.c   | 289 +++++++++++++++++++++++++++++++++++
 6 files changed, 699 insertions(+)
 create mode 100644 include/QF/simd/types.h
 create mode 100644 include/QF/simd/vec4d.h
 create mode 100644 include/QF/simd/vec4f.h
 create mode 100644 libs/util/test/test-simd.c

diff --git a/config.d/compiling.m4 b/config.d/compiling.m4
index 343218d35..d9ba92887 100644
--- a/config.d/compiling.m4
+++ b/config.d/compiling.m4
@@ -82,6 +82,11 @@ AC_ARG_ENABLE(optimize,
 	optimize=yes
 )
 
+QF_CC_OPTION(-mavx2)
+dnl fma is not used as it is the equivalent of turning on
+dnl -funsafe-math-optimizations
+dnl QF_CC_OPTION(-mfma)
+
 AC_MSG_CHECKING(for optimization)
 if test "x$optimize" = xyes -a "x$leave_cflags_alone" != "xyes"; then
 	AC_MSG_RESULT(yes)
diff --git a/include/QF/simd/types.h b/include/QF/simd/types.h
new file mode 100644
index 000000000..cd8b0da9d
--- /dev/null
+++ b/include/QF/simd/types.h
@@ -0,0 +1,85 @@
+/*
+	QF/simd/types.h
+
+	Type definitions for QF SIMD values
+
+	Copyright (C) 2020  Bill Currie <bill@taniwha.org>
+
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version 2
+	of the License, or (at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+	See the GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program; if not, write to:
+
+		Free Software Foundation, Inc.
+		59 Temple Place - Suite 330
+		Boston, MA  02111-1307, USA
+
+*/
+
+#ifndef __QF_simd_types_h
+#define __QF_simd_types_h
+
+#define VEC_TYPE(t,n) typedef t n __attribute__ ((vector_size (4*sizeof (t))))
+
+/** Three element vector type for interfacing with compact data.
+ *
+ * This cannot be used directly by SIMD code and must be loaded and stored
+ * using load_vec3d() and store_vec3d() respectively.
+ */
+typedef double vec3d_t[3];
+
+#ifdef __AVX__
+/** Four element vector type for horizontal (AOS) vector data.
+ *
+ * This is used for both vectors (3D and 4D) and quaternions. 3D vectors
+ * are simply vec4d_t values with 0 in the 4th element.
+ *
+ * Also usable with vertical (SOA) code, in which case each vec4d_t represents
+ * a single component from four vectors, or a single row/column (depending on
+ * context) of an Nx4 or 4xN matrix.
+ */
+VEC_TYPE (double, vec4d_t);
+
+/** Used mostly for __builtin_shuffle.
+ */
+VEC_TYPE (long, vec4l_t);
+#endif
+
+/** Three element vector type for interfacing with compact data.
+ *
+ * This cannot be used directly by SIMD code and must be loaded and stored
+ * using load_vec3f() and store_vec3f() respectively.
+ */
+typedef float vec3f_t[3];
+
+/** Four element vector type for horizontal (AOS) vector data.
+ *
+ * This is used for both vectors (3D and 4D) and quaternions. 3D vectors
+ * are simply vec4f_t values with 0 in the 4th element.
+ *
+ * Also usable with vertical (SOA) code, in which case each vec4f_t represents
+ * a single component from four vectors, or a single row/column (depending on
+ * context) of an Nx4 or 4xN matrix.
+ */
+VEC_TYPE (float, vec4f_t);
+
+/** Used mostly for __builtin_shuffle.
+ */
+VEC_TYPE (int, vec4i_t);
+
+#define VEC4D_FMT "[%.17g, %.17g, %.17g, %.17g]"
+#define VEC4L_FMT "[%ld, %ld, %ld, %ld]"
+#define VEC4F_FMT "[%.9g, %.9g, %.9g, %.9g]"
+#define VEC4I_FMT "[%d, %d, %d, %d]"
+#define VEC4_EXP(v) (v)[0], (v)[1], (v)[2], (v)[3]
+
+#endif//__QF_simd_types_h
diff --git a/include/QF/simd/vec4d.h b/include/QF/simd/vec4d.h
new file mode 100644
index 000000000..d608933f5
--- /dev/null
+++ b/include/QF/simd/vec4d.h
@@ -0,0 +1,156 @@
+/*
+	QF/simd/vec4d.h
+
+	Vector functions for vec4d_t (ie, double precision)
+
+	Copyright (C) 2020  Bill Currie <bill@taniwha.org>
+
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version 2
+	of the License, or (at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+	See the GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program; if not, write to:
+
+		Free Software Foundation, Inc.
+		59 Temple Place - Suite 330
+		Boston, MA  02111-1307, USA
+
+*/
+
+#ifndef __QF_simd_vec4d_h
+#define __QF_simd_vec4d_h
+
+#include <immintrin.h>
+
+#include "QF/simd/types.h"
+
+/** 3D vector cross product.
+ *
+ * The w (4th) component can be any value on input, and is guaranteed to be 0
+ * in the result. The result is not affected in any way by either vector's w
+ * componemnt
+ */
+vec4d_t crossd (vec4d_t a, vec4d_t b) __attribute__((const));
+vec4d_t
+crossd (vec4d_t a, vec4d_t b)
+{
+	static const vec4l_t A = {1, 2, 0, 3};
+	vec4d_t c = a * __builtin_shuffle (b, A);
+	vec4d_t d = __builtin_shuffle (a, A) * b;
+	c = c - d;
+	return __builtin_shuffle(c, A);
+}
+
+/** 4D vector dot product.
+ *
+ * The w component *IS* significant, but if it is 0 in either vector, then
+ * the result will be as for a 3D dot product.
+ *
+ * Note that the dot product is in all 4 of the return value's elements. This
+ * helps optimize vector math as the scalar is already pre-spread. If just the
+ * scalar is required, use result[0].
+ */
+vec4d_t dotd (vec4d_t a, vec4d_t b) __attribute__((const));
+vec4d_t
+dotd (vec4d_t a, vec4d_t b)
+{
+	vec4d_t c = a * b;
+	c = _mm256_hadd_pd (c, c);
+	static const vec4l_t A = {2, 3, 0, 1};
+	c += __builtin_shuffle(c, A);
+	return c;
+}
+
+/** Quaternion product.
+ *
+ * The vector is interpreted as a quaternion instead of a regular 4D vector.
+ * The quaternion may be of any magnitude, so this is more generally useful.
+ * than if the quaternion was required to be unit length.
+ */
+vec4d_t qmuld (vec4d_t a, vec4d_t b) __attribute__((const));
+vec4d_t
+qmuld (vec4d_t a, vec4d_t b)
+{
+	// results in [2*as*bs, as*b + bs*a + a x b] ([scalar, vector] notation)
+	// doesn't seem to adversly affect precision
+	vec4d_t c = crossd (a, b) + a * b[3] + a[3] * b;
+	vec4d_t d = dotd (a, b);
+	// zero out the vector component of dot product so only the scalar remains
+	d = _mm256_permute2f128_pd (d, d, 0x18);
+	d = _mm256_permute4x64_pd (d, 0xc0);
+	return c - d;
+}
+
+/** Optimized quaterion-vector multiplication for vector rotation.
+ *
+ * If the vector's w component is not zero, then the result's w component
+ * is the cosine of the full rotation angle scaled by the vector's w component.
+ * The quaternion is assumed to be unit. If the quaternion is not unit, the
+ * vector will be scaled by the square of the quaternion's magnitude.
+ */
+vec4d_t qvmuld (vec4d_t q, vec4d_t v) __attribute__((const));
+vec4d_t
+qvmuld (vec4d_t q, vec4d_t v)
+{
+	double s = q[3];
+	// zero the scalar of the quaternion. Results in an extra operation, but
+	// avoids adding precision issues.
+	vec4d_t z = {};
+	q = _mm256_blend_pd (q, z, 0x08);
+	vec4d_t c = crossd (q, v);
+	vec4d_t qv = dotd (q, v);	// q.w is 0 so v.w is irrelevant
+	vec4d_t qq = dotd (q, q);
+
+	return (s * s - qq) * v + 2 * (qv * q + s * c);
+}
+
+/** Create the quaternion representing the shortest rotation from a to b.
+ *
+ * Both a and b are assumed to be 3D vectors (w components 0), but a resonable
+ * (but incorrect) result will still be produced if either a or b is a 4D
+ * vector. The rotation axis will be the same as if both vectors were 3D, but
+ * the magnitude of the rotation will be different.
+ */
+vec4d_t qrotd (vec4d_t a, vec4d_t b) __attribute__((const));
+vec4d_t
+qrotd (vec4d_t a, vec4d_t b)
+{
+	vec4d_t ma = _mm256_sqrt_pd (dotd (a, a));
+	vec4d_t mb = _mm256_sqrt_pd (dotd (b, b));
+	vec4d_t den = 2 * ma * mb;
+	vec4d_t t = mb * a + ma * b;
+	vec4d_t mba_mab = _mm256_sqrt_pd (dotd (t, t));
+	vec4d_t q = crossd (a, b) / mba_mab;
+	q[3] = (mba_mab / den)[0];
+	return q;
+}
+
+vec4d_t loadvec3d (const double v3[]) __attribute__((pure, access(read_only, 1)));
+vec4d_t
+loadvec3d (const double v3[])
+{
+	vec4d_t v4 = {};
+
+	v4[0] = v3[0];
+	v4[1] = v3[1];
+	v4[2] = v3[2];
+	return v4;
+}
+
+void storevec3d (double v3[3], vec4d_t v4) __attribute__((access (write_only, 1)));
+void storevec3d (double v3[3], vec4d_t v4)
+{
+	v3[0] = v4[0];
+	v3[1] = v4[1];
+	v3[2] = v4[2];
+}
+
+#endif//__QF_simd_vec4d_h
diff --git a/include/QF/simd/vec4f.h b/include/QF/simd/vec4f.h
new file mode 100644
index 000000000..ea92b81d9
--- /dev/null
+++ b/include/QF/simd/vec4f.h
@@ -0,0 +1,159 @@
+/*
+	QF/simd/vec4f.h
+
+	Vector functions for vec4f_t (ie, float precision)
+
+	Copyright (C) 2020  Bill Currie <bill@taniwha.org>
+
+	This program is free software; you can redistribute it and/or
+	modify it under the terms of the GNU General Public License
+	as published by the Free Software Foundation; either version 2
+	of the License, or (at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+	See the GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program; if not, write to:
+
+		Free Software Foundation, Inc.
+		59 Temple Place - Suite 330
+		Boston, MA  02111-1307, USA
+
+*/
+
+#ifndef __QF_simd_vec4f_h
+#define __QF_simd_vec4f_h
+
+#include <immintrin.h>
+
+#include "QF/simd/types.h"
+
+/** 3D vector cross product.
+ *
+ * The w (4th) component can be any value on input, and is guaranteed to be 0
+ * in the result. The result is not affected in any way by either vector's w
+ * componemnt
+ */
+vec4f_t crossf (vec4f_t a, vec4f_t b) __attribute__((const));
+vec4f_t
+crossf (vec4f_t a, vec4f_t b)
+{
+	static const vec4i_t A = {1, 2, 0, 3};
+	vec4f_t c = a * __builtin_shuffle (b, A) - __builtin_shuffle (a, A) * b;
+	return __builtin_shuffle(c, A);
+}
+
+/** 4D vector dot product.
+ *
+ * The w component *IS* significant, but if it is 0 in either vector, then
+ * the result will be as for a 3D dot product.
+ *
+ * Note that the dot product is in all 4 of the return value's elements. This
+ * helps optimize vector math as the scalar is already pre-spread. If just the
+ * scalar is required, use result[0].
+ */
+vec4f_t dotf (vec4f_t a, vec4f_t b) __attribute__((const));
+vec4f_t
+dotf (vec4f_t a, vec4f_t b)
+{
+	vec4f_t c = a * b;
+	c = _mm_hadd_ps (c, c);
+	c = _mm_hadd_ps (c, c);
+	return c;
+}
+
+/** Quaternion product.
+ *
+ * The vector is interpreted as a quaternion instead of a regular 4D vector.
+ * The quaternion may be of any magnitude, so this is more generally useful.
+ * than if the quaternion was required to be unit length.
+ */
+vec4f_t qmulf (vec4f_t a, vec4f_t b) __attribute__((const));
+vec4f_t
+qmulf (vec4f_t a, vec4f_t b)
+{
+	// results in [2*as*bs, as*b + bs*a + a x b] ([scalar, vector] notation)
+	// doesn't seem to adversly affect precision
+	vec4f_t c = crossf (a, b) + a * b[3] + a[3] * b;
+	vec4f_t d = dotf (a, b);
+	// zero out the vector component of dot product so only the scalar remains
+	d = _mm_insert_ps (d, d, 0xf7);
+	return c - d;
+}
+
+/** Optimized quaterion-vector multiplication for vector rotation.
+ *
+ * If the vector's w component is not zero, then the result's w component
+ * is the cosine of the full rotation angle scaled by the vector's w component.
+ * The quaternion is assumed to be unit.
+ */
+vec4f_t qvmulf (vec4f_t q, vec4f_t v) __attribute__((const));
+vec4f_t
+qvmulf (vec4f_t q, vec4f_t v)
+{
+	float s = q[3];
+	// zero the scalar of the quaternion. Results in an extra operation, but
+	// avoids adding precision issues.
+	q = _mm_insert_ps (q, q, 0xf8);
+	vec4f_t c = crossf (q, v);	// q.w is 0 so v.w is irrelevant
+	vec4f_t qv = dotf (q, v);
+	vec4f_t qq = dotf (q, q);
+
+	return (s * s - qq) * v + 2 * (qv * q + s * c);
+}
+
+/** Create the quaternion representing the shortest rotation from a to b.
+ *
+ * Both a and b are assumed to be 3D vectors (w components 0), but a resonable
+ * (but incorrect) result will still be produced if either a or b is a 4D
+ * vector. The rotation axis will be the same as if both vectors were 3D, but
+ * the magnitude of the rotation will be different.
+ */
+vec4f_t qrotf (vec4f_t a, vec4f_t b) __attribute__((const));
+vec4f_t
+qrotf (vec4f_t a, vec4f_t b)
+{
+	vec4f_t ma = _mm_sqrt_ps (dotf (a, a));
+	vec4f_t mb = _mm_sqrt_ps (dotf (b, b));
+	vec4f_t den = 2 * ma * mb;
+	vec4f_t t = mb * a + ma * b;
+	vec4f_t mba_mab = _mm_sqrt_ps (dotf (t, t));
+	vec4f_t q = crossf (a, b) / mba_mab;
+	q[3] = (mba_mab / den)[0];
+	return q;
+}
+
+vec4f_t loadvec3f (const float v3[3]) __attribute__((pure, access(read_only, 1)));
+vec4f_t
+loadvec3f (const float v3[3])
+{
+	vec4f_t v4;
+
+	// this had to be in asm otherwise gcc thinks v4 is only partially
+	// initialized, and gcc 10 does not use the zero flags when generating
+	// the code, resulting in a memory access to load a 0 into v4[3]
+	//
+	// The first instruction zeros v4[3] while loading v4[0]
+	asm ("\n\
+		vinsertps $0x08, %1, %0, %0     \n\
+		vinsertps $0x10, %2, %0, %0     \n\
+		vinsertps $0x20, %3, %0, %0     \n\
+		"
+		: "=v"(v4)
+		: "m"(v3[0]), "m"(v3[1]), "m"(v3[2]));
+	return v4;
+}
+
+void storevec3f (float v3[3], vec4f_t v4) __attribute__((access (write_only, 1)));
+void storevec3f (float v3[3], vec4f_t v4)
+{
+	v3[0] = v4[0];
+	v3[1] = v4[1];
+	v3[2] = v4[2];
+}
+
+#endif//__QF_simd_vec4f_h
diff --git a/libs/util/test/Makemodule.am b/libs/util/test/Makemodule.am
index 9447a7807..ca777f864 100644
--- a/libs/util/test/Makemodule.am
+++ b/libs/util/test/Makemodule.am
@@ -14,6 +14,7 @@ libs_util_tests = \
 	libs/util/test/test-seb \
 	libs/util/test/test-seg \
 	libs/util/test/test-set \
+	libs/util/test/test-simd \
 	libs/util/test/test-txtbuffer \
 	libs/util/test/test-vrect
 
@@ -81,6 +82,10 @@ libs_util_test_test_set_SOURCES=libs/util/test/test-set.c
 libs_util_test_test_set_LDADD=libs/util/libQFutil.la
 libs_util_test_test_set_DEPENDENCIES=libs/util/libQFutil.la
 
+libs_util_test_test_simd_SOURCES=libs/util/test/test-simd.c
+libs_util_test_test_simd_LDADD=libs/util/libQFutil.la
+libs_util_test_test_simd_DEPENDENCIES=libs/util/libQFutil.la
+
 libs_util_test_test_txtbuffer_SOURCES=libs/util/test/test-txtbuffer.c
 libs_util_test_test_txtbuffer_LDADD=libs/util/libQFutil.la
 libs_util_test_test_txtbuffer_DEPENDENCIES=libs/util/libQFutil.la
diff --git a/libs/util/test/test-simd.c b/libs/util/test/test-simd.c
new file mode 100644
index 000000000..ec1b145ac
--- /dev/null
+++ b/libs/util/test/test-simd.c
@@ -0,0 +1,289 @@
+#include <stdio.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "QF/simd/vec4d.h"
+#include "QF/simd/vec4f.h"
+
+#define right   { 1, 0, 0 }
+#define forward { 0, 1, 0 }
+#define up      { 0, 0, 1 }
+#define one     { 1, 1, 1, 1 }
+#define half    { 0.5, 0.5, 0.5, 0.5 }
+#define zero    { 0, 0, 0, 0 }
+#define qident  { 0, 0, 0, 1 }
+#define qtest   { 0.64, 0.48, 0, 0.6 }
+
+#define nright   { -1,  0,  0 }
+#define nforward {  0, -1,  0 }
+#define nup      {  0,  0, -1 }
+#define none     { -1, -1, -1, -1 }
+#define nqident  {  0,  0,  0, -1 }
+
+#define s05 0.70710678118654757
+
+typedef  struct {
+	vec4d_t   (*op) (vec4d_t a, vec4d_t b);
+	vec4d_t     a;
+	vec4d_t     b;
+	vec4d_t     expect;
+	vec4d_t     ulp_errors;
+} vec4d_test_t;
+
+typedef  struct {
+	vec4f_t   (*op) (vec4f_t a, vec4f_t b);
+	vec4f_t     a;
+	vec4f_t     b;
+	vec4f_t     expect;
+	vec4f_t     ulp_errors;
+} vec4f_test_t;
+
+static vec4d_test_t vec4d_tests[] = {
+	// 3D dot products
+	{ dotd, right,   right,   one  },
+	{ dotd, right,   forward, zero },
+	{ dotd, right,   up,      zero },
+	{ dotd, forward, right,   zero },
+	{ dotd, forward, forward, one  },
+	{ dotd, forward, up,      zero },
+	{ dotd, up,      right,   zero },
+	{ dotd, up,      forward, zero },
+	{ dotd, up,      up,      one  },
+
+	// one is 4D, so its self dot product is 4
+	{ dotd, one,     one,     { 4,  4,  4,  4} },
+	{ dotd, one,    none,     {-4, -4, -4, -4} },
+
+	// 3D cross products
+	{ crossd, right,   right,    zero    },
+	{ crossd, right,   forward,  up      },
+	{ crossd, right,   up,      nforward },
+	{ crossd, forward, right,   nup      },
+	{ crossd, forward, forward,  zero    },
+	{ crossd, forward, up,       right   },
+	{ crossd, up,      right,    forward },
+	{ crossd, up,      forward, nright   },
+	{ crossd, up,      up,       zero    },
+	// double whammy tests: cross product with an angled vector and
+	// ensuring that a 4d vector (non-zero w component) does not affect
+	// the result, including the result's w component remaining zero.
+	{ crossd, right,   one,     { 0, -1,  1} },
+	{ crossd, forward, one,     { 1,  0, -1} },
+	{ crossd, up,      one,     {-1,  1,  0} },
+	{ crossd, one,     right,   { 0,  1, -1} },
+	{ crossd, one,     forward, {-1,  0,  1} },
+	{ crossd, one,     up,      { 1, -1,  0} },
+	// This one fails when optimizing with -mfma (which is why fma is not
+	// used): ulp errors in z and w
+	{ crossd, qtest,   qtest,   {0, 0, 0, 0} },
+
+	{ qmuld, qident,  qident,   qident  },
+	{ qmuld, qident,  right,    right   },
+	{ qmuld, qident,  forward,  forward },
+	{ qmuld, qident,  up,       up      },
+	{ qmuld, right,   qident,   right   },
+	{ qmuld, forward, qident,   forward },
+	{ qmuld, up,      qident,   up      },
+	{ qmuld, right,   right,   nqident  },
+	{ qmuld, right,   forward,  up      },
+	{ qmuld, right,   up,      nforward },
+	{ qmuld, forward, right,   nup      },
+	{ qmuld, forward, forward, nqident  },
+	{ qmuld, forward, up,       right   },
+	{ qmuld, up,      right,    forward },
+	{ qmuld, up,      forward, nright   },
+	{ qmuld, up,      up,      nqident  },
+	{ qmuld, one,     one,     { 2, 2, 2, -2 } },
+	{ qmuld, one,     { 2, 2, 2, -2 }, { 0, 0, 0, -8 } },
+	// This one fails when optimizing with -mfma (which is why fma is not
+	// used): ulp error in z
+	{ qmuld, qtest,   qtest,   {0.768, 0.576, 0, -0.28} },
+
+	// The one vector is not unit (magnitude 2), so using it as a rotation
+	// quaternion results in scaling by 4. However, it still has the effect
+	// of rotating 120 degrees around the axis equidistant from the three
+	// orthogonal axes such that x->y->z->x
+	{ qvmuld, one,    right,     { 0, 4, 0, 0 } },
+	{ qvmuld, one,    forward,   { 0, 0, 4, 0 } },
+	{ qvmuld, one,    up,        { 4, 0, 0, 0 } },
+	{ qvmuld, one,    {1,1,1,0}, { 4, 4, 4, 0 } },
+	{ qvmuld, one,    one,       { 4, 4, 4, -2 } },
+	// The half vector is unit.
+	{ qvmuld, half,   right,     forward },
+	{ qvmuld, half,   forward,   up      },
+	{ qvmuld, half,   up,        right   },
+	{ qvmuld, half,   {1,1,1,0}, { 1, 1, 1, 0 } },
+	// one is a 4D vector and qvmuld is meant for 3D vectors. However, it
+	// seems that the vector's w has no effect on the 3d portion of the
+	// result, but the result's w is cosine of the full rotation angle
+	// scaled by quaternion magnitude and vector w
+	{ qvmuld, half,   one,       { 1, 1, 1, -0.5 } },
+	{ qvmuld, half,   {2,2,2,2}, { 2, 2, 2, -1 } },
+	{ qvmuld, qtest,  right,     {0.5392, 0.6144, -0.576, 0} },
+	{ qvmuld, qtest,  forward,   {0.6144, 0.1808, 0.768, 0},
+	                             {0, -2.7e-17, 0, 0} },
+	{ qvmuld, qtest,  up,        {0.576, -0.768, -0.28, 0} },
+
+	{ qrotd, right,   right,    qident },
+	{ qrotd, right,   forward,  {    0,    0,  s05,  s05 },
+	                            {0, 0, -1.1e-16, 0} },
+	{ qrotd, right,   up,       {    0, -s05,    0,  s05 },
+	                            {0, 1.1e-16, 0, 0} },
+	{ qrotd, forward, right,    {    0,    0, -s05,  s05 },
+	                            {0, 0, 1.1e-16, 0} },
+	{ qrotd, forward, forward,  qident },
+	{ qrotd, forward, up,       {  s05,    0,    0,  s05 },
+	                            {-1.1e-16, 0, 0, 0} },
+	{ qrotd, up,      right,    {    0,  s05,    0,  s05 },
+	                            {0, -1.1e-16, 0, 0} },
+	{ qrotd, up,      forward,  { -s05,    0,    0,  s05 },
+	                            { 1.1e-16, 0, 0, 0} },
+	{ qrotd, up,      up,       qident },
+};
+#define num_vec4d_tests (sizeof (vec4d_tests) / (sizeof (vec4d_tests[0])))
+
+static vec4f_test_t vec4f_tests[] = {
+	// 3D dot products
+	{ dotf, right,   right,   one  },
+	{ dotf, right,   forward, zero },
+	{ dotf, right,   up,      zero },
+	{ dotf, forward, right,   zero },
+	{ dotf, forward, forward, one  },
+	{ dotf, forward, up,      zero },
+	{ dotf, up,      right,   zero },
+	{ dotf, up,      forward, zero },
+	{ dotf, up,      up,      one  },
+
+	// one is 4D, so its self dot product is 4
+	{ dotf, one,     one,     { 4,  4,  4,  4} },
+	{ dotf, one,    none,     {-4, -4, -4, -4} },
+
+	// 3D cross products
+	{ crossf, right,   right,    zero    },
+	{ crossf, right,   forward,  up      },
+	{ crossf, right,   up,      nforward },
+	{ crossf, forward, right,   nup      },
+	{ crossf, forward, forward,  zero    },
+	{ crossf, forward, up,       right   },
+	{ crossf, up,      right,    forward },
+	{ crossf, up,      forward, nright   },
+	{ crossf, up,      up,       zero    },
+	// double whammy tests: cross product with an angled vector and
+	// ensuring that a 4d vector (non-zero w component) does not affect
+	// the result, including the result's w component remaining zero.
+	{ crossf, right,   one,     { 0, -1,  1} },
+	{ crossf, forward, one,     { 1,  0, -1} },
+	{ crossf, up,      one,     {-1,  1,  0} },
+	{ crossf, one,     right,   { 0,  1, -1} },
+	{ crossf, one,     forward, {-1,  0,  1} },
+	{ crossf, one,     up,      { 1, -1,  0} },
+	{ crossf, qtest,   qtest,   {0, 0, 0, 0} },
+
+	{ qmulf, qident,  qident,   qident  },
+	{ qmulf, qident,  right,    right   },
+	{ qmulf, qident,  forward,  forward },
+	{ qmulf, qident,  up,       up      },
+	{ qmulf, right,   qident,   right   },
+	{ qmulf, forward, qident,   forward },
+	{ qmulf, up,      qident,   up      },
+	{ qmulf, right,   right,   nqident  },
+	{ qmulf, right,   forward,  up      },
+	{ qmulf, right,   up,      nforward },
+	{ qmulf, forward, right,   nup      },
+	{ qmulf, forward, forward, nqident  },
+	{ qmulf, forward, up,       right   },
+	{ qmulf, up,      right,    forward },
+	{ qmulf, up,      forward, nright   },
+	{ qmulf, up,      up,      nqident  },
+	{ qmulf, one,     one,     { 2, 2, 2, -2 } },
+	{ qmulf, one,     { 2, 2, 2, -2 }, { 0, 0, 0, -8 } },
+	{ qmulf, qtest,   qtest,   {0.768, 0.576, 0, -0.28},
+	                           {0, 6e-8, 0, 3e-8} },
+
+	// The one vector is not unit (magnitude 2), so using it as a rotation
+	// quaternion results in scaling by 4. However, it still has the effect
+	// of rotating 120 degrees around the axis equidistant from the three
+	// orthogonal axes such that x->y->z->x
+	{ qvmulf, one,    right,     { 0, 4, 0, 0 } },
+	{ qvmulf, one,    forward,   { 0, 0, 4, 0 } },
+	{ qvmulf, one,    up,        { 4, 0, 0, 0 } },
+	{ qvmulf, one,    {1,1,1,0}, { 4, 4, 4, 0 } },
+	{ qvmulf, one,    one,       { 4, 4, 4, -2 } },
+	{ qvmulf, qtest,  right,     {0.5392, 0.6144, -0.576, 0},
+	                             {0, -5.9e-08, -6e-8, 0} },
+	{ qvmulf, qtest,  forward,   {0.6144, 0.1808, 0.768, 0},
+	                             {-5.9e-08, 1.5e-08, 0, 0} },
+	{ qvmulf, qtest,  up,        {0.576, -0.768, -0.28, 0},
+	                             {6e-8, 0, 3e-8, 0} },
+
+	{ qrotf, right,   right,    qident },
+	{ qrotf, right,   forward,  {    0,    0,  s05,  s05 } },
+	{ qrotf, right,   up,       {    0, -s05,    0,  s05 } },
+	{ qrotf, forward, right,    {    0,    0, -s05,  s05 } },
+	{ qrotf, forward, forward,  qident },
+	{ qrotf, forward, up,       {  s05,    0,    0,  s05 } },
+	{ qrotf, up,      right,    {    0,  s05,    0,  s05 } },
+	{ qrotf, up,      forward,  { -s05,    0,    0,  s05 } },
+	{ qrotf, up,      up,       qident },
+};
+#define num_vec4f_tests (sizeof (vec4f_tests) / (sizeof (vec4f_tests[0])))
+
+static int
+run_vec4d_tests (void)
+{
+	int         ret = 0;
+
+	for (size_t i = 0; i < num_vec4d_tests; i++) {
+		__auto_type test = &vec4d_tests[i];
+		vec4d_t     result = test->op (test->a, test->b);
+		vec4d_t     expect = test->expect + test->ulp_errors;
+		vec4l_t     res = result != expect;
+		if (res[0] || res[1] || res[2] || res[3]) {
+			ret |= 1;
+			printf ("\nrun_vec4d_tests\n");
+			printf ("a: " VEC4D_FMT "\n", VEC4_EXP(test->a));
+			printf ("b: " VEC4D_FMT "\n", VEC4_EXP(test->b));
+			printf ("r: " VEC4D_FMT "\n", VEC4_EXP(result));
+			printf ("t: " VEC4L_FMT "\n", VEC4_EXP(res));
+			printf ("E: " VEC4D_FMT "\n", VEC4_EXP(expect));
+			printf ("e: " VEC4D_FMT "\n", VEC4_EXP(test->expect));
+			printf ("u: " VEC4D_FMT "\n", VEC4_EXP(test->ulp_errors));
+		}
+	}
+	return ret;
+}
+
+static int
+run_vec4f_tests (void)
+{
+	int         ret = 0;
+
+	for (size_t i = 0; i < num_vec4f_tests; i++) {
+		__auto_type test = &vec4f_tests[i];
+		vec4f_t     result = test->op (test->a, test->b);
+		vec4f_t     expect = test->expect + test->ulp_errors;
+		vec4i_t     res = result != expect;
+		if (res[0] || res[1] || res[2] || res[3]) {
+			ret |= 1;
+			printf ("\nrun_vec4f_tests\n");
+			printf ("a: " VEC4F_FMT "\n", VEC4_EXP(test->a));
+			printf ("b: " VEC4F_FMT "\n", VEC4_EXP(test->b));
+			printf ("r: " VEC4F_FMT "\n", VEC4_EXP(result));
+			printf ("t: " VEC4I_FMT "\n", VEC4_EXP(res));
+			printf ("E: " VEC4F_FMT "\n", VEC4_EXP(expect));
+			printf ("e: " VEC4F_FMT "\n", VEC4_EXP(test->expect));
+			printf ("u: " VEC4F_FMT "\n", VEC4_EXP(test->ulp_errors));
+		}
+	}
+	return ret;
+}
+
+int
+main (void)
+{
+	int         ret = 0;
+	ret |= run_vec4d_tests ();
+	ret |= run_vec4f_tests ();
+	return ret;
+}