From d404519cce565402aa98c3f9943221ed6ddb2790 Mon Sep 17 00:00:00 2001
From: Zack Middleton <zack@cloemail.com>
Date: Mon, 29 Apr 2019 14:39:10 -0500
Subject: [PATCH] Fix rendering IQM models between model frames

For lerped frames (refEntity_t frame not equal oldframe) IQM joint
matrices may have incorrect axis scale. This can cause significant model
distortion. The matrix lerp is linear causing each vector to move in a
straight line between frames instead of arcing like a circle. Each joint
frame can have a different scale so can't just normalize the joint
matrix.

Store joints as quaternions and spherical lerp between them and then
convert to a matrix. For my test model, setting up the skeleton is four
times slower now but it still seems to be fast enough to be usable.
---
 code/qcommon/q_shared.h         |   4 +
 code/renderergl1/tr_local.h     |  11 +-
 code/renderergl1/tr_model_iqm.c | 216 ++++++++++++++++++++------------
 code/renderergl2/tr_local.h     |  11 +-
 code/renderergl2/tr_model_iqm.c | 216 ++++++++++++++++++++------------
 5 files changed, 294 insertions(+), 164 deletions(-)

diff --git a/code/qcommon/q_shared.h b/code/qcommon/q_shared.h
index 7d8b509c..86d9ac47 100644
--- a/code/qcommon/q_shared.h
+++ b/code/qcommon/q_shared.h
@@ -368,6 +368,8 @@ typedef vec_t vec3_t[3];
 typedef vec_t vec4_t[4];
 typedef vec_t vec5_t[5];
 
+typedef vec_t quat_t[4];
+
 typedef	int	fixed4_t;
 typedef	int	fixed8_t;
 typedef	int	fixed16_t;
@@ -578,6 +580,8 @@ typedef struct {
 
 #define Byte4Copy(a,b)			((b)[0]=(a)[0],(b)[1]=(a)[1],(b)[2]=(a)[2],(b)[3]=(a)[3])
 
+#define QuatCopy(a,b)			((b)[0]=(a)[0],(b)[1]=(a)[1],(b)[2]=(a)[2],(b)[3]=(a)[3])
+
 #define	SnapVector(v) {v[0]=((int)(v[0]));v[1]=((int)(v[1]));v[2]=((int)(v[2]));}
 // just in case you don't want to use the macros
 vec_t _DotProduct( const vec3_t v1, const vec3_t v2 );
diff --git a/code/renderergl1/tr_local.h b/code/renderergl1/tr_local.h
index fe42f4de..e07f575b 100644
--- a/code/renderergl1/tr_local.h
+++ b/code/renderergl1/tr_local.h
@@ -589,6 +589,12 @@ typedef struct {
 	drawVert_t		*verts;
 } srfTriangles_t;
 
+typedef struct {
+	vec3_t translate;
+	quat_t rotate;
+	vec3_t scale;
+} iqmTransform_t;
+
 // inter-quake-model
 typedef struct {
 	int		num_vertexes;
@@ -623,8 +629,9 @@ typedef struct {
 
 	char		*jointNames;
 	int		*jointParents;
-	float		*jointMats;
-	float		*poseMats;
+	float		*bindJoints; // [num_joints * 12]
+	float		*invBindJoints; // [num_joints * 12]
+	iqmTransform_t	*poses; // [num_frames * num_poses]
 	float		*bounds;
 } iqmData_t;
 
diff --git a/code/renderergl1/tr_model_iqm.c b/code/renderergl1/tr_model_iqm.c
index 02616469..fc0e5810 100644
--- a/code/renderergl1/tr_model_iqm.c
+++ b/code/renderergl1/tr_model_iqm.c
@@ -2,6 +2,7 @@
 ===========================================================================
 Copyright (C) 2011 Thilo Schulz <thilo@tjps.eu>
 Copyright (C) 2011 Matthias Bentrup <matthias.bentrup@googlemail.com>
+Copyright (C) 2011-2019 Zack Middleton <zturtleman@gmail.com>
 
 This file is part of Quake III Arena source code.
 
@@ -44,7 +45,7 @@ static qboolean IQM_CheckRange( iqmHeader_t *header, int offset,
 }
 // "multiply" 3x4 matrices, these are assumed to be the top 3 rows
 // of a 4x4 matrix with the last row = (0 0 0 1)
-static void Matrix34Multiply( float *a, float *b, float *out ) {
+static void Matrix34Multiply( const float *a, const float *b, float *out ) {
 	out[ 0] = a[0] * b[0] + a[1] * b[4] + a[ 2] * b[ 8];
 	out[ 1] = a[0] * b[1] + a[1] * b[5] + a[ 2] * b[ 9];
 	out[ 2] = a[0] * b[2] + a[1] * b[6] + a[ 2] * b[10];
@@ -58,23 +59,7 @@ static void Matrix34Multiply( float *a, float *b, float *out ) {
 	out[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10];
 	out[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11];
 }
-static void InterpolateMatrix( float *a, float *b, float lerp, float *mat ) {
-	float unLerp = 1.0f - lerp;
-
-	mat[ 0] = a[ 0] * unLerp + b[ 0] * lerp;
-	mat[ 1] = a[ 1] * unLerp + b[ 1] * lerp;
-	mat[ 2] = a[ 2] * unLerp + b[ 2] * lerp;
-	mat[ 3] = a[ 3] * unLerp + b[ 3] * lerp;
-	mat[ 4] = a[ 4] * unLerp + b[ 4] * lerp;
-	mat[ 5] = a[ 5] * unLerp + b[ 5] * lerp;
-	mat[ 6] = a[ 6] * unLerp + b[ 6] * lerp;
-	mat[ 7] = a[ 7] * unLerp + b[ 7] * lerp;
-	mat[ 8] = a[ 8] * unLerp + b[ 8] * lerp;
-	mat[ 9] = a[ 9] * unLerp + b[ 9] * lerp;
-	mat[10] = a[10] * unLerp + b[10] * lerp;
-	mat[11] = a[11] * unLerp + b[11] * lerp;
-}
-static void JointToMatrix( vec4_t rot, vec3_t scale, vec3_t trans,
+static void JointToMatrix( const quat_t rot, const vec3_t scale, const vec3_t trans,
 			   float *mat ) {
 	float xx = 2.0f * rot[0] * rot[0];
 	float yy = 2.0f * rot[1] * rot[1];
@@ -99,8 +84,7 @@ static void JointToMatrix( vec4_t rot, vec3_t scale, vec3_t trans,
 	mat[10] = scale[2] * (1.0f - (xx + yy));
 	mat[11] = trans[2];
 }
-static void Matrix34Invert( float *inMat, float *outMat )
-{
+static void Matrix34Invert( const float *inMat, float *outMat ) {
 	vec3_t trans;
 	float invSqrLen, *v;
 
@@ -120,6 +104,61 @@ static void Matrix34Invert( float *inMat, float *outMat )
 	outMat[ 7] = -DotProduct(outMat + 4, trans);
 	outMat[11] = -DotProduct(outMat + 8, trans);
 }
+static void QuatSlerp(const quat_t from, const quat_t _to, float fraction, quat_t out) {
+	float angle, cosAngle, sinAngle, backlerp, lerp;
+	quat_t to;
+
+	// cos() of angle
+	cosAngle = from[0] * _to[0] + from[1] * _to[1] + from[2] * _to[2] + from[3] * _to[3];
+
+	// negative handling is needed for taking shortest path (required for model joints)
+	if ( cosAngle < 0.0f ) {
+		cosAngle = -cosAngle;
+		to[0] = - _to[0];
+		to[1] = - _to[1];
+		to[2] = - _to[2];
+		to[3] = - _to[3];
+	} else {
+		QuatCopy( _to, to );
+	}
+
+	if ( cosAngle < 0.999999f ) {
+		// spherical lerp (slerp)
+		angle = acosf( cosAngle );
+		sinAngle = sinf( angle );
+		backlerp = sinf( ( 1.0f - fraction ) * angle ) / sinAngle;
+		lerp = sinf( fraction * angle ) / sinAngle;
+	} else {
+		// linear lerp
+		backlerp = 1.0f - fraction;
+		lerp = fraction;
+	}
+
+	out[0] = from[0] * backlerp + to[0] * lerp;
+	out[1] = from[1] * backlerp + to[1] * lerp;
+	out[2] = from[2] * backlerp + to[2] * lerp;
+	out[3] = from[3] * backlerp + to[3] * lerp;
+}
+static vec_t QuatNormalize2( const quat_t v, quat_t out) {
+	float	length, ilength;
+
+	length = v[0]*v[0] + v[1]*v[1] + v[2]*v[2] + v[3]*v[3];
+
+	if (length) {
+		/* writing it this way allows gcc to recognize that rsqrt can be used */
+		ilength = 1/(float)sqrt (length);
+		/* sqrt(length) = length * (1 / sqrt(length)) */
+		length *= ilength;
+		out[0] = v[0]*ilength;
+		out[1] = v[1]*ilength;
+		out[2] = v[2]*ilength;
+		out[3] = v[3]*ilength;
+	} else {
+		out[0] = out[1] = out[2] = out[3] = 0;
+	}
+
+	return length;
+}
 
 /*
 =================
@@ -139,7 +178,7 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 	unsigned short		*framedata;
 	char			*str;
 	int			i, j, k;
-	float			jointInvMats[IQM_MAX_JOINTS * 12] = {0.0f};
+	iqmTransform_t		*transform;
 	float			*mat, *matInv;
 	size_t			size, joint_names;
 	byte			*dataPtr;
@@ -559,10 +598,11 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 	if( header->num_joints ) {
 		size += joint_names;								// joint names
 		size += header->num_joints * sizeof(int);			// joint parents
-		size += header->num_joints * 12 * sizeof( float );	// joint mats
+		size += header->num_joints * 12 * sizeof(float);	// bind joint matricies
+		size += header->num_joints * 12 * sizeof(float);	// inverse bind joint matricies
 	}
 	if( header->num_poses ) {
-		size += header->num_poses * header->num_frames * 12 * sizeof( float ); // pose mats
+		size += header->num_poses * header->num_frames * sizeof(iqmTransform_t); // pose transforms
 	}
 	if( header->ofs_bounds ) {
 		size += header->num_frames * 6 * sizeof(float);		// model bounds
@@ -633,12 +673,15 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 		iqmData->jointParents = (int*)dataPtr;
 		dataPtr += header->num_joints * sizeof(int);		// joint parents
 
-		iqmData->jointMats = (float*)dataPtr;
-		dataPtr += header->num_joints * 12 * sizeof( float ); // joint mats
+		iqmData->bindJoints = (float*)dataPtr;
+		dataPtr += header->num_joints * 12 * sizeof(float);		// bind joint matricies
+
+		iqmData->invBindJoints = (float*)dataPtr;
+		dataPtr += header->num_joints * 12 * sizeof(float);		// inverse bind joint matricies
 	}
 	if( header->num_poses ) {
-		iqmData->poseMats = (float*)dataPtr;
-		dataPtr += header->num_poses * header->num_frames * 12 * sizeof( float ); // pose mats
+		iqmData->poses = (iqmTransform_t*)dataPtr;
+		dataPtr += header->num_poses * header->num_frames * sizeof(iqmTransform_t); // pose transforms
 	}
 	if( header->ofs_bounds ) {
 		iqmData->bounds = (float*)dataPtr;
@@ -804,22 +847,23 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 			iqmData->jointParents[i] = joint->parent;
 		}
 
-		// calculate joint matrices and their inverses
-		// joint inverses are needed only until the pose matrices are calculated
-		mat = iqmData->jointMats;
-		matInv = jointInvMats;
+		// calculate bind joint matrices and their inverses
+		mat = iqmData->bindJoints;
+		matInv = iqmData->invBindJoints;
 		joint = (iqmJoint_t *)((byte *)header + header->ofs_joints);
 		for( i = 0; i < header->num_joints; i++, joint++ ) {
 			float baseFrame[12], invBaseFrame[12];
 
+			QuatNormalize2( joint->rotate, joint->rotate );
+
 			JointToMatrix( joint->rotate, joint->scale, joint->translate, baseFrame );
 			Matrix34Invert( baseFrame, invBaseFrame );
 
 			if ( joint->parent >= 0 )
 			{
-				Matrix34Multiply( iqmData->jointMats + 12 * joint->parent, baseFrame, mat );
+				Matrix34Multiply( iqmData->bindJoints + 12 * joint->parent, baseFrame, mat );
 				mat += 12;
-				Matrix34Multiply( invBaseFrame, jointInvMats + 12 * joint->parent, matInv );
+				Matrix34Multiply( invBaseFrame, iqmData->invBindJoints + 12 * joint->parent, matInv );
 				matInv += 12;
 			}
 			else
@@ -834,16 +878,15 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 
 	if( header->num_poses )
 	{
-		// calculate pose matrices
+		// calculate pose transforms
+		transform = iqmData->poses;
 		framedata = (unsigned short *)((byte *)header + header->ofs_frames);
-		mat = iqmData->poseMats;
 		for( i = 0; i < header->num_frames; i++ ) {
 			pose = (iqmPose_t *)((byte *)header + header->ofs_poses);
-			for( j = 0; j < header->num_poses; j++, pose++ ) {
+			for( j = 0; j < header->num_poses; j++, pose++, transform++ ) {
 				vec3_t	translate;
-				vec4_t	rotate;
+				quat_t	rotate;
 				vec3_t	scale;
-				float	mat1[12], mat2[12];
 
 				translate[0] = pose->channeloffset[0];
 				if( pose->mask & 0x001)
@@ -878,18 +921,9 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 				if( pose->mask & 0x200)
 					scale[2] += *framedata++ * pose->channelscale[9];
 
-				// construct transformation matrix
-				JointToMatrix( rotate, scale, translate, mat1 );
-
-				if( pose->parent >= 0 ) {
-					Matrix34Multiply( iqmData->jointMats + 12 * pose->parent,
-							  mat1, mat2 );
-				} else {
-					Com_Memcpy( mat2, mat1, sizeof(mat1) );
-				}
-
-				Matrix34Multiply( mat2, jointInvMats + 12 * j, mat );
-				mat += 12;
+				VectorCopy( translate, transform->translate );
+				QuatNormalize2( rotate, transform->rotate );
+				VectorCopy( scale, transform->scale );
 			}
 		}
 	}
@@ -1128,37 +1162,59 @@ void R_AddIQMSurfaces( trRefEntity_t *ent ) {
 
 
 static void ComputePoseMats( iqmData_t *data, int frame, int oldframe,
-			      float backlerp, float *mat ) {
-	float	*mat1, *mat2;
-	int	*joint = data->jointParents;
-	int	i;
+			      float backlerp, float *poseMats ) {
+	iqmTransform_t relativeJoints[IQM_MAX_JOINTS];
+	iqmTransform_t *relativeJoint;
+	const iqmTransform_t *pose;
+	const iqmTransform_t *oldpose;
+	const int *jointParent;
+	const float *invBindMat;
+	float *poseMat, lerp;
+	int i;
 
+	relativeJoint = relativeJoints;
+
+	// copy or lerp animation frame pose
 	if ( oldframe == frame ) {
-		mat1 = data->poseMats + 12 * data->num_poses * frame;
-		for( i = 0; i < data->num_poses; i++, joint++ ) {
-			if( *joint >= 0 ) {
-				Matrix34Multiply( mat + 12 * *joint,
-						  mat1 + 12*i, mat + 12*i );
-			} else {
-				Com_Memcpy( mat + 12*i, mat1 + 12*i, 12 * sizeof(float) );
-			}
+		pose = &data->poses[frame * data->num_poses];
+		for ( i = 0; i < data->num_poses; i++, pose++, relativeJoint++ ) {
+			VectorCopy( pose->translate, relativeJoint->translate );
+			QuatCopy( pose->rotate, relativeJoint->rotate );
+			VectorCopy( pose->scale, relativeJoint->scale );
 		}
-	} else  {
-		mat1 = data->poseMats + 12 * data->num_poses * frame;
-		mat2 = data->poseMats + 12 * data->num_poses * oldframe;
-		
-		for( i = 0; i < data->num_poses; i++, joint++ ) {
-			if( *joint >= 0 ) {
-				float tmpMat[12];
-				InterpolateMatrix( mat1 + 12*i, mat2 + 12*i,
-						   backlerp, tmpMat );
-				Matrix34Multiply( mat + 12 * *joint,
-						  tmpMat, mat + 12*i );
-				
-			} else {
-				InterpolateMatrix( mat1 + 12*i, mat2 + 12*i,
-						   backlerp, mat + 12*i );
-			}
+	} else {
+		lerp = 1.0f - backlerp;
+		pose = &data->poses[frame * data->num_poses];
+		oldpose = &data->poses[oldframe * data->num_poses];
+		for ( i = 0; i < data->num_poses; i++, oldpose++, pose++, relativeJoint++ ) {
+			relativeJoint->translate[0] = oldpose->translate[0] * backlerp + pose->translate[0] * lerp;
+			relativeJoint->translate[1] = oldpose->translate[1] * backlerp + pose->translate[1] * lerp;
+			relativeJoint->translate[2] = oldpose->translate[2] * backlerp + pose->translate[2] * lerp;
+
+			relativeJoint->scale[0] = oldpose->scale[0] * backlerp + pose->scale[0] * lerp;
+			relativeJoint->scale[1] = oldpose->scale[1] * backlerp + pose->scale[1] * lerp;
+			relativeJoint->scale[2] = oldpose->scale[2] * backlerp + pose->scale[2] * lerp;
+
+			QuatSlerp( oldpose->rotate, pose->rotate, lerp, relativeJoint->rotate );
+		}
+	}
+
+	// multiply by inverse of bind pose and parent 'pose mat' (bind pose transform matrix)
+	relativeJoint = relativeJoints;
+	jointParent = data->jointParents;
+	invBindMat = data->invBindJoints;
+	poseMat = poseMats;
+	for ( i = 0; i < data->num_poses; i++, relativeJoint++, jointParent++, invBindMat += 12, poseMat += 12 ) {
+		float mat1[12], mat2[12];
+
+		JointToMatrix( relativeJoint->rotate, relativeJoint->scale, relativeJoint->translate, mat1 );
+
+		if ( *jointParent >= 0 ) {
+			Matrix34Multiply( &data->bindJoints[(*jointParent)*12], mat1, mat2 );
+			Matrix34Multiply( mat2, invBindMat, mat1 );
+			Matrix34Multiply( &poseMats[(*jointParent)*12], mat1, poseMat );
+		} else {
+			Matrix34Multiply( mat1, invBindMat, poseMat );
 		}
 	}
 }
@@ -1169,7 +1225,7 @@ static void ComputeJointMats( iqmData_t *data, int frame, int oldframe,
 	int	i;
 
 	if ( data->num_poses == 0 ) {
-		Com_Memcpy( mat, data->jointMats, data->num_joints * 12 * sizeof(float) );
+		Com_Memcpy( mat, data->bindJoints, data->num_joints * 12 * sizeof(float) );
 		return;
 	}
 
@@ -1181,7 +1237,7 @@ static void ComputeJointMats( iqmData_t *data, int frame, int oldframe,
 
 		Com_Memcpy(outmat, mat1, sizeof(outmat));
 
-		Matrix34Multiply( outmat, data->jointMats + 12*i, mat1 );
+		Matrix34Multiply( outmat, data->bindJoints + 12*i, mat1 );
 	}
 }
 
diff --git a/code/renderergl2/tr_local.h b/code/renderergl2/tr_local.h
index 12b469c7..39f6f7a2 100644
--- a/code/renderergl2/tr_local.h
+++ b/code/renderergl2/tr_local.h
@@ -954,6 +954,12 @@ typedef struct srfBspSurface_s
 	float			*heightLodError;
 } srfBspSurface_t;
 
+typedef struct {
+	vec3_t translate;
+	quat_t rotate;
+	vec3_t scale;
+} iqmTransform_t;
+
 // inter-quake-model
 typedef struct {
 	int		num_vertexes;
@@ -988,8 +994,9 @@ typedef struct {
 
 	char		*jointNames;
 	int		*jointParents;
-	float		*jointMats;
-	float		*poseMats;
+	float		*bindJoints; // [num_joints * 12]
+	float		*invBindJoints; // [num_joints * 12]
+	iqmTransform_t	*poses; // [num_frames * num_poses]
 	float		*bounds;
 
 	int		numVaoSurfaces;
diff --git a/code/renderergl2/tr_model_iqm.c b/code/renderergl2/tr_model_iqm.c
index 1a6e1e5a..649c4dc4 100644
--- a/code/renderergl2/tr_model_iqm.c
+++ b/code/renderergl2/tr_model_iqm.c
@@ -2,6 +2,7 @@
 ===========================================================================
 Copyright (C) 2011 Thilo Schulz <thilo@tjps.eu>
 Copyright (C) 2011 Matthias Bentrup <matthias.bentrup@googlemail.com>
+Copyright (C) 2011-2019 Zack Middleton <zturtleman@gmail.com>
 
 This file is part of Quake III Arena source code.
 
@@ -44,7 +45,7 @@ static qboolean IQM_CheckRange( iqmHeader_t *header, int offset,
 }
 // "multiply" 3x4 matrices, these are assumed to be the top 3 rows
 // of a 4x4 matrix with the last row = (0 0 0 1)
-static void Matrix34Multiply( float *a, float *b, float *out ) {
+static void Matrix34Multiply( const float *a, const float *b, float *out ) {
 	out[ 0] = a[0] * b[0] + a[1] * b[4] + a[ 2] * b[ 8];
 	out[ 1] = a[0] * b[1] + a[1] * b[5] + a[ 2] * b[ 9];
 	out[ 2] = a[0] * b[2] + a[1] * b[6] + a[ 2] * b[10];
@@ -58,23 +59,7 @@ static void Matrix34Multiply( float *a, float *b, float *out ) {
 	out[10] = a[8] * b[2] + a[9] * b[6] + a[10] * b[10];
 	out[11] = a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11];
 }
-static void InterpolateMatrix( float *a, float *b, float lerp, float *mat ) {
-	float unLerp = 1.0f - lerp;
-
-	mat[ 0] = a[ 0] * unLerp + b[ 0] * lerp;
-	mat[ 1] = a[ 1] * unLerp + b[ 1] * lerp;
-	mat[ 2] = a[ 2] * unLerp + b[ 2] * lerp;
-	mat[ 3] = a[ 3] * unLerp + b[ 3] * lerp;
-	mat[ 4] = a[ 4] * unLerp + b[ 4] * lerp;
-	mat[ 5] = a[ 5] * unLerp + b[ 5] * lerp;
-	mat[ 6] = a[ 6] * unLerp + b[ 6] * lerp;
-	mat[ 7] = a[ 7] * unLerp + b[ 7] * lerp;
-	mat[ 8] = a[ 8] * unLerp + b[ 8] * lerp;
-	mat[ 9] = a[ 9] * unLerp + b[ 9] * lerp;
-	mat[10] = a[10] * unLerp + b[10] * lerp;
-	mat[11] = a[11] * unLerp + b[11] * lerp;
-}
-static void JointToMatrix( vec4_t rot, vec3_t scale, vec3_t trans,
+static void JointToMatrix( const quat_t rot, const vec3_t scale, const vec3_t trans,
 			   float *mat ) {
 	float xx = 2.0f * rot[0] * rot[0];
 	float yy = 2.0f * rot[1] * rot[1];
@@ -99,8 +84,7 @@ static void JointToMatrix( vec4_t rot, vec3_t scale, vec3_t trans,
 	mat[10] = scale[2] * (1.0f - (xx + yy));
 	mat[11] = trans[2];
 }
-static void Matrix34Invert( float *inMat, float *outMat )
-{
+static void Matrix34Invert( const float *inMat, float *outMat ) {
 	vec3_t trans;
 	float invSqrLen, *v;
 
@@ -120,6 +104,61 @@ static void Matrix34Invert( float *inMat, float *outMat )
 	outMat[ 7] = -DotProduct(outMat + 4, trans);
 	outMat[11] = -DotProduct(outMat + 8, trans);
 }
+static void QuatSlerp(const quat_t from, const quat_t _to, float fraction, quat_t out) {
+	float angle, cosAngle, sinAngle, backlerp, lerp;
+	quat_t to;
+
+	// cos() of angle
+	cosAngle = from[0] * _to[0] + from[1] * _to[1] + from[2] * _to[2] + from[3] * _to[3];
+
+	// negative handling is needed for taking shortest path (required for model joints)
+	if ( cosAngle < 0.0f ) {
+		cosAngle = -cosAngle;
+		to[0] = - _to[0];
+		to[1] = - _to[1];
+		to[2] = - _to[2];
+		to[3] = - _to[3];
+	} else {
+		QuatCopy( _to, to );
+	}
+
+	if ( cosAngle < 0.999999f ) {
+		// spherical lerp (slerp)
+		angle = acosf( cosAngle );
+		sinAngle = sinf( angle );
+		backlerp = sinf( ( 1.0f - fraction ) * angle ) / sinAngle;
+		lerp = sinf( fraction * angle ) / sinAngle;
+	} else {
+		// linear lerp
+		backlerp = 1.0f - fraction;
+		lerp = fraction;
+	}
+
+	out[0] = from[0] * backlerp + to[0] * lerp;
+	out[1] = from[1] * backlerp + to[1] * lerp;
+	out[2] = from[2] * backlerp + to[2] * lerp;
+	out[3] = from[3] * backlerp + to[3] * lerp;
+}
+static vec_t QuatNormalize2( const quat_t v, quat_t out) {
+	float	length, ilength;
+
+	length = v[0]*v[0] + v[1]*v[1] + v[2]*v[2] + v[3]*v[3];
+
+	if (length) {
+		/* writing it this way allows gcc to recognize that rsqrt can be used */
+		ilength = 1/(float)sqrt (length);
+		/* sqrt(length) = length * (1 / sqrt(length)) */
+		length *= ilength;
+		out[0] = v[0]*ilength;
+		out[1] = v[1]*ilength;
+		out[2] = v[2]*ilength;
+		out[3] = v[3]*ilength;
+	} else {
+		out[0] = out[1] = out[2] = out[3] = 0;
+	}
+
+	return length;
+}
 
 /*
 =================
@@ -139,7 +178,7 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 	unsigned short		*framedata;
 	char			*str;
 	int			i, j, k;
-	float			jointInvMats[IQM_MAX_JOINTS * 12] = {0.0f};
+	iqmTransform_t		*transform;
 	float			*mat, *matInv;
 	size_t			size, joint_names;
 	byte			*dataPtr;
@@ -562,10 +601,11 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 	if( header->num_joints ) {
 		size += joint_names;								// joint names
 		size += header->num_joints * sizeof(int);			// joint parents
-		size += header->num_joints * 12 * sizeof( float );	// joint mats
+		size += header->num_joints * 12 * sizeof(float);	// bind joint matricies
+		size += header->num_joints * 12 * sizeof(float);	// inverse bind joint matricies
 	}
 	if( header->num_poses ) {
-		size += header->num_poses * header->num_frames * 12 * sizeof( float ); // pose mats
+		size += header->num_poses * header->num_frames * sizeof(iqmTransform_t); // pose transforms
 	}
 	if( header->ofs_bounds ) {
 		size += header->num_frames * 6 * sizeof(float);		// model bounds
@@ -636,12 +676,15 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 		iqmData->jointParents = (int*)dataPtr;
 		dataPtr += header->num_joints * sizeof(int);		// joint parents
 
-		iqmData->jointMats = (float*)dataPtr;
-		dataPtr += header->num_joints * 12 * sizeof( float ); // joint mats
+		iqmData->bindJoints = (float*)dataPtr;
+		dataPtr += header->num_joints * 12 * sizeof(float);		// bind joint matricies
+
+		iqmData->invBindJoints = (float*)dataPtr;
+		dataPtr += header->num_joints * 12 * sizeof(float);		// inverse bind joint matricies
 	}
 	if( header->num_poses ) {
-		iqmData->poseMats = (float*)dataPtr;
-		dataPtr += header->num_poses * header->num_frames * 12 * sizeof( float ); // pose mats
+		iqmData->poses = (iqmTransform_t*)dataPtr;
+		dataPtr += header->num_poses * header->num_frames * sizeof(iqmTransform_t); // pose transforms
 	}
 	if( header->ofs_bounds ) {
 		iqmData->bounds = (float*)dataPtr;
@@ -807,22 +850,23 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 			iqmData->jointParents[i] = joint->parent;
 		}
 
-		// calculate joint matrices and their inverses
-		// joint inverses are needed only until the pose matrices are calculated
-		mat = iqmData->jointMats;
-		matInv = jointInvMats;
+		// calculate bind joint matrices and their inverses
+		mat = iqmData->bindJoints;
+		matInv = iqmData->invBindJoints;
 		joint = (iqmJoint_t *)((byte *)header + header->ofs_joints);
 		for( i = 0; i < header->num_joints; i++, joint++ ) {
 			float baseFrame[12], invBaseFrame[12];
 
+			QuatNormalize2( joint->rotate, joint->rotate );
+
 			JointToMatrix( joint->rotate, joint->scale, joint->translate, baseFrame );
 			Matrix34Invert( baseFrame, invBaseFrame );
 
 			if ( joint->parent >= 0 )
 			{
-				Matrix34Multiply( iqmData->jointMats + 12 * joint->parent, baseFrame, mat );
+				Matrix34Multiply( iqmData->bindJoints + 12 * joint->parent, baseFrame, mat );
 				mat += 12;
-				Matrix34Multiply( invBaseFrame, jointInvMats + 12 * joint->parent, matInv );
+				Matrix34Multiply( invBaseFrame, iqmData->invBindJoints + 12 * joint->parent, matInv );
 				matInv += 12;
 			}
 			else
@@ -837,16 +881,15 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 
 	if( header->num_poses )
 	{
-		// calculate pose matrices
+		// calculate pose transforms
+		transform = iqmData->poses;
 		framedata = (unsigned short *)((byte *)header + header->ofs_frames);
-		mat = iqmData->poseMats;
 		for( i = 0; i < header->num_frames; i++ ) {
 			pose = (iqmPose_t *)((byte *)header + header->ofs_poses);
-			for( j = 0; j < header->num_poses; j++, pose++ ) {
+			for( j = 0; j < header->num_poses; j++, pose++, transform++ ) {
 				vec3_t	translate;
-				vec4_t	rotate;
+				quat_t	rotate;
 				vec3_t	scale;
-				float	mat1[12], mat2[12];
 
 				translate[0] = pose->channeloffset[0];
 				if( pose->mask & 0x001)
@@ -881,18 +924,9 @@ qboolean R_LoadIQM( model_t *mod, void *buffer, int filesize, const char *mod_na
 				if( pose->mask & 0x200)
 					scale[2] += *framedata++ * pose->channelscale[9];
 
-				// construct transformation matrix
-				JointToMatrix( rotate, scale, translate, mat1 );
-
-				if( pose->parent >= 0 ) {
-					Matrix34Multiply( iqmData->jointMats + 12 * pose->parent,
-							  mat1, mat2 );
-				} else {
-					Com_Memcpy( mat2, mat1, sizeof(mat1) );
-				}
-
-				Matrix34Multiply( mat2, jointInvMats + 12 * j, mat );
-				mat += 12;
+				VectorCopy( translate, transform->translate );
+				QuatNormalize2( rotate, transform->rotate );
+				VectorCopy( scale, transform->scale );
 			}
 		}
 	}
@@ -1306,37 +1340,59 @@ void R_AddIQMSurfaces( trRefEntity_t *ent ) {
 
 
 static void ComputePoseMats( iqmData_t *data, int frame, int oldframe,
-			      float backlerp, float *mat ) {
-	float	*mat1, *mat2;
-	int	*joint = data->jointParents;
-	int	i;
+			      float backlerp, float *poseMats ) {
+	iqmTransform_t relativeJoints[IQM_MAX_JOINTS];
+	iqmTransform_t *relativeJoint;
+	const iqmTransform_t *pose;
+	const iqmTransform_t *oldpose;
+	const int *jointParent;
+	const float *invBindMat;
+	float *poseMat, lerp;
+	int i;
 
+	relativeJoint = relativeJoints;
+
+	// copy or lerp animation frame pose
 	if ( oldframe == frame ) {
-		mat1 = data->poseMats + 12 * data->num_poses * frame;
-		for( i = 0; i < data->num_poses; i++, joint++ ) {
-			if( *joint >= 0 ) {
-				Matrix34Multiply( mat + 12 * *joint,
-						  mat1 + 12*i, mat + 12*i );
-			} else {
-				Com_Memcpy( mat + 12*i, mat1 + 12*i, 12 * sizeof(float) );
-			}
+		pose = &data->poses[frame * data->num_poses];
+		for ( i = 0; i < data->num_poses; i++, pose++, relativeJoint++ ) {
+			VectorCopy( pose->translate, relativeJoint->translate );
+			QuatCopy( pose->rotate, relativeJoint->rotate );
+			VectorCopy( pose->scale, relativeJoint->scale );
 		}
-	} else  {
-		mat1 = data->poseMats + 12 * data->num_poses * frame;
-		mat2 = data->poseMats + 12 * data->num_poses * oldframe;
-		
-		for( i = 0; i < data->num_poses; i++, joint++ ) {
-			if( *joint >= 0 ) {
-				float tmpMat[12];
-				InterpolateMatrix( mat1 + 12*i, mat2 + 12*i,
-						   backlerp, tmpMat );
-				Matrix34Multiply( mat + 12 * *joint,
-						  tmpMat, mat + 12*i );
-				
-			} else {
-				InterpolateMatrix( mat1 + 12*i, mat2 + 12*i,
-						   backlerp, mat + 12*i );
-			}
+	} else {
+		lerp = 1.0f - backlerp;
+		pose = &data->poses[frame * data->num_poses];
+		oldpose = &data->poses[oldframe * data->num_poses];
+		for ( i = 0; i < data->num_poses; i++, oldpose++, pose++, relativeJoint++ ) {
+			relativeJoint->translate[0] = oldpose->translate[0] * backlerp + pose->translate[0] * lerp;
+			relativeJoint->translate[1] = oldpose->translate[1] * backlerp + pose->translate[1] * lerp;
+			relativeJoint->translate[2] = oldpose->translate[2] * backlerp + pose->translate[2] * lerp;
+
+			relativeJoint->scale[0] = oldpose->scale[0] * backlerp + pose->scale[0] * lerp;
+			relativeJoint->scale[1] = oldpose->scale[1] * backlerp + pose->scale[1] * lerp;
+			relativeJoint->scale[2] = oldpose->scale[2] * backlerp + pose->scale[2] * lerp;
+
+			QuatSlerp( oldpose->rotate, pose->rotate, lerp, relativeJoint->rotate );
+		}
+	}
+
+	// multiply by inverse of bind pose and parent 'pose mat' (bind pose transform matrix)
+	relativeJoint = relativeJoints;
+	jointParent = data->jointParents;
+	invBindMat = data->invBindJoints;
+	poseMat = poseMats;
+	for ( i = 0; i < data->num_poses; i++, relativeJoint++, jointParent++, invBindMat += 12, poseMat += 12 ) {
+		float mat1[12], mat2[12];
+
+		JointToMatrix( relativeJoint->rotate, relativeJoint->scale, relativeJoint->translate, mat1 );
+
+		if ( *jointParent >= 0 ) {
+			Matrix34Multiply( &data->bindJoints[(*jointParent)*12], mat1, mat2 );
+			Matrix34Multiply( mat2, invBindMat, mat1 );
+			Matrix34Multiply( &poseMats[(*jointParent)*12], mat1, poseMat );
+		} else {
+			Matrix34Multiply( mat1, invBindMat, poseMat );
 		}
 	}
 }
@@ -1347,7 +1403,7 @@ static void ComputeJointMats( iqmData_t *data, int frame, int oldframe,
 	int	i;
 
 	if ( data->num_poses == 0 ) {
-		Com_Memcpy( mat, data->jointMats, data->num_joints * 12 * sizeof(float) );
+		Com_Memcpy( mat, data->bindJoints, data->num_joints * 12 * sizeof(float) );
 		return;
 	}
 
@@ -1359,7 +1415,7 @@ static void ComputeJointMats( iqmData_t *data, int frame, int oldframe,
 
 		Com_Memcpy(outmat, mat1, sizeof(outmat));
 
-		Matrix34Multiply( outmat, data->jointMats + 12*i, mat1 );
+		Matrix34Multiply( outmat, data->bindJoints + 12*i, mat1 );
 	}
 }