// quadrilateralcowboy/idlib/math/Simd_SSE.cpp

/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
//===============================================================
//
//	SSE implementation of idSIMDProcessor				MrE
//
//===============================================================
#if defined(MACOS_X) && defined(__i386__)
#include <xmmintrin.h>
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)
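// These offsets assume the idDrawVert layout:
// idVec3 xyz; idVec2 st; idVec3 normal; idVec3 tangents[2]; byte color[4];
// which totals 60 bytes.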
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
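// R_SHUFFLEPS takes its selectors in "register order" (x picks the lowest
// element), i.e. R_SHUFFLEPS( 0, 1, 2, 3 ) == SHUFFLEPS( 3, 2, 1, 0 ) == 0xE4,
// the identity shuffle.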
/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName( void ) const {
return "MMX & SSE";
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
// 0, 1, 2
// 3, 4, 5
// 6, 7, 8
// 9, 10, 11
/*
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
*/
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // Declare 8 xmm registers.
int count_l4 = count; // count_l4 = eax
int count_l1 = count; // count_l1 = edx
char *constant_p = (char *)&constant; // constant_p = edi
char *src_p = (char *) src; // src_p = esi
char *dst_p = (char *) dst; // dst_p = ecx
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
/*
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
*/
count_l4 = count_l4 & ~3;
xmm4 = _mm_load_ss((float *) (constant_p));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm5 = _mm_load_ss((float *) (constant_p + 4));
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm6 = _mm_load_ss((float *) (constant_p + 8));
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm7 = _mm_load_ss((float *) (constant_p + 12));
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
/*
jz startVert1
*/
if(count_l4 != 0) {
/*
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
*/
count_l4 = count_l4 * DRAWVERT_SIZE;
src_p = src_p + count_l4;
count_l4 = -count_l4;
/*
loopVert4:
*/
do {
/*
movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
movaps xmm1, xmm0 // 3, X, 0, 1
*/
xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, X, X
xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 2, X, X, X
xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, 0, 1
xmm1 = xmm0; // 3, X, 0, 1
/*
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
*/
xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4, 5, 0, 1
xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )); // 2, X, 4, 5
/*
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
*/
xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, X, X
xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, 6, 7
xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )); // 0, 3, 6, 9
/*
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
*/
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 10, 11, 6, 7
xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )); // 1, 4, 7, 10
/*
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
*/
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 10, 11, 8, X
xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )); // 2, 5, 8, 11
/*
add ecx, 16
add eax, 4*DRAWVERT_SIZE
*/
dst_p = dst_p + 16;
count_l4 = count_l4 + 4*DRAWVERT_SIZE;
/*
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
*/
xmm0 = _mm_mul_ps(xmm0, xmm4);
xmm1 = _mm_mul_ps(xmm1, xmm5);
xmm2 = _mm_mul_ps(xmm2, xmm6);
xmm0 = _mm_add_ps(xmm0, xmm7);
xmm0 = _mm_add_ps(xmm0, xmm1);
xmm0 = _mm_add_ps(xmm0, xmm2);
/*
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
*/
_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
} while(count_l4 < 0);
}
/*
startVert1:
and edx, 3
jz done
*/
count_l1 = count_l1 & 3;
if(count_l1 != 0) {
/*
loopVert1:
movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
addss xmm0, xmm7
add ecx, 4
addss xmm0, xmm1
add eax, DRAWVERT_SIZE
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
*/
do {
xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
xmm0 = _mm_mul_ss(xmm0, xmm4);
xmm1 = _mm_mul_ss(xmm1, xmm5);
xmm2 = _mm_mul_ss(xmm2, xmm6);
xmm0 = _mm_add_ss(xmm0, xmm7);
dst_p = dst_p + 4;
xmm0 = _mm_add_ss(xmm0, xmm1);
count_l4 = count_l4 + DRAWVERT_SIZE;
xmm0 = _mm_add_ss(xmm0, xmm2);
count_l1 = count_l1 - 1;
_mm_store_ss((float *) (dst_p-4), xmm0);
} while( count_l1 != 0);
}
/*
done:
*/
}
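/*
A scalar sketch of what the routine above computes; the SSE path transposes
four idDrawVerts at a time so it can evaluate four dot products per
iteration:

	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant.Normal() * src[i].xyz + constant[3];
	}
*/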
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
char *indexes_p;
char *src_p;
int count_l;
int edx;
char *min_p;
char *max_p;
/*
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
*/
xmm0 = _mm_load_ss(&idMath::INFINITY);
// xorps xmm1, xmm1 would read an uninitialized variable as an intrinsic, so zero via xmm0 ^ xmm0 instead.
xmm1 = _mm_xor_ps(xmm0, xmm0);
xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm1 = _mm_sub_ps(xmm1, xmm0);
xmm2 = xmm0;
xmm3 = xmm1;
/*
mov edi, indexes
mov esi, src
mov eax, count
and eax, ~3
jz done4
*/
indexes_p = (char *) indexes;
src_p = (char *) src;
count_l = count;
count_l = count_l & ~3;
if(count_l != 0) {
/*
shl eax, 2
add edi, eax
neg eax
*/
count_l = count_l << 2;
indexes_p = indexes_p + count_l;
count_l = -count_l;
/*
loop4:
// prefetchnta [edi+128]
// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
*/
do {
/*
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
*/
edx = *((int*)(indexes_p+count_l+0));
edx = edx * DRAWVERT_SIZE;
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm4);
xmm1 = _mm_max_ps(xmm1, xmm4);
/*
mov edx, [edi+eax+4]
imul edx, DRAWVERT_SIZE
movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm5
maxps xmm3, xmm5
*/
edx = *((int*)(indexes_p+count_l+4));
edx = edx * DRAWVERT_SIZE;
xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
xmm2 = _mm_min_ps(xmm2, xmm5);
xmm3 = _mm_max_ps(xmm3, xmm5);
/*
mov edx, [edi+eax+8]
imul edx, DRAWVERT_SIZE
movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm6
maxps xmm1, xmm6
*/
edx = *((int*)(indexes_p+count_l+8));
edx = edx * DRAWVERT_SIZE;
xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm6);
xmm1 = _mm_max_ps(xmm1, xmm6);
/*
mov edx, [edi+eax+12]
imul edx, DRAWVERT_SIZE
movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm7
maxps xmm3, xmm7
*/
edx = *((int*)(indexes_p+count_l+12));
edx = edx * DRAWVERT_SIZE;
xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
xmm2 = _mm_min_ps(xmm2, xmm7);
xmm3 = _mm_max_ps(xmm3, xmm7);
/*
add eax, 4*4
jl loop4
*/
count_l = count_l + 4*4;
} while (count_l < 0);
}
/*
done4:
mov eax, count
and eax, 3
jz done1
*/
count_l = count;
count_l = count_l & 3;
if(count_l != 0) {
/*
shl eax, 2
add edi, eax
neg eax
*/
count_l = count_l << 2;
indexes_p = indexes_p + count_l;
count_l = -count_l;
/*
loop1:
*/
do{
/*
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE;
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
*/
edx = *((int*)(indexes_p+count_l+0));
edx = edx * DRAWVERT_SIZE;
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm4);
xmm1 = _mm_max_ps(xmm1, xmm4);
/*
add eax, 4
jl loop1
*/
count_l = count_l + 4;
} while (count_l < 0);
}
/*
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
*/
xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
xmm0 = _mm_min_ps(xmm0, xmm2);
xmm1 = _mm_max_ps(xmm1, xmm3);
min_p = (char *) &min;
_mm_storeh_pi((__m64 *)(min_p), xmm0);
_mm_store_ss((float *)(min_p+8), xmm0);
max_p = (char *) &max;
_mm_storeh_pi((__m64 *)(max_p), xmm1);
_mm_store_ss((float *)(max_p+8), xmm1);
}
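/*
A scalar sketch of the routine above. The SSE path keeps two min/max
accumulator pairs (xmm0/xmm1 and xmm2/xmm3) whose lanes hold the vertex
components in different rotations, then merges them with the final shuffles:

	min.Set( idMath::INFINITY, idMath::INFINITY, idMath::INFINITY );
	max = -min;
	for ( int i = 0; i < count; i++ ) {
		const idVec3 &v = src[indexes[i]].xyz;
		if ( v.x < min.x ) { min.x = v.x; } if ( v.x > max.x ) { max.x = v.x; }
		// ...and likewise for y and z
	}
*/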
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
int count_l4;
int count_l1;
char *constant_p;
char *src_p;
char *dst_p;
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/*
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
*/
count_l4 = count;
constant_p = (char *) &constant;
count_l1 = count_l4;
src_p = (char *) src;
dst_p = (char *) dst;
count_l4 = count_l4 & ~3;
/*
movss xmm5, [edi+0]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
*/
xmm5 = _mm_load_ss((float *) (constant_p+0));
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm6 = _mm_load_ss((float *) (constant_p+4));
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm7 = _mm_load_ss((float *) (constant_p+8));
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
/*
jz startVert1
*/
if(count_l4 != 0) {
/*
imul eax, 16
add esi, eax
neg eax
*/
count_l4 = count_l4 * 16;
src_p = src_p + count_l4;
count_l4 = -count_l4;
/*
loopVert4:
*/
do {
/*
movlps xmm1, [esi+eax+ 0]
movlps xmm3, [esi+eax+ 8]
movhps xmm1, [esi+eax+16]
movhps xmm3, [esi+eax+24]
movlps xmm2, [esi+eax+32]
movlps xmm4, [esi+eax+40]
movhps xmm2, [esi+eax+48]
movhps xmm4, [esi+eax+56]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm2, xmm3
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
*/
xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
xmm0 = xmm1;
xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
xmm2 = xmm3;
xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
/*
add ecx, 16
add eax, 4*16
*/
dst_p = dst_p + 16;
count_l4 = count_l4 + 4*16;
/*
mulps xmm0, xmm5
mulps xmm1, xmm6
mulps xmm2, xmm7
addps xmm0, xmm3
addps xmm0, xmm1
addps xmm0, xmm2
*/
xmm0 = _mm_mul_ps(xmm0, xmm5);
xmm1 = _mm_mul_ps(xmm1, xmm6);
xmm2 = _mm_mul_ps(xmm2, xmm7);
xmm0 = _mm_add_ps(xmm0, xmm3);
xmm0 = _mm_add_ps(xmm0, xmm1);
xmm0 = _mm_add_ps(xmm0, xmm2);
/*
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
*/
_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
} while (count_l4 < 0);
}
/*
startVert1:
and edx, 3
jz done
*/
count_l1 = count_l1 & 3;
if(count_l1 != 0) {
/*
loopVert1:
*/
do {
/*
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm5
mulss xmm1, xmm6
mulss xmm2, xmm7
addss xmm0, [esi+eax+12]
add ecx, 4
addss xmm0, xmm1
add eax, 16
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
*/
xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
xmm0 = _mm_mul_ss(xmm0, xmm5);
xmm1 = _mm_mul_ss(xmm1, xmm6);
xmm2 = _mm_mul_ss(xmm2, xmm7);
xmm0 = _mm_add_ss(xmm0, xmm3);
dst_p = dst_p + 4;
xmm0 = _mm_add_ss(xmm0, xmm1);
count_l4 = count_l4 + 16;
xmm0 = _mm_add_ss(xmm0, xmm2);
count_l1 = count_l1 - 1;
_mm_store_ss((float *) (dst_p-4), xmm0);
} while (count_l1 != 0);
}
/*
done:
*/
}
#elif defined(_WIN32)
#include <xmmintrin.h>
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \
__asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \
__asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \
__asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \
__asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \
__asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \
__asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \
__asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \
__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \
__asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */
// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
__asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \
__asm movhps reg1, [address+16] /* reg1 = 0, 1, 4, 5 */ \
__asm movhps reg3, [address+24] /* reg3 = 2, 3, 6, 7 */ \
__asm movlps reg2, [address+32] /* reg2 = 8, 9, X, X */ \
__asm movlps reg4, [address+40] /* reg4 = 10, 11, X, X */ \
__asm movhps reg2, [address+48] /* reg2 = 8, 9, 12, 13 */ \
__asm movhps reg4, [address+56] /* reg4 = 10, 11, 14, 15 */ \
__asm movaps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg0 = 0, 4, 8, 12 */ \
__asm shufps reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg1 = 1, 5, 9, 13 */ \
__asm movaps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg2 = 2, 6, 10, 14 */ \
__asm shufps reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg3 = 3, 7, 11, 15 */
// transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg0 /* reg4 = 0, 4, 8, 12 */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
__asm unpckhps reg4, reg1 /* reg4 = 8, 9, 12, 13 */ \
__asm movaps reg1, reg2 /* reg1 = 2, 6, 10, 14 */ \
__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
__asm unpckhps reg1, reg3 /* reg1 = 10, 11, 14, 15 */ \
__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
__asm movhps [address+16], reg0 /* mem1 = 4, 5, X, X */ \
__asm movhps [address+24], reg2 /* mem1 = 4, 5, 6, 7 */ \
__asm movlps [address+32], reg4 /* mem2 = 8, 9, X, X */ \
__asm movlps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ \
__asm movhps [address+48], reg4 /* mem3 = 12, 13, X, X */ \
__asm movhps [address+56], reg1 /* mem3 = 12, 13, 14, 15 */
// transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \
__asm movaps reg3, reg2 /* reg3 = 8, 9, 10, 11 */ \
__asm shufps reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg3 = 10, 11, 4, 5 */ \
__asm shufps reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 ) /* reg2 = 8, 9, 2, 3 */ \
__asm shufps reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg1 = 6, 7, 0, 1 */ \
__asm movaps reg0, reg1 /* reg0 = 6, 7, 0, 1 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg0 = 0, 6, 3, 9 */ \
__asm shufps reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 ) /* reg1 = 1, 7, 4, 10 */ \
__asm shufps reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
__asm movlps reg2, [address+ 8] /* reg2 = 2, 3, X, X */ \
__asm movlps reg3, [address+16] /* reg3 = 4, 5, X, X */ \
__asm movhps reg1, [address+24] /* reg1 = 0, 1, 6, 7 */ \
__asm movhps reg2, [address+32] /* reg2 = 2, 3, 8, 9 */ \
__asm movhps reg3, [address+40] /* reg3 = 4, 5, 10, 11 */ \
__asm movaps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg0 = 0, 6, 3, 9 */ \
__asm shufps reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 ) /* reg1 = 1, 7, 4, 10 */ \
__asm shufps reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movhlps reg3, reg0 /* reg3 = 3, 9, X, X */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
__asm unpckhps reg1, reg2 /* reg1 = 4, 5, 10, 11 */ \
__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 8, 9 */ \
__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
__asm movlps [address+16], reg1 /* mem1 = 4, 5, X, X */ \
__asm movhps [address+24], reg0 /* mem1 = 4, 5, 6, 7 */ \
__asm movhps [address+32], reg2 /* mem2 = 8, 9, X, X */ \
__asm movhps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */
// with alignment
#define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\
__asm mov ecx,DST \
__asm shr ecx,2 \
__asm mov ebx,COUNT \
__asm neg ecx \
__asm mov edx,SRC0 \
__asm and ecx,3 \
__asm mov esi,SRC1 \
__asm sub ebx,ecx \
__asm jge noUnderFlow \
__asm xor ebx,ebx \
__asm mov ecx,COUNT \
__asm noUnderFlow: \
__asm mov PRE,ecx \
__asm mov eax,ebx \
__asm mov edi,DST \
__asm and eax,8-1 \
__asm mov POST,eax \
__asm and ebx,0xfffffff8 \
__asm jle done \
__asm shl ebx,2 \
__asm lea ecx,[ecx*4+ebx] \
__asm neg ebx \
__asm add edx,ecx \
__asm add esi,ecx \
__asm add edi,ecx \
__asm mov eax,edx \
__asm or eax,esi
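// After KFLOATINITDSS: PRE holds the element count needed to bring DST up to
// 16-byte alignment, POST the leftover count after the unrolled 8-float main
// loop, ebx the negated byte index for that loop, and eax the OR of the
// source pointers so callers can test source alignment with "and eax,15".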
// without alignment (pre==0)
#define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\
__asm mov eax,COUNT \
__asm mov PRE,0 \
__asm and eax,8-1 \
__asm mov ebx,COUNT \
__asm mov POST,eax \
__asm and ebx,0xfffffff8 \
__asm je done \
__asm shl ebx,2 \
__asm mov edx,SRC0 \
__asm mov esi,SRC1 \
__asm mov edi,DST \
__asm add edx,ebx \
__asm add esi,ebx \
__asm add edi,ebx \
__asm mov eax,edx \
__asm or eax,esi \
__asm or eax,edi \
__asm neg ebx \
/*
when OPER is called:
edx = s0
esi = s1
edi = d
ebx = index*4
xmm0 & xmm1 must not be trashed
*/
#define KMOVDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
__asm movss DST,xmm2
#define KMOVDS4( DST, SRC0 ) \
__asm movups xmm2,SRC0 \
__asm movups DST,xmm2
#define KMINDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
__asm minss DST,xmm2
#define KMAXDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
__asm maxss DST,xmm2
// general ALU operation
#define KALUDSS1( OP, DST, SRC0, SRC1 ) \
__asm movss xmm2,SRC0 \
__asm OP##ss xmm2,SRC1 \
__asm movss DST,xmm2
#define KALUDSS4( OP, DST, SRC0, SRC1 ) \
__asm movups xmm2,SRC0 \
__asm movups xmm3,SRC1 \
__asm OP##ps xmm2,xmm3 \
__asm movups DST,xmm2
#define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST,SRC0,SRC1 )
#define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST,SRC0,SRC1 )
#define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST,SRC0,SRC1 )
#define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST,SRC0,SRC1 )
#define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST,SRC0,SRC1 )
#define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST,SRC0,SRC1 )
#define KDIVDSS1( DST, SRC0, SRC1 ) \
__asm movss xmm2,SRC1 \
__asm rcpss xmm3,xmm2 \
__asm mulss xmm2,xmm3 \
__asm mulss xmm2,xmm3 \
__asm addss xmm3,xmm3 \
__asm subss xmm3,xmm2 \
__asm mulss xmm3,SRC0 \
__asm movss DST,xmm3
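// rcpss is only accurate to ~12 bits; the mul/mul/add/sub sequence above is
// one Newton-Raphson step, r' = 2*r - x*r*r, refining 1/x to near full
// single precision before the final multiply by SRC0.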
#define KDIVDSS4( DST, SRC0, SRC1 ) \
__asm movups xmm2,SRC1 \
__asm rcpps xmm3,xmm2 \
__asm mulps xmm2,xmm3 \
__asm mulps xmm2,xmm3 \
__asm addps xmm3,xmm3 \
__asm subps xmm3,xmm2 \
__asm movups xmm2,SRC0 \
__asm mulps xmm3,xmm2 \
__asm movups DST,xmm3
#define KF2IDS1( SRC0 ) \
__asm movss xmm2,SRC0 \
__asm cvttps2pi mm2,xmm2 \
__asm movd [edi+ebx],mm2
#define KF2IDS4( SRC0 ) \
__asm movups xmm2,SRC0 \
__asm cvttps2pi mm2,xmm2 \
__asm movq [edi+ebx+0],mm2 \
__asm shufps xmm2,xmm2,SHUFFLEPS(1,0,3,2) \
__asm cvttps2pi mm2,xmm2 \
__asm movq [edi+ebx+8],mm2
#define KISQRTDS1( DST,SRC0 ) \
__asm movss xmm2,SRC0 \
__asm rsqrtss xmm3,xmm2 \
__asm mulss xmm2,xmm3 \
__asm mulss xmm2,xmm3 \
__asm subss xmm2,xmm1 \
__asm mulss xmm3,xmm0 \
__asm mulss xmm3,xmm2 \
__asm movss DST,xmm3
#define KISQRTDS4( DST,SRC0 ) \
__asm movups xmm2,SRC0 \
__asm rsqrtps xmm3,xmm2 \
__asm mulps xmm2,xmm3 \
__asm mulps xmm2,xmm3 \
__asm subps xmm2,xmm1 \
__asm mulps xmm3,xmm0 \
__asm mulps xmm3,xmm2 \
__asm movups DST,xmm3
// this is used in vector4 implementation to shift constant V4
#define KANDREGDSV( DST, SRC0, VALUE ) \
__asm mov DST,SRC0 \
__asm and DST,VALUE
// this is used in vector4 code to operate with float arrays as sources
#define KEXPANDFLOAT( DST, SRC ) \
__asm movss DST,SRC \
__asm shufps DST,DST,0
#define KADDDS1( DST,SRC ) KADDDSS1( DST,DST,SRC )
#define KADDDS4( DST,SRC ) KADDDSS4( DST,DST,SRC )
#define KSUBDS1( DST,SRC ) KSUBDSS1( DST,DST,SRC )
#define KSUBDS4( DST,SRC ) KSUBDSS4( DST,DST,SRC )
#define KMULDS1( DST,SRC ) KMULDSS1( DST,DST,SRC )
#define KMULDS4( DST,SRC ) KMULDSS4( DST,DST,SRC )
#define KDIVDS1( DST,SRC ) KDIVDSS1( DST,DST,SRC )
#define KDIVDS4( DST,SRC ) KDIVDSS4( DST,DST,SRC )
// handles pre & post leftovers
#define KFLOATOPER( OPER, OPER4, COUNT ) \
__asm mov ecx,pre \
__asm mov ebx,COUNT \
__asm cmp ebx,ecx \
__asm cmovl ecx,COUNT \
__asm test ecx,ecx \
__asm je preDone \
__asm xor ebx,ebx \
__asm lpPre: \
OPER \
__asm add ebx,4 \
__asm dec ecx \
__asm jg lpPre \
__asm preDone: \
__asm mov ecx,post \
__asm mov ebx,COUNT \
__asm sub ebx,ecx \
__asm shl ebx,2 \
__asm cmp ecx,4 \
__asm jl post4Done \
OPER4 \
__asm sub ecx,4 \
__asm add ebx,4*4 \
__asm post4Done: \
__asm test ecx,ecx \
__asm je postDone \
__asm lpPost: \
OPER \
__asm add ebx,4 \
__asm dec ecx \
__asm jg lpPost \
__asm postDone:
// operate on a constant and a float array
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \
int pre,post; \
__asm movss xmm0,CONSTANT \
__asm shufps xmm0,xmm0,0 \
KFLOATINITDS( DST, SRC, COUNT, pre, post ) \
__asm and eax,15 \
__asm jne lpNA \
__asm jmp lpA \
__asm align 16 \
__asm lpA: \
__asm prefetchnta [edx+ebx+64] \
__asm movaps xmm1,xmm0 \
__asm movaps xmm2,xmm0 \
__asm ALUOP##ps xmm1,[edx+ebx] \
__asm ALUOP##ps xmm2,[edx+ebx+16] \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpA \
__asm jmp done \
__asm align 16 \
__asm lpNA: \
__asm prefetchnta [edx+ebx+64] \
__asm movaps xmm1,xmm0 \
__asm movaps xmm2,xmm0 \
__asm movups xmm3,[edx+ebx] \
__asm movups xmm4,[edx+ebx+16] \
__asm ALUOP##ps xmm1,xmm3 \
__asm ALUOP##ps xmm2,xmm4 \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpNA \
__asm done: \
__asm mov edx,SRC \
__asm mov edi,DST \
KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \
KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )
// operate on two float arrays
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \
int pre,post; \
KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \
__asm and eax,15 \
__asm jne lpNA \
__asm jmp lpA \
__asm align 16 \
__asm lpA: \
__asm movaps xmm1,[edx+ebx] \
__asm movaps xmm2,[edx+ebx+16] \
__asm ALUOP##ps xmm1,[esi+ebx] \
__asm ALUOP##ps xmm2,[esi+ebx+16] \
__asm prefetchnta [edx+ebx+64] \
__asm prefetchnta [esi+ebx+64] \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpA \
__asm jmp done \
__asm align 16 \
__asm lpNA: \
__asm movups xmm1,[edx+ebx] \
__asm movups xmm2,[edx+ebx+16] \
__asm movups xmm3,[esi+ebx] \
__asm movups xmm4,[esi+ebx+16] \
__asm prefetchnta [edx+ebx+64] \
__asm prefetchnta [esi+ebx+64] \
__asm ALUOP##ps xmm1,xmm3 \
__asm ALUOP##ps xmm2,xmm4 \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpNA \
__asm done: \
__asm mov edx,SRC0 \
__asm mov esi,SRC1 \
__asm mov edi,DST \
KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \
KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)
#define JOINTQUAT_SIZE (7*4)
#define JOINTMAT_SIZE (4*3*4)
#define JOINTWEIGHT_SIZE (4*4)
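// JOINTQUAT_SIZE covers an idJointQuat ( idQuat q; idVec3 t; ) = 7 floats;
// JOINTMAT_SIZE an idJointMat ( 3 rows of 4 floats ) = 48 bytes.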
#define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );
ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );
ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f );
ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f );
ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );
ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f );
ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f );
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );
ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );
/*
============
SSE_InvSqrt
============
*/
float SSE_InvSqrt( float x ) {
float y;
__asm {
movss xmm0, x
rsqrtss xmm1, xmm0
mulss xmm0, xmm1
mulss xmm0, xmm1
subss xmm0, SIMD_SP_rsqrt_c0
mulss xmm1, SIMD_SP_rsqrt_c1
mulss xmm0, xmm1
movss y, xmm0
}
return y;
}
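// rsqrtss gives only a ~12-bit estimate; the code above applies one
// Newton-Raphson step, y' = y * 0.5f * ( 3.0f - x*y*y ), computed as
// ( x*y*y - 3.0f ) * ( y * -0.5f ), hence SIMD_SP_rsqrt_c0 = 3.0f and
// SIMD_SP_rsqrt_c1 = -0.5f.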
/*
============
SSE_InvSqrt4
============
*/
void SSE_InvSqrt4( float x[4] ) {
__asm {
mov edi, x
movaps xmm0, [edi]
rsqrtps xmm1, xmm0
mulps xmm0, xmm1
mulps xmm0, xmm1
subps xmm0, SIMD_SP_rsqrt_c0
mulps xmm1, SIMD_SP_rsqrt_c1
mulps xmm0, xmm1
movaps [edi], xmm0
}
}
/*
============
SSE_SinZeroHalfPI
The angle must be between zero and half PI.
============
*/
float SSE_SinZeroHalfPI( float a ) {
#if 1
float t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
__asm {
movss xmm0, a
movss xmm1, xmm0
mulss xmm1, xmm1
movss xmm2, SIMD_SP_sin_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
movss t, xmm2
}
return t;
#else
float s, t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
s = a * a;
t = -2.39e-08f;
t *= s;
t += 2.7526e-06f;
t *= s;
t += -1.98409e-04f;
t *= s;
t += 8.3333315e-03f;
t *= s;
t += -1.666666664e-01f;
t *= s;
t += 1.0f;
t *= a;
return t;
#endif
}
/*
============
SSE_Sin4ZeroHalfPI
The angle must be between zero and half PI.
============
*/
void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
__asm {
mov edi, a
mov esi, s
movaps xmm0, [edi]
movaps xmm1, xmm0
mulps xmm1, xmm1
movaps xmm2, SIMD_SP_sin_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
movaps [esi], xmm2
}
}
/*
============
SSE_Sin
============
*/
float SSE_Sin( float a ) {
#if 1
float t;
__asm {
movss xmm1, a
movss xmm2, xmm1
movss xmm3, xmm1
mulss xmm2, SIMD_SP_oneOverTwoPI
cvttss2si ecx, xmm2
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
cvtsi2ss xmm2, ecx
subss xmm2, xmm3
mulss xmm2, SIMD_SP_twoPI
subss xmm1, xmm2
movss xmm0, SIMD_SP_PI // xmm0 = PI
subss xmm0, xmm1 // xmm0 = PI - a
movss xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movss xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movss xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movss xmm1, xmm0
mulss xmm1, xmm1
movss xmm2, SIMD_SP_sin_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
movss t, xmm2
}
return t;
#else
float s, t;
if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
}
a = idMath::PI - a;
if ( fabs( a ) >= idMath::HALF_PI ) {
a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
}
s = a * a;
t = -2.39e-08f;
t *= s;
t += 2.7526e-06f;
t *= s;
t += -1.98409e-04f;
t *= s;
t += 8.3333315e-03f;
t *= s;
t += -1.666666664e-01f;
t *= s;
t += 1.0f;
t *= a;
return t;
#endif
}
/*
============
SSE_Sin4
============
*/
void SSE_Sin4( float a[4], float s[4] ) {
__asm {
mov edi, a
mov esi, s
movaps xmm1, [edi]
movaps xmm2, xmm1
mulps xmm2, SIMD_SP_oneOverTwoPI
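// SSE1 has no full-width packed float-to-int truncation (cvttps2pi only
// handles two lanes via MMX), so each lane is truncated and converted back
// through cvttss2si/cvtsi2ss, rotating lanes with shufps in between.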
movhlps xmm3, xmm2
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
movaps xmm3, xmm1
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
subps xmm2, xmm3
mulps xmm2, SIMD_SP_twoPI
subps xmm1, xmm2
movaps xmm0, SIMD_SP_PI // xmm0 = PI
subps xmm0, xmm1 // xmm0 = PI - a
movaps xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movaps xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movaps xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movaps xmm1, xmm0
mulps xmm1, xmm1
movaps xmm2, SIMD_SP_sin_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
movaps [esi], xmm2
}
}
/*
============
SSE_CosZeroHalfPI
The angle must be between zero and half PI.
============
*/
float SSE_CosZeroHalfPI( float a ) {
#if 1
float t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
__asm {
movss xmm0, a
mulss xmm0, xmm0
movss xmm1, SIMD_SP_cos_c0
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c1
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c2
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c3
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c4
mulss xmm1, xmm0
addss xmm1, SIMD_SP_one
movss t, xmm1
}
return t;
#else
float s, t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
s = a * a;
t = -2.605e-07f;
t *= s;
t += 2.47609e-05f;
t *= s;
t += -1.3888397e-03f;
t *= s;
t += 4.16666418e-02f;
t *= s;
t += -4.999999963e-01f;
t *= s;
t += 1.0f;
return t;
#endif
}
/*
============
SSE_Cos4ZeroHalfPI
The angle must be between zero and half PI.
============
*/
void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
__asm {
mov edi, a
mov esi, c
movaps xmm0, [edi]
mulps xmm0, xmm0
movaps xmm1, SIMD_SP_cos_c0
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c1
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c2
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c3
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c4
mulps xmm1, xmm0
addps xmm1, SIMD_SP_one
movaps [esi], xmm1
}
}
/*
============
SSE_Cos
============
*/
float SSE_Cos( float a ) {
#if 1
float t;
__asm {
movss xmm1, a
movss xmm2, xmm1
movss xmm3, xmm1
mulss xmm2, SIMD_SP_oneOverTwoPI
cvttss2si ecx, xmm2
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
cvtsi2ss xmm2, ecx
subss xmm2, xmm3
mulss xmm2, SIMD_SP_twoPI
subss xmm1, xmm2
movss xmm0, SIMD_SP_PI // xmm0 = PI
subss xmm0, xmm1 // xmm0 = PI - a
movss xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movss xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movss xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
mulss xmm0, xmm0
movss xmm1, SIMD_SP_cos_c0
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c1
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c2
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c3
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c4
mulss xmm1, xmm0
addss xmm1, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask
xorps xmm1, xmm2
movss t, xmm1
}
return t;
#else
float d, s, t;
if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
}
a = idMath::PI - a;
if ( fabs( a ) >= idMath::HALF_PI ) {
a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
d = 1.0f;
} else {
d = -1.0f;
}
s = a * a;
t = -2.605e-07f;
t *= s;
t += 2.47609e-05f;
t *= s;
t += -1.3888397e-03f;
t *= s;
t += 4.16666418e-02f;
t *= s;
t += -4.999999963e-01f;
t *= s;
t += 1.0f;
t *= d;
return t;
#endif
}
/*
============
SSE_Cos4
============
*/
void SSE_Cos4( float a[4], float c[4] ) {
__asm {
mov edi, a
mov esi, c
movaps xmm1, [edi]
movaps xmm2, xmm1
mulps xmm2, SIMD_SP_oneOverTwoPI
movhlps xmm3, xmm2
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
movaps xmm3, xmm1
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
subps xmm2, xmm3
mulps xmm2, SIMD_SP_twoPI
subps xmm1, xmm2
movaps xmm0, SIMD_SP_PI // xmm0 = PI
subps xmm0, xmm1 // xmm0 = PI - a
movaps xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movaps xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movaps xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
mulps xmm0, xmm0
movaps xmm1, SIMD_SP_cos_c0
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c1
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c2
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c3
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c4
mulps xmm1, xmm0
addps xmm1, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask
xorps xmm1, xmm2
movaps [esi], xmm1
}
}
/*
============
SSE_SinCos
============
*/
void SSE_SinCos( float a, float &s, float &c ) {
__asm {
mov edi, s
mov esi, c
movss xmm1, a
movss xmm2, xmm1
movss xmm3, xmm1
mulss xmm2, SIMD_SP_oneOverTwoPI
cvttss2si ecx, xmm2
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
cvtsi2ss xmm2, ecx
subss xmm2, xmm3
mulss xmm2, SIMD_SP_twoPI
subss xmm1, xmm2
movss xmm0, SIMD_SP_PI // xmm0 = PI
subss xmm0, xmm1 // xmm0 = PI - a
movss xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movss xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movss xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movss xmm1, xmm0
mulss xmm1, xmm1
movss xmm3, SIMD_SP_sin_c0
movss xmm4, SIMD_SP_cos_c0
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c1
addss xmm4, SIMD_SP_cos_c1
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c2
addss xmm4, SIMD_SP_cos_c2
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c3
addss xmm4, SIMD_SP_cos_c3
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c4
addss xmm4, SIMD_SP_cos_c4
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_one
addss xmm4, SIMD_SP_one
mulss xmm3, xmm0
xorps xmm2, SIMD_SP_signBitMask
xorps xmm4, xmm2
movss [edi], xmm3
movss [esi], xmm4
}
}
/*
============
SSE_SinCos4
============
*/
void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
__asm {
mov eax, a
mov edi, s
mov esi, c
movaps xmm1, [eax]
movaps xmm2, xmm1
mulps xmm2, SIMD_SP_oneOverTwoPI
movhlps xmm3, xmm2
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
movaps xmm3, xmm1
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
subps xmm2, xmm3
mulps xmm2, SIMD_SP_twoPI
subps xmm1, xmm2
movaps xmm0, SIMD_SP_PI // xmm0 = PI
subps xmm0, xmm1 // xmm0 = PI - a
movaps xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movaps xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movaps xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movaps xmm1, xmm0
mulps xmm1, xmm1
movaps xmm3, SIMD_SP_sin_c0
movaps xmm4, SIMD_SP_cos_c0
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c1
addps xmm4, SIMD_SP_cos_c1
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c2
addps xmm4, SIMD_SP_cos_c2
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c3
addps xmm4, SIMD_SP_cos_c3
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c4
addps xmm4, SIMD_SP_cos_c4
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_one
addps xmm4, SIMD_SP_one
mulps xmm3, xmm0
xorps xmm2, SIMD_SP_signBitMask
xorps xmm4, xmm2
movaps [edi], xmm3
movaps [esi], xmm4
}
}
/*
============
SSE_ATanPositive
Both 'x' and 'y' must be positive.
============
*/
float SSE_ATanPositive( float y, float x ) {
#if 1
float t;
assert( y >= 0.0f && x >= 0.0f );
__asm {
movss xmm0, x
movss xmm3, xmm0
movss xmm1, y
minss xmm0, xmm1
maxss xmm1, xmm3
cmpeqss xmm3, xmm0
rcpss xmm2, xmm1
mulss xmm1, xmm2
mulss xmm1, xmm2
addss xmm2, xmm2
subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulss xmm0, xmm2 // xmm0 = x / y or y / x
movss xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
movss xmm1, xmm0
mulss xmm1, xmm1 // xmm1 = s
movss xmm2, SIMD_SP_atan_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c5
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c6
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c7
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
addss xmm2, xmm3
movss t, xmm2
}
return t;
#else
float a, d, s, t;
assert( y >= 0.0f && x >= 0.0f );
if ( y > x ) {
a = -x / y;
d = idMath::HALF_PI;
} else {
a = y / x;
d = 0.0f;
}
s = a * a;
t = 0.0028662257f;
t *= s;
t += -0.0161657367f;
t *= s;
t += 0.0429096138f;
t *= s;
t += -0.0752896400f;
t *= s;
t += 0.1065626393f;
t *= s;
t += -0.1420889944f;
t *= s;
t += 0.1999355085f;
t *= s;
t += -0.3333314528f;
t *= s;
t += 1.0f;
t *= a;
t += d;
return t;
#endif
}
/*
============
SSE_ATan4Positive
Both 'x' and 'y' must be positive.
============
*/
void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
__asm {
mov esi, x
mov edi, y
mov edx, at
movaps xmm0, [esi]
movaps xmm3, xmm0
movaps xmm1, [edi]
minps xmm0, xmm1
maxps xmm1, xmm3
cmpeqps xmm3, xmm0
rcpps xmm2, xmm1
mulps xmm1, xmm2
mulps xmm1, xmm2
addps xmm2, xmm2
subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulps xmm0, xmm2 // xmm0 = x / y or y / x
movaps xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
movaps xmm1, xmm0
mulps xmm1, xmm1 // xmm1 = s
movaps xmm2, SIMD_SP_atan_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c5
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c6
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c7
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
addps xmm2, xmm3
movaps [edx], xmm2
}
}
/*
============
SSE_ATan
============
*/
float SSE_ATan( float y, float x ) {
#if 1
float t;
__asm {
movss xmm0, x
movss xmm3, xmm0
movss xmm4, xmm0
andps xmm0, SIMD_SP_absMask
movss xmm1, y
xorps xmm4, xmm1
andps xmm1, SIMD_SP_absMask
andps xmm4, SIMD_SP_signBitMask
minss xmm0, xmm1
maxss xmm1, xmm3
cmpeqss xmm3, xmm0
rcpss xmm2, xmm1
mulss xmm1, xmm2
mulss xmm1, xmm2
addss xmm2, xmm2
subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulss xmm0, xmm2 // xmm0 = x / y or y / x
xorps xmm0, xmm4
movss xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
movss xmm1, xmm0
mulss xmm1, xmm1 // xmm1 = s
movss xmm2, SIMD_SP_atan_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c5
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c6
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c7
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
addss xmm2, xmm3
movss t, xmm2
}
return t;
#else
float a, d, s, t;
if ( fabs( y ) > fabs( x ) ) {
a = -x / y;
d = idMath::HALF_PI;
*((unsigned long *)&d) ^= ( *((unsigned long *)&x) ^ *((unsigned long *)&y) ) & (1<<31);
} else {
a = y / x;
d = 0.0f;
}
s = a * a;
t = 0.0028662257f;
t *= s;
t += -0.0161657367f;
t *= s;
t += 0.0429096138f;
t *= s;
t += -0.0752896400f;
t *= s;
t += 0.1065626393f;
t *= s;
t += -0.1420889944f;
t *= s;
t += 0.1999355085f;
t *= s;
t += -0.3333314528f;
t *= s;
t += 1.0f;
t *= a;
t += d;
return t;
#endif
}
/*
============
SSE_ATan4
============
*/
void SSE_ATan4( float y[4], float x[4], float at[4] ) {
__asm {
mov esi, x
mov edi, y
mov edx, at
movaps xmm0, [esi]
movaps xmm3, xmm0
movaps xmm4, xmm0
andps xmm0, SIMD_SP_absMask
movaps xmm1, [edi]
xorps xmm4, xmm1
andps xmm1, SIMD_SP_absMask
andps xmm4, SIMD_SP_signBitMask
minps xmm0, xmm1
maxps xmm1, xmm3
cmpeqps xmm3, xmm0
rcpps xmm2, xmm1
mulps xmm1, xmm2
mulps xmm1, xmm2
addps xmm2, xmm2
subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulps xmm0, xmm2 // xmm0 = x / y or y / x
xorps xmm0, xmm4
movaps xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
movaps xmm1, xmm0
mulps xmm1, xmm1 // xmm1 = s
movaps xmm2, SIMD_SP_atan_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c5
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c6
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c7
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
addps xmm2, xmm3
movaps [edx], xmm2
}
}
/*
============
SSE_TestTrigonometry
============
*/
void SSE_TestTrigonometry( void ) {
int i;
float a, s1, s2, c1, c2;
for ( i = 0; i < 100; i++ ) {
a = i * idMath::HALF_PI / 100.0f;
s1 = sin( a );
s2 = SSE_SinZeroHalfPI( a );
if ( fabs( s1 - s2 ) > 1e-7f ) {
assert( 0 );
}
c1 = cos( a );
c2 = SSE_CosZeroHalfPI( a );
if ( fabs( c1 - c2 ) > 1e-7f ) {
assert( 0 );
}
}
for ( i = -200; i < 200; i++ ) {
a = i * idMath::TWO_PI / 100.0f;
s1 = sin( a );
s2 = SSE_Sin( a );
if ( fabs( s1 - s2 ) > 1e-6f ) {
assert( 0 );
}
c1 = cos( a );
c2 = SSE_Cos( a );
if ( fabs( c1 - c2 ) > 1e-6f ) {
assert( 0 );
}
SSE_SinCos( a, s2, c2 );
if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
assert( 0 );
}
}
}
/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName( void ) const {
return "MMX & SSE";
}
/*
============
idSIMD_SSE::Add
dst[i] = constant + src[i];
============
*/
void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( add, dst, src, constant, count )
}
/*
============
idSIMD_SSE::Add
dst[i] = src0[i] + src1[i];
============
*/
void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) {
KFLOAT_AA( add, dst, src0, src1, count )
}
/*
============
idSIMD_SSE::Sub
dst[i] = constant - src[i];
============
*/
void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( sub, dst, src, constant, count )
}
/*
============
idSIMD_SSE::Sub
dst[i] = src0[i] - src1[i];
============
*/
void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) {
KFLOAT_AA( sub, dst, src0, src1, count )
}
/*
============
idSIMD_SSE::Mul
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( mul, dst, src, constant, count )
}
/*
============
idSIMD_SSE::Mul
dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) {
KFLOAT_AA( mul, dst, src0, src1, count )
}
/*
============
idSIMD_SSE::Div
dst[i] = constant / src[i];
============
*/
void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
int pre, post;
// 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
__asm
{
movss xmm1,constant
shufps xmm1,xmm1,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [edx+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
mulps xmm4,xmm1
mulps xmm5,xmm1
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [edx+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
mulps xmm4,xmm1
mulps xmm5,xmm1
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpNA
done:
mov edx,src
mov edi,dst
KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
}
}
/*
============
idSIMD_SSE::Div
dst[i] = src0[i] / src1[i];
============
*/
void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
int pre,post;
// 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
__asm
{
KFLOATINITDSS( dst, src0, src1, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[esi+ebx]
movaps xmm3,[esi+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [esi+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
mulps xmm4,[edx+ebx]
mulps xmm5,[edx+ebx+16]
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[esi+ebx]
movups xmm3,[esi+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [esi+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
mulps xmm4,xmm2
mulps xmm5,xmm3
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpNA
done:
mov edx,src0
mov esi,src1
mov edi,dst
KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
}
}
/*
============
Simd_MulAdd
assumes count >= 7
============
*/
static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
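// Strategy: handle the elements before the first aligned boundary and any
// tail elements with x87 (fld/fmul/fadd), and run the aligned bulk of the
// array through movaps (16-byte aligned) or movlps/movhps (8-byte) SSE loops.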
__asm mov esi, dst
__asm mov edi, src
__asm mov eax, count
__asm shl eax, 2
__asm mov ecx, esi
__asm mov edx, eax
__asm or ecx, edi
__asm fld constant
__asm and ecx, 15
__asm jz SimdMulAdd16
__asm and ecx, 3
__asm jnz SimdMulAdd8
__asm mov ecx, esi
__asm xor ecx, edi
__asm and ecx, 15
__asm jnz MulAdd8
__asm mov ecx, esi
__asm and ecx, 15
__asm neg ecx
__asm add ecx, 16
__asm sub eax, ecx
__asm add edi, ecx
__asm add esi, ecx
__asm neg ecx
__asm mov edx, eax
__asm loopPreMulAdd16:
__asm fld st
__asm fmul dword ptr [edi+ecx]
__asm fadd dword ptr [esi+ecx]
__asm fstp dword ptr [esi+ecx]
__asm add ecx, 4
__asm jl loopPreMulAdd16
__asm SimdMulAdd16:
__asm and eax, ~15
__asm movss xmm1, constant
__asm shufps xmm1, xmm1, 0x00
__asm add esi, eax
__asm add edi, eax
__asm neg eax
__asm align 16
__asm loopMulAdd16:
__asm movaps xmm0, [edi+eax]
__asm mulps xmm0, xmm1
__asm addps xmm0, [esi+eax]
__asm movaps [esi+eax], xmm0
__asm add eax, 16
__asm jl loopMulAdd16
__asm jmp postMulAdd
__asm MulAdd8:
__asm mov ecx, esi
__asm and ecx, 7
__asm jz SimdMulAdd8
__asm sub eax, ecx
__asm add esi, ecx
__asm add edi, ecx
__asm neg ecx
__asm mov edx, eax
__asm loopPreMulAdd8:
__asm fld st
__asm fmul dword ptr [edi+ecx]
__asm fadd dword ptr [esi+ecx]
__asm fstp dword ptr [esi+ecx]
__asm add ecx, 4
__asm jl loopPreMulAdd8
__asm SimdMulAdd8:
__asm and eax, ~15
__asm movss xmm1, constant
__asm shufps xmm1, xmm1, 0x00
__asm add esi, eax
__asm add edi, eax
__asm neg eax
__asm align 16
__asm loopMulAdd8:
__asm movlps xmm0, [edi+eax]
__asm movhps xmm0, [edi+eax+8]
__asm mulps xmm0, xmm1
__asm movlps xmm2, [esi+eax]
__asm movhps xmm2, [esi+eax+8]
__asm addps xmm0, xmm2
__asm movlps [esi+eax], xmm0
__asm movhps [esi+eax+8], xmm0
__asm add eax, 16
__asm jl loopMulAdd8
__asm jmp postMulAdd
__asm postMulAdd:
__asm and edx, 15
__asm jz MulAddDone
__asm add esi, edx
__asm add edi, edx
__asm neg edx
__asm loopPostMulAdd:
__asm fld st
__asm fmul dword ptr [edi+edx]
__asm fadd dword ptr [esi+edx]
__asm fstp dword ptr [esi+edx]
__asm add edx, 4
__asm jl loopPostMulAdd
__asm MulAddDone:
__asm fstp st
}
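/*
	Simd_MulAdd dispatches on the combined alignment of dst and src: when both
	share the same 16 byte phase an x87 head loop brings them onto a boundary
	and the main loop uses movaps, otherwise it falls back to unaligned-safe
	movlps/movhps pairs, with x87 loops mopping up the head and tail. A sketch
	of the head computation (illustrative helper, not used by the engine):
*/
static int Simd_RefBytesToAlign16( const void *ptr ) {
	// bytes needed to advance a misaligned ptr to the next 16 byte boundary,
	// equivalent to the "and ecx, 15 / neg ecx / add ecx, 16" sequence above
	return ( 16 - ( ((int)ptr) & 15 ) ) & 15;
}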
#define MULADD_FEW( OPER ) \
switch( count ) { \
case 0: \
return; \
case 1: \
dst[0] OPER c * src[0]; \
return; \
case 2: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
return; \
case 3: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
return; \
case 4: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
return; \
case 5: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; \
return; \
case 6: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
return; \
case 7: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
return; \
case 8: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
return; \
case 9: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; \
return; \
case 10: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
return; \
case 11: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
return; \
}
/*
============
idSIMD_SSE::MulAdd
dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
float c = constant;
MULADD_FEW( += )
Simd_MulAdd( dst, constant, src, count );
}
/*
============
idSIMD_SSE::MulAdd
dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
for ( int i = 0; i < count; i++ ) {
dst[i] += src0[i] * src1[i];
}
}
/*
============
idSIMD_SSE::MulSub
dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
float c = constant;
MULADD_FEW( -= )
Simd_MulAdd( dst, -constant, src, count );
}
/*
============
idSIMD_SSE::MulSub
dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
for ( int i = 0; i < count; i++ ) {
dst[i] -= src0[i] * src1[i];
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
__asm
{
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
jz done4
imul eax, 12
add esi, eax
neg eax
loop4:
movlps xmm1, [esi+eax+ 0]
movlps xmm2, [esi+eax+ 8]
movlps xmm3, [esi+eax+16]
movhps xmm1, [esi+eax+24]
movhps xmm2, [esi+eax+32]
movhps xmm3, [esi+eax+40]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
add ecx, 16
add eax, 4*12
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loop4
done4:
and edx, 3
jz done1
loop1:
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
add ecx, 4
addss xmm0, xmm1
add eax, 12
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loop1
done1:
}
}
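/*
	The loop4 body above transposes four packed idVec3 (array-of-structures)
	into three SoA registers, ( x0 x1 x2 x3 ), ( y0 y1 y2 y3 ) and
	( z0 z1 z2 z3 ), so a single mulps/addps chain yields four dot products
	per iteration. Scalar reference for one such iteration (illustrative
	helper only, not used by the engine):
*/
static void Simd_RefDot4Vec3( float dst4[4], const idVec3 &c, const idVec3 *v ) {
	for ( int i = 0; i < 4; i++ ) {
		dst4[i] = c.x * v[i].x + c.y * v[i].y + c.z * v[i].z;
	}
}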
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
__asm {
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm5, [edi+0]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
jz startVert1
imul eax, 16
add esi, eax
neg eax
loopVert4:
movlps xmm1, [esi+eax+ 0]
movlps xmm3, [esi+eax+ 8]
movhps xmm1, [esi+eax+16]
movhps xmm3, [esi+eax+24]
movlps xmm2, [esi+eax+32]
movlps xmm4, [esi+eax+40]
movhps xmm2, [esi+eax+48]
movhps xmm4, [esi+eax+56]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm2, xmm3
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
add ecx, 16
add eax, 4*16
mulps xmm0, xmm5
mulps xmm1, xmm6
mulps xmm2, xmm7
addps xmm0, xmm3
addps xmm0, xmm1
addps xmm0, xmm2
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
startVert1:
and edx, 3
jz done
loopVert1:
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm5
mulss xmm1, xmm6
mulss xmm2, xmm7
addss xmm0, [esi+eax+12]
add ecx, 4
addss xmm0, xmm1
add eax, 16
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
done:
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
// 0, 1, 2
// 3, 4, 5
// 6, 7, 8
// 9, 10, 11
__asm {
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
jz startVert1
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert4:
movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
movaps xmm1, xmm0 // 3, X, 0, 1
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
add ecx, 16
add eax, 4*DRAWVERT_SIZE
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
startVert1:
and edx, 3
jz done
loopVert1:
movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
add ecx, 4
addss xmm0, xmm1
add eax, DRAWVERT_SIZE
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
done:
}
}
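/*
	Unlike the idVec3 array case, idDrawVert xyz triples sit DRAWVERT_SIZE
	bytes apart, so the loopVert4 body above first gathers twelve strided
	floats with movss/movhps before transposing them. The per-vertex
	addressing it uses, written out in C (illustrative helper only):
*/
static const float *Simd_RefDrawVertXyz( const idDrawVert *base, int i ) {
	// address of vertex i's xyz triple, matching the
	// [esi+eax+i*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] operands above
	return (const float *)( (const char *)base + i * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET );
}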
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i] + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
__asm
{
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
jz done4
imul eax, 12
add esi, eax
neg eax
loop4:
movlps xmm1, [esi+eax+ 0]
movlps xmm2, [esi+eax+ 8]
movlps xmm3, [esi+eax+16]
movhps xmm1, [esi+eax+24]
movhps xmm2, [esi+eax+32]
movhps xmm3, [esi+eax+40]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
add ecx, 16
add eax, 4*12
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loop4
done4:
and edx, 3
jz done1
loop1:
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
addss xmm0, xmm7
add ecx, 4
addss xmm0, xmm1
add eax, 12
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loop1
done1:
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
#define SINGLE_OP(SRC, DEST) \
__asm movlps xmm0,[SRC] \
__asm movlps xmm1,[SRC+8] \
__asm mulps xmm0,xmm4 \
__asm mulps xmm1,xmm5 \
__asm addps xmm0,xmm1 \
__asm movaps xmm1,xmm0 \
__asm shufps xmm1,xmm1,SHUFFLEPS(1,1,1,1) \
__asm addss xmm0,xmm1 \
__asm movss [DEST],xmm0 \
__asm add SRC,16 \
__asm add DEST,4
#define DUAL_OP(SRC, DEST) \
__asm movlps xmm0,[SRC] \
__asm movlps xmm1,[SRC+8] \
__asm movhps xmm0,[SRC+16] \
__asm movhps xmm1,[SRC+24] \
__asm mulps xmm0,xmm4 \
__asm mulps xmm1,xmm5 \
__asm addps xmm0,xmm1 \
__asm shufps xmm1,xmm0,SHUFFLEPS(2,0,1,0) \
__asm shufps xmm0,xmm0,SHUFFLEPS(3,1,2,0) \
__asm addps xmm0,xmm1 \
__asm movhps [DEST],xmm0 \
__asm add SRC,32 \
__asm add DEST,8
__asm {
mov edx, dst
mov eax, src
mov ebx, constant
mov ecx, count
movlps xmm4, [ebx]
shufps xmm4, xmm4, SHUFFLEPS(1,0,1,0)
movlps xmm5, [ebx+8]
shufps xmm5, xmm5, SHUFFLEPS(1,0,1,0)
xorps xmm0, xmm0
xorps xmm1, xmm1
_lpAlignDest:
test edx, 0x0f
jz _destAligned
SINGLE_OP(eax,edx)
dec ecx
jnz _lpAlignDest
jmp _vpExit
_destAligned:
push ecx
cmp ecx, 4
jl _post
and ecx, ~3
shl ecx, 2
lea eax, [eax+ecx*4]
add edx, ecx
neg ecx
movlps xmm0, [eax+ecx*4]
movhps xmm0, [eax+ecx*4+16]
movlps xmm2, [eax+ecx*4+32]
movhps xmm2, [eax+ecx*4+48]
jmp _lpStart
align 16
_lp:
prefetchnta [eax+ecx*4+128]
addps xmm1, xmm0
movlps xmm0, [eax+ecx*4]
movhps xmm0, [eax+ecx*4+16]
movlps xmm2, [eax+ecx*4+32]
movhps xmm2, [eax+ecx*4+48]
movaps [edx+ecx-16],xmm1
_lpStart:
movlps xmm1, [eax+ecx*4+8]
movhps xmm1, [eax+ecx*4+24]
movlps xmm3, [eax+ecx*4+40]
movhps xmm3, [eax+ecx*4+56]
add ecx, 16
mulps xmm1, xmm5
mulps xmm2, xmm4
mulps xmm3, xmm5
addps xmm2, xmm3 // y3+w3 x3+z3 y2+w2 x2+z2
mulps xmm0, xmm4
addps xmm0, xmm1 // y1+w1 x1+z1 y0+w0 x0+z0
movaps xmm1, xmm0
shufps xmm0, xmm2, SHUFFLEPS(2,0,2,0) // x3+z3 x2+z2 x1+z1 x0+z0
shufps xmm1, xmm2, SHUFFLEPS(3,1,3,1) // y3+w3 y2+w2 y1+w1 y0+w0
js _lp
addps xmm1, xmm0
movaps [edx+ecx-16], xmm1
_post:
pop ecx
and ecx, 0x3
cmp ecx, 2
jl _post1
DUAL_OP(eax,edx)
sub ecx, 2
_post1:
cmp ecx, 1
jne _vpExit
SINGLE_OP(eax,edx)
_vpExit:
}
#undef DUAL_OP
#undef SINGLE_OP
}
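/*
	The plane/plane version never fully transposes: xmm4 holds ( a b a b ) and
	xmm5 holds ( c d c d ), so each mulps/addps pair produces per-plane lane
	sums ( x*a + z*c ) and ( y*b + w*d ), and one final shuffle+add folds the
	two lanes into the full four-term dot. Scalar reference for a single plane
	(illustrative helper only):
*/
static float Simd_RefPlanePlaneDot( const idPlane &p0, const idPlane &p1 ) {
	// same pairing as SINGLE_OP/DUAL_OP above
	return ( p0[0] * p1[0] + p0[2] * p1[2] ) + ( p0[1] * p1[1] + p0[3] * p1[3] );
}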
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
// 0, 1, 2
// 3, 4, 5
// 6, 7, 8
// 9, 10, 11
__asm {
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
jz startVert1
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert4:
movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
movaps xmm1, xmm0 // 3, X, 0, 1
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
add ecx, 16
add eax, 4*DRAWVERT_SIZE
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
startVert1:
and edx, 3
jz done
loopVert1:
movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
addss xmm0, xmm7
add ecx, 4
addss xmm0, xmm1
add eax, DRAWVERT_SIZE
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
done:
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
__asm
{
mov eax, count
mov edi, src0
mov edx, eax
mov esi, src1
mov ecx, dst
and eax, ~3
jz done4
imul eax, 12
add edi, eax
add esi, eax
neg eax
loop4:
movlps xmm0, [esi+eax] // 0, 1, X, X
movlps xmm3, [edi+eax] // 0, 1, X, X
movlps xmm1, [esi+eax+8] // 2, 3, X, X
movlps xmm4, [edi+eax+8] // 2, 3, X, X
movhps xmm0, [esi+eax+24] // 0, 1, 6, 7
movhps xmm3, [edi+eax+24] // 0, 1, 6, 7
movhps xmm1, [esi+eax+32] // 2, 3, 8, 9
movhps xmm4, [edi+eax+32] // 2, 3, 8, 9
movlps xmm2, [esi+eax+16] // 4, 5, X, X
movlps xmm5, [edi+eax+16] // 4, 5, X, X
movhps xmm2, [esi+eax+40] // 4, 5, 10, 11
movhps xmm5, [edi+eax+40] // 4, 5, 10, 11
add ecx, 16
add eax, 48
mulps xmm0, xmm3
mulps xmm1, xmm4
mulps xmm2, xmm5
movaps xmm7, xmm0
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) // 0, 6, 3, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 ) // 1, 7, 4, 10
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) // 2, 8, 5, 11
addps xmm7, xmm0
addps xmm7, xmm1
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm7
movhps [ecx-16+8], xmm7
jl loop4
done4:
and edx, 3
jz done1
loop1:
movss xmm0, [esi+eax+0]
movss xmm3, [edi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm4, [edi+eax+4]
movss xmm2, [esi+eax+8]
movss xmm5, [edi+eax+8]
mulss xmm0, xmm3
mulss xmm1, xmm4
mulss xmm2, xmm5
add ecx, 4
addss xmm0, xmm1
add eax, 12
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loop1
done1:
}
}
/*
============
idSIMD_SSE::Dot
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
============
*/
void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
switch( count ) {
case 0:
dot = 0.0f;
return;
case 1:
dot = src1[0] * src2[0];
return;
case 2:
dot = src1[0] * src2[0] + src1[1] * src2[1];
return;
case 3:
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
return;
default:
__asm {
mov ecx, src1
mov edx, src2
mov eax, ecx
or eax, edx
and eax, 15
jz alignedDot
// unaligned
mov eax, count
shr eax, 2
shl eax, 4
add ecx, eax
add edx, eax
neg eax
movups xmm0, [ecx+eax]
movups xmm1, [edx+eax]
mulps xmm0, xmm1
add eax, 16
jz doneDot
loopUnalignedDot:
movups xmm1, [ecx+eax]
movups xmm2, [edx+eax]
mulps xmm1, xmm2
addps xmm0, xmm1
add eax, 16
jl loopUnalignedDot
jmp doneDot
// aligned
alignedDot:
mov eax, count
shr eax, 2
shl eax, 4
add ecx, eax
add edx, eax
neg eax
movaps xmm0, [ecx+eax]
movaps xmm1, [edx+eax]
mulps xmm0, xmm1
add eax, 16
jz doneDot
loopAlignedDot:
movaps xmm1, [ecx+eax]
movaps xmm2, [edx+eax]
mulps xmm1, xmm2
addps xmm0, xmm1
add eax, 16
jl loopAlignedDot
doneDot:
}
switch( count & 3 ) {
case 1:
__asm {
movss xmm1, [ecx]
movss xmm2, [edx]
mulss xmm1, xmm2
addss xmm0, xmm1
}
break;
case 2:
__asm {
xorps xmm2, xmm2
movlps xmm1, [ecx]
movlps xmm2, [edx]
mulps xmm1, xmm2
addps xmm0, xmm1
}
break;
case 3:
__asm {
movss xmm1, [ecx]
movhps xmm1, [ecx+4]
movss xmm2, [edx]
movhps xmm2, [edx+4]
mulps xmm1, xmm2
addps xmm0, xmm1
}
break;
}
__asm {
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
mov eax, dot
movss [eax], xmm0
}
return;
}
}
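/*
	The epilogue above is a standard horizontal sum of the four partial sums
	in xmm0: movhlps/addps folds ( s0 s1 s2 s3 ) into ( s0+s2, s1+s3 ) and a
	shufps/addss adds the remaining pair. Scalar equivalent (illustrative
	helper only):
*/
static float Simd_RefHorizontalSum4( const float s[4] ) {
	return ( s[0] + s[2] ) + ( s[1] + s[3] );
}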
//
// cmpeqps == Equal
// cmpneqps != Not Equal
// cmpltps < Less Than
// cmpnltps >= Not Less Than
// cmpnleps > Not Less Or Equal
//
#define FLIP not al
#define NOFLIP
#define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
int i, cnt, pre, post; \
float *aligned; \
\
/* if the float array is not aligned on a 4 byte boundary */ \
if ( ((int) SRC0) & 3 ) { \
/* unaligned memory access */ \
pre = 0; \
cnt = COUNT >> 2; \
post = COUNT - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, SRC0 \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm mov ecx, 0x01010101 \
__asm loopNA: \
__asm movups xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, ecx \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopNA \
__asm pop ebx \
} \
else { \
/* aligned memory access */ \
aligned = (float *) ((((int) SRC0) + 15) & ~15); \
if ( (int)aligned > ((int)src0) + COUNT ) { \
pre = COUNT; \
post = 0; \
} \
else { \
pre = aligned - SRC0; \
cnt = (COUNT - pre) >> 2; \
post = COUNT - pre - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, aligned \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm add edi, pre \
__asm mov ecx, 0x01010101 \
__asm loopA: \
__asm movaps xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, ecx \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopA \
__asm pop ebx \
} \
} \
doneCmp: \
double c = constant; \
for ( i = 0; i < pre; i++ ) { \
dst[i] = src0[i] CMP c; \
} \
for ( i = count - post; i < count; i++ ) { \
dst[i] = src0[i] CMP c; \
}
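/*
	The mov/shr/shl/and sequence in both compare macros expands the 4 bit
	movmskps mask into four 0/1 bytes in one register: "mov ah, al / shr ah, 1"
	plants bit 1 in bit 8, the shifted copy plants bits 2 and 3 in bits 16 and
	24, and the final mask keeps exactly one bit per byte. A C sketch of the
	same expansion (illustrative helper; byte i of the result is lane i's
	compare result):
*/
static int Simd_RefMaskToBytes( int mask ) {
	unsigned int ax  = ( mask & 0xFF ) | ( ( ( mask >> 1 ) & 0xFF ) << 8 );	// mov ah, al / shr ah, 1
	unsigned int ebx = ( ax << 14 ) & 0xFFFF0000;							// mov bx, ax / shl ebx, 14
	ebx |= ax & 0xFFFF;														// mov bx, ax
	return (int)( ebx & 0x01010101 );										// and ebx, 0x01010101
}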
#define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
int i, cnt, pre, post; \
float *aligned; \
\
/* if the float array is not aligned on a 4 byte boundary */ \
if ( ((int) SRC0) & 3 ) { \
/* unaligned memory access */ \
pre = 0; \
cnt = COUNT >> 2; \
post = COUNT - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, SRC0 \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm mov cl, bitNum \
__asm loopNA: \
__asm movups xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, 0x01010101 \
__asm shl ebx, cl \
__asm or ebx, dword ptr [edi] \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopNA \
__asm pop ebx \
} \
else { \
/* aligned memory access */ \
aligned = (float *) ((((int) SRC0) + 15) & ~15); \
if ( (int)aligned > ((int)src0) + COUNT ) { \
pre = COUNT; \
post = 0; \
} \
else { \
pre = aligned - SRC0; \
cnt = (COUNT - pre) >> 2; \
post = COUNT - pre - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, aligned \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm add edi, pre \
__asm mov cl, bitNum \
__asm loopA: \
__asm movaps xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, 0x01010101 \
__asm shl ebx, cl \
__asm or ebx, dword ptr [edi] \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopA \
__asm pop ebx \
} \
} \
doneCmp: \
float c = constant; \
for ( i = 0; i < pre; i++ ) { \
dst[i] |= ( src0[i] CMP c ) << BITNUM; \
} \
for ( i = count - post; i < count; i++ ) { \
dst[i] |= ( src0[i] CMP c ) << BITNUM; \
}
/*
============
idSIMD_SSE::CmpGT
dst[i] = src0[i] > constant;
============
*/
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpGT
dst[i] |= ( src0[i] > constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpGE
dst[i] = src0[i] >= constant;
============
*/
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpGE
dst[i] |= ( src0[i] >= constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpLT
dst[i] = src0[i] < constant;
============
*/
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpLT
dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpLE
dst[i] = src0[i] <= constant;
============
*/
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
}
/*
============
idSIMD_SSE::CmpLE
dst[i] |= ( src0[i] <= constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
}
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
int i, pre, post;
min = idMath::INFINITY; max = -idMath::INFINITY;
__asm
{
push ebx
mov eax, min
mov ebx, max
movss xmm0, [eax]
movss xmm1, [ebx]
shufps xmm0, xmm0, 0
shufps xmm1, xmm1, 0
KFLOATINITS( src, count, pre, post )
and eax, 15
jz lpA
jmp lpNA
align 16
lpNA:
movups xmm2, [edx+ebx]
movups xmm3, [edx+ebx+16]
minps xmm0, xmm2
maxps xmm1, xmm2
prefetchnta [edx+ebx+64]
minps xmm0, xmm3
maxps xmm1, xmm3
add ebx, 16*2
jl lpNA
jmp done2
lpA:
movaps xmm2, [edx+ebx]
movaps xmm3, [edx+ebx+16]
minps xmm0, xmm2
maxps xmm1, xmm2
prefetchnta [edx+ebx+64]
minps xmm0, xmm3
maxps xmm1, xmm3
add ebx, 16*2
jl lpA
jmp done2
align 16
done2:
movaps xmm2, xmm0
movaps xmm3, xmm1
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
minss xmm0, xmm2
maxss xmm1, xmm3
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
minss xmm0, xmm2
maxss xmm1, xmm3
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
minss xmm0, xmm2
maxss xmm1, xmm3
mov eax, min
mov ebx, max
movss [eax], xmm0
movss [ebx], xmm1
done:
pop ebx
}
for ( i = 0; i < pre; i++ ) {
float tmp = src[i];
if ( tmp > max ) {
max = tmp;
}
if ( tmp < min ) {
min = tmp;
}
}
for ( i = count - post; i < count; i++ ) {
float tmp = src[i];
if ( tmp > max ) {
max = tmp;
}
if ( tmp < min ) {
min = tmp;
}
}
}
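/*
	The done2 block above reduces the four running lanes by rotating the
	accumulator one lane per step ( R_SHUFFLEPS( 1, 2, 3, 0 ) ) and folding
	with minss/maxss, leaving the final min and max in lane 0 after three
	steps. Scalar reference (illustrative helper only):
*/
static void Simd_RefMinMax4( float &mn, float &mx, const float lanes[4] ) {
	mn = lanes[0];
	mx = lanes[0];
	for ( int i = 1; i < 4; i++ ) {
		if ( lanes[i] < mn ) { mn = lanes[i]; }
		if ( lanes[i] > mx ) { mx = lanes[i]; }
	}
}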
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
__asm {
mov eax, count
test eax, eax
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
jz done
mov ecx, eax
and ecx, 1
mov esi, src
jz startLoop
movlps xmm2, [esi]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
dec eax
add esi, 2*4
minps xmm0, xmm2
maxps xmm1, xmm2
startLoop:
test eax, eax
jz done // guard: no pairs left (e.g. count == 1), avoid reading past the array
imul eax, 2*4
add esi, eax
neg eax
loopVert:
movlps xmm2, [esi+eax]
movhps xmm2, [esi+eax+8]
add eax, 4*4
minps xmm0, xmm2
maxps xmm1, xmm2
jl loopVert
done:
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
minps xmm0, xmm2
mov esi, min
movlps [esi], xmm0
movaps xmm3, xmm1
shufps xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
maxps xmm1, xmm3
mov edi, max
movlps [edi], xmm1
}
}
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
__asm {
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
mov esi, src
mov eax, count
and eax, ~3
jz done4
imul eax, 12
add esi, eax
neg eax
loop4:
// prefetchnta [esi+4*12]
movss xmm4, [esi+eax+0*12+8]
movhps xmm4, [esi+eax+0*12+0]
minps xmm0, xmm4
maxps xmm1, xmm4
movss xmm5, [esi+eax+1*12+0]
movhps xmm5, [esi+eax+1*12+4]
minps xmm2, xmm5
maxps xmm3, xmm5
movss xmm6, [esi+eax+2*12+8]
movhps xmm6, [esi+eax+2*12+0]
minps xmm0, xmm6
maxps xmm1, xmm6
movss xmm7, [esi+eax+3*12+0]
movhps xmm7, [esi+eax+3*12+4]
minps xmm2, xmm7
maxps xmm3, xmm7
add eax, 4*12
jl loop4
done4:
mov eax, count
and eax, 3
jz done1
imul eax, 12
add esi, eax
neg eax
loop1:
movss xmm4, [esi+eax+0*12+8]
movhps xmm4, [esi+eax+0*12+0]
minps xmm0, xmm4
maxps xmm1, xmm4
add eax, 12
jl loop1
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
}
}
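/*
	Note the alternating layout above: even verts enter xmm0/xmm1 as
	( z, -, x, y ) while odd verts enter xmm2/xmm3 as ( x, -, y, z ), which is
	why done1 rotates the second accumulator pair with R_SHUFFLEPS( 3, 1, 0, 2 )
	before the final minps/maxps, and why movhps/movss store ( x, y ) and z
	separately. What the whole routine computes, in plain C (illustrative
	helper only):
*/
static void Simd_RefMinMaxVec3( idVec3 &mn, idVec3 &mx, const idVec3 *v, const int n ) {
	mn.Set( idMath::INFINITY, idMath::INFINITY, idMath::INFINITY );
	mx = -mn;
	for ( int i = 0; i < n; i++ ) {
		if ( v[i].x < mn.x ) { mn.x = v[i].x; }
		if ( v[i].x > mx.x ) { mx.x = v[i].x; }
		if ( v[i].y < mn.y ) { mn.y = v[i].y; }
		if ( v[i].y > mx.y ) { mx.y = v[i].y; }
		if ( v[i].z < mn.z ) { mn.z = v[i].z; }
		if ( v[i].z > mx.z ) { mx.z = v[i].z; }
	}
}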
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
mov esi, src
mov eax, count
and eax, ~3
jz done4
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loop4:
// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm5
maxps xmm3, xmm5
movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm6
maxps xmm1, xmm6
movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm7
maxps xmm3, xmm7
add eax, 4*DRAWVERT_SIZE
jl loop4
done4:
mov eax, count
and eax, 3
jz done1
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loop1:
movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
add eax, DRAWVERT_SIZE
jl loop1
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
}
}
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
mov edi, indexes
mov esi, src
mov eax, count
and eax, ~3
jz done4
shl eax, 2
add edi, eax
neg eax
loop4:
// prefetchnta [edi+128]
// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
mov edx, [edi+eax+4]
imul edx, DRAWVERT_SIZE
movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm5
maxps xmm3, xmm5
mov edx, [edi+eax+8]
imul edx, DRAWVERT_SIZE
movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm6
maxps xmm1, xmm6
mov edx, [edi+eax+12]
imul edx, DRAWVERT_SIZE
movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm7
maxps xmm3, xmm7
add eax, 4*4
jl loop4
done4:
mov eax, count
and eax, 3
jz done1
shl eax, 2
add edi, eax
neg eax
loop1:
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
add eax, 4
jl loop1
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
}
}
/*
============
idSIMD_SSE::Clamp
============
*/
void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
int i, pre, post;
__asm
{
movss xmm0,min
movss xmm1,max
shufps xmm0,xmm0,0
shufps xmm1,xmm1,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
maxps xmm2,xmm0
maxps xmm3,xmm0
prefetchnta [edx+ebx+64]
minps xmm2,xmm1
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
maxps xmm2,xmm0
maxps xmm3,xmm0
prefetchnta [edx+ebx+64]
minps xmm2,xmm1
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpNA
done:
}
for ( i = 0; i < pre; i++ ) {
if ( src[i] < min )
dst[i] = min;
else if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
for( i = count - post; i < count; i++ ) {
if ( src[i] < min )
dst[i] = min;
else if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
}
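/*
	Clamp() is one maxps against the lower bound followed by one minps against
	the upper bound per register; because minps is applied last, the upper
	bound wins if min > max is ever passed in. Scalar equivalent of one lane
	(illustrative helper only):
*/
static float Simd_RefClamp( float x, float lo, float hi ) {
	float r = ( x > lo ) ? x : lo;	// maxps
	return ( r < hi ) ? r : hi;		// minps
}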
/*
============
idSIMD_SSE::ClampMin
============
*/
void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
int i, pre, post;
__asm
{
movss xmm0,min
shufps xmm0,xmm0,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
maxps xmm2,xmm0
prefetchnta [edx+ebx+64]
maxps xmm3,xmm0
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
maxps xmm2,xmm0
prefetchnta [edx+ebx+64]
maxps xmm3,xmm0
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpNA
done:
}
for( i = 0; i < pre; i++ ) {
if ( src[i] < min )
dst[i] = min;
else
dst[i] = src[i];
}
for( i = count - post; i < count; i++ ) {
if ( src[i] < min )
dst[i] = min;
else
dst[i] = src[i];
}
}
/*
============
idSIMD_SSE::ClampMax
============
*/
void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
int i, pre, post;
__asm
{
movss xmm1,max
shufps xmm1,xmm1,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
minps xmm2,xmm1
prefetchnta [edx+ebx+64]
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
minps xmm2,xmm1
prefetchnta [edx+ebx+64]
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpNA
done:
}
for( i = 0; i < pre; i++ ) {
if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
for( i = count - post; i < count; i++ ) {
if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
}
/*
============
idSIMD_SSE::Zero16
============
*/
void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
__asm {
mov edx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneZero16
shl eax, 4
add edx, eax
neg eax
xorps xmm0, xmm0
loopZero16:
movaps [edx+eax], xmm0
add eax, 16
jl loopZero16
doneZero16:
}
}
/*
============
idSIMD_SSE::Negate16
============
*/
void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
__asm {
mov edx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneNegate16
shl eax, 4
add edx, eax
neg eax
movss xmm0, SIMD_SP_signBitMask
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
loopNegate16:
movaps xmm1, [edx+eax]
xorps xmm1, xmm0
movaps [edx+eax], xmm1
add eax, 16
jl loopNegate16
doneNegate16:
}
}
/*
============
idSIMD_SSE::Copy16
============
*/
void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
__asm {
mov ecx, src
mov edx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneCopy16
shl eax, 4
add ecx, eax
add edx, eax
neg eax
loopCopy16:
movaps xmm0, [ecx+eax]
movaps [edx+eax], xmm0
add eax, 16
jl loopCopy16
doneCopy16:
}
}
/*
============
idSIMD_SSE::Add16
============
*/
void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
__asm {
mov ecx, src1
mov edx, src2
mov esi, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneAdd16
shl eax, 4
add esi, eax
add ecx, eax
add edx, eax
neg eax
loopAdd16:
movaps xmm0, [ecx+eax]
addps xmm0, [edx+eax]
movaps [esi+eax], xmm0
add eax, 16
jl loopAdd16
doneAdd16:
}
}
/*
============
idSIMD_SSE::Sub16
============
*/
void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
__asm {
mov ecx, src1
mov edx, src2
mov esi, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneSub16
shl eax, 4
add esi, eax
add ecx, eax
add edx, eax
neg eax
loopSub16:
movaps xmm0, [ecx+eax]
subps xmm0, [edx+eax]
movaps [esi+eax], xmm0
add eax, 16
jl loopSub16
doneSub16:
}
}
/*
============
idSIMD_SSE::Mul16
============
*/
void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
__asm {
mov ecx, dst
mov edx, src1
mov eax, count
add eax, 3
shr eax, 2
jz doneMulScalar16
movss xmm1, constant
shl eax, 4
add ecx, eax
add edx, eax
neg eax
shufps xmm1, xmm1, 0x00
loopMulScalar16:
movaps xmm0, [edx+eax]
mulps xmm0, xmm1
movaps [ecx+eax], xmm0
add eax, 16
jl loopMulScalar16
doneMulScalar16:
}
}
/*
============
idSIMD_SSE::AddAssign16
============
*/
void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
__asm {
mov ecx, dst
mov edx, src
mov eax, count
add eax, 3
shr eax, 2
jz doneAddAssign16
shl eax, 4
add ecx, eax
add edx, eax
neg eax
loopAddAssign16:
movaps xmm0, [ecx+eax]
addps xmm0, [edx+eax]
movaps [ecx+eax], xmm0
add eax, 16
jl loopAddAssign16
doneAddAssign16:
}
}
/*
============
idSIMD_SSE::SubAssign16
============
*/
void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
__asm {
mov ecx, dst
mov edx, src
mov eax, count
add eax, 3
shr eax, 2
jz doneSubAssign16
shl eax, 4
add ecx, eax
add edx, eax
neg eax
loopSubAssign16:
movaps xmm0, [ecx+eax]
subps xmm0, [edx+eax]
movaps [ecx+eax], xmm0
add eax, 16
jl loopSubAssign16
doneSubAssign16:
}
}
/*
============
idSIMD_SSE::MulAssign16
============
*/
void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
__asm {
mov ecx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneMulAssign16
movss xmm1, constant
shl eax, 4
add ecx, eax
neg eax
shufps xmm1, xmm1, 0x00
loopMulAssign16:
movaps xmm0, [ecx+eax]
mulps xmm0, xmm1
movaps [ecx+eax], xmm0
add eax, 16
jl loopMulAssign16
doneMulAssign16:
}
}
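/*
	All of the *16 routines above assume their arrays are 16 byte aligned and
	padded to a multiple of four floats: "add eax, 3 / shr eax, 2" rounds the
	count up to whole quads, so the last quad may legitimately read and write
	padding past count. The iteration count, written out (illustrative helper
	only):
*/
static int Simd_RefNumQuads16( const int count ) {
	// quads processed by Zero16/Negate16/Copy16/Add16/Sub16/Mul16/...Assign16
	return ( count + 3 ) >> 2;
}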
/*
============
idSIMD_SSE::MatX_MultiplyVecX
optimizes the following matrix multiplications:
NxN * Nx1
Nx6 * 6x1
6xN * Nx1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss [eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps [eax+offset], reg1
#define STORE4( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1 \
__asm movhps [eax+offset+8], reg1
#define STOREC =
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 + xmm1
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 + xmm3
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 + xmm1
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 5
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
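/*
	MatX_MultiplyVecX only special-cases the shapes listed in its header
	comment ( NxN, Nx6 and 6xN with N in [1-6] ); every other shape falls
	through to the generic row loop at the bottom. A typical call, as a sketch
	(assumes the globally selected SIMDProcessor; this example function is not
	part of the engine):
*/
static void Simd_ExampleMatXMultiplyVecX( void ) {
	idMatX m( 6, 6 );
	idVecX v( 6 ), r( 6 );
	m.Identity();
	v.Zero();
	SIMDProcessor->MatX_MultiplyVecX( r, m, v );	// hits the 6x6 * 6x1 fast path
}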
/*
============
idSIMD_SSE::MatX_MultiplyAddVecX
optimizes the following matrix multiplications:
NxN * Nx1
Nx6 * 6x1
6xN * Nx1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm addss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC +=
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 += xmm3
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 += xmm5
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 += xmm3
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 4
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
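/*
	usage sketch (hypothetical matrices/vectors a, b, x, y): because the Add
	variant accumulates into dst, a caller can build dst = a * x + b * y by
	chaining the routines:

		SIMDProcessor->MatX_MultiplyVecX( dst, a, x );		// dst  = a * x
		SIMDProcessor->MatX_MultiplyAddVecX( dst, b, y );	// dst += b * y
*/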
/*
============
idSIMD_SSE::MatX_MultiplySubVecX
optimizes the following matrix multiplications:
NxN * Nx1
Nx6 * 6x1
6xN * Nx1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm subss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC -=
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 += xmm3
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 += xmm5
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 += xmm3
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 4
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
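/*
	usage sketch (hypothetical names A, x, b, r): the Sub variant is the
	natural fit for residual updates of the form r = b - A * x:

		r = b;											// start from the right-hand side
		SIMDProcessor->MatX_MultiplySubVecX( r, A, x );	// r -= A * x
*/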
/*
============
idSIMD_SSE::MatX_TransposeMultiplyVecX
optimizes the following matrix multiplications:
Nx6 * Nx1
6xN * 6x1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss [eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps [eax+offset], reg1
#define STORE4( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1 \
__asm movhps [eax+offset+8], reg1
#define STOREC =
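	// note: unlike the Add/Sub variants, these STORE* macros write reg1 straight
	// to the destination without reading it back first, and STOREC expands to a
	// plain '=' so the scalar fallback paths below overwrite dst as well.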
int numColumns;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumRows() );
assert( dst.GetSize() >= mat.GetNumColumns() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numColumns = mat.GetNumColumns();
switch( mat.GetNumRows() ) {
case 1:
switch( numColumns ) {
case 6: { // 1x6 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm3 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0];
mPtr++;
}
return;
}
}
break;
case 2:
switch( numColumns ) {
case 6: { // 2x6 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, [edi]
mulps xmm2, xmm0
movlps xmm3, [edi+24]
movhps xmm3, [edi+32]
mulps xmm3, xmm1
addps xmm2, xmm3
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+16]
movhps xmm4, [edi+40]
mulps xmm4, xmm0
movhlps xmm3, xmm4
addps xmm3, xmm4
STORE4( 0, xmm2, xmm5 )
STORE2LO( 16, xmm3, xmm6 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
mPtr++;
}
return;
}
}
break;
case 3:
switch( numColumns ) {
case 6: { // 3x6 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movss xmm1, [esi+2*4]
movlps xmm3, [edi+(0*6+0)*4]
movhps xmm3, [edi+(0*6+2)*4]
movaps xmm4, xmm0
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm1
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(2*6+4)*4]
mulps xmm5, xmm1
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
mPtr++;
}
return;
}
}
break;
case 4:
switch( numColumns ) {
case 6: { // 4x6 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3];
mPtr++;
}
return;
}
}
break;
case 5:
switch( numColumns ) {
case 6: { // 5x6 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movss xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm2
mulps xmm4, [edi+(4*6+0)*4]
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
mPtr++;
}
return;
}
}
break;
case 6:
switch( numColumns ) {
case 1: { // 6x1 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movhps xmm0, [esi+8]
movlps xmm1, [esi+16]
mulps xmm0, [edi]
mulps xmm1, [edi+16]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
addps xmm0, xmm1
movhlps xmm2, xmm0
addss xmm2, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm2, xmm0
STORE1( 0, xmm2, xmm3 )
}
return;
}
case 2: { // 6x2 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, [edi+0*4]
mulps xmm6, xmm0
movlps xmm1, [esi+2*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+4*4]
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm2, [esi+4*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+8*4]
mulps xmm7, xmm2
addps xmm6, xmm7
movhlps xmm3, xmm6
addps xmm3, xmm6
STORE2LO( 0, xmm3, xmm7 )
}
return;
}
case 3: { // 6x3 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+(0*3+2)*4]
movhps xmm0, [edi+(0*3+0)*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm6, [esi+0*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm1, [edi+(1*3+0)*4]
movhps xmm1, [edi+(1*3+1)*4]
movss xmm7, [esi+1*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm2, [edi+(2*3+2)*4]
movhps xmm2, [edi+(2*3+0)*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+2*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movss xmm3, [edi+(3*3+0)*4]
movhps xmm3, [edi+(3*3+1)*4]
movss xmm7, [esi+3*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movss xmm4, [edi+(4*3+2)*4]
movhps xmm4, [edi+(4*3+0)*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+4*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movss xmm5, [edi+(5*3+0)*4]
movhps xmm5, [edi+(5*3+1)*4]
movss xmm7, [esi+5*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE1( 0, xmm6, xmm7 )
STORE2HI( 4, xmm6, xmm7 )
}
return;
}
case 4: { // 6x4 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm3, [edi+(0*4+0)*4]
movhps xmm3, [edi+(0*4+2)*4]
movss xmm4, [esi+0*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*4+0)*4]
movhps xmm5, [edi+(1*4+2)*4]
movss xmm6, [esi+1*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*4+0)*4]
movhps xmm4, [edi+(2*4+2)*4]
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*4+0)*4]
movhps xmm5, [edi+(3*4+2)*4]
movss xmm6, [esi+3*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(4*4+0)*4]
movhps xmm4, [edi+(4*4+2)*4]
movss xmm6, [esi+4*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(5*4+0)*4]
movhps xmm5, [edi+(5*4+2)*4]
movss xmm6, [esi+5*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
}
return;
}
case 5: { // 6x5 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [edi+(0*5+0)*4]
movhps xmm6, [edi+(0*5+2)*4]
movss xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movlps xmm7, [edi+(1*5+0)*4]
movhps xmm7, [edi+(1*5+2)*4]
movss xmm1, [esi+1*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm7, [edi+(2*5+0)*4]
movhps xmm7, [edi+(2*5+2)*4]
movss xmm2, [esi+2*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movlps xmm7, [edi+(3*5+0)*4]
movhps xmm7, [edi+(3*5+2)*4]
movss xmm3, [esi+3*4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movlps xmm7, [edi+(4*5+0)*4]
movhps xmm7, [edi+(4*5+2)*4]
movss xmm4, [esi+4*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movlps xmm7, [edi+(5*5+0)*4]
movhps xmm7, [edi+(5*5+2)*4]
movss xmm5, [esi+5*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE4( 0, xmm6, xmm7 )
movss xmm6, [edi+(0*5+4)*4]
mulss xmm6, xmm0
movss xmm7, [edi+(1*5+4)*4]
mulss xmm7, xmm1
addss xmm6, xmm7
movss xmm7, [edi+(2*5+4)*4]
mulss xmm7, xmm2
addss xmm6, xmm7
movss xmm7, [edi+(3*5+4)*4]
mulss xmm7, xmm3
addss xmm6, xmm7
movss xmm7, [edi+(4*5+4)*4]
mulss xmm7, xmm4
addss xmm6, xmm7
movss xmm7, [edi+(5*5+4)*4]
mulss xmm7, xmm5
addss xmm6, xmm7
STORE1( 16, xmm6, xmm7 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movlps xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(4*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(5*6+0)*4]
movhps xmm5, [edi+(5*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
movhps xmm5, [edi+(5*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
mPtr++;
}
return;
}
}
break;
default:
int numRows = mat.GetNumRows();
for ( int i = 0; i < numColumns; i++ ) {
mPtr = mat.ToFloatPtr() + i;
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numRows; j++ ) {
mPtr += numColumns;
sum += mPtr[0] * vPtr[j];
}
dstPtr[i] STOREC sum;
}
break;
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
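/*
============
MatX_TransposeMultiplyVecX_Sketch

	illustrative scalar reference for the transpose routines: dst = mat^T * vec,
	walking each column of the row-major matrix with a stride of numColumns.
	this sketch mirrors the generic fallback path above and is never called.
============
*/
static void MatX_TransposeMultiplyVecX_Sketch( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	const float *vPtr = vec.ToFloatPtr();
	float *dstPtr = dst.ToFloatPtr();
	int numColumns = mat.GetNumColumns();
	int numRows = mat.GetNumRows();
	for ( int i = 0; i < numColumns; i++ ) {
		const float *mPtr = mat.ToFloatPtr() + i;	// top of column i
		float sum = mPtr[0] * vPtr[0];
		for ( int j = 1; j < numRows; j++ ) {
			mPtr += numColumns;						// step down column i
			sum += mPtr[0] * vPtr[j];
		}
		dstPtr[i] = sum;
	}
}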
/*
============
idSIMD_SSE::MatX_TransposeMultiplyAddVecX
optimizes the following matrix multiplications:
Nx6 * Nx1
6xN * 6x1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm addss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC +=
int numColumns;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumRows() );
assert( dst.GetSize() >= mat.GetNumColumns() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numColumns = mat.GetNumColumns();
switch( mat.GetNumRows() ) {
case 1:
switch( numColumns ) {
case 6: { // 1x6 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm3 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0];
mPtr++;
}
return;
}
}
break;
case 2:
switch( numColumns ) {
case 6: { // 2x6 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, [edi]
mulps xmm2, xmm0
movlps xmm3, [edi+24]
movhps xmm3, [edi+32]
mulps xmm3, xmm1
addps xmm2, xmm3
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+16]
movhps xmm4, [edi+40]
mulps xmm4, xmm0
movhlps xmm3, xmm4
addps xmm3, xmm4
STORE4( 0, xmm2, xmm5 )
STORE2LO( 16, xmm3, xmm6 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
mPtr++;
}
return;
}
}
break;
case 3:
switch( numColumns ) {
case 6: { // 3x6 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movss xmm1, [esi+2*4]
movlps xmm3, [edi+(0*6+0)*4]
movhps xmm3, [edi+(0*6+2)*4]
movaps xmm4, xmm0
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm1
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(2*6+4)*4]
mulps xmm5, xmm1
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
mPtr++;
}
return;
}
}
break;
case 4:
switch( numColumns ) {
case 6: { // 4x6 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3];
mPtr++;
}
return;
}
}
break;
case 5:
switch( numColumns ) {
case 6: { // 5x6 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movss xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm2
mulps xmm4, [edi+(4*6+0)*4]
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
mPtr++;
}
return;
}
}
break;
case 6:
switch( numColumns ) {
case 1: { // 6x1 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movhps xmm0, [esi+8]
movlps xmm1, [esi+16]
mulps xmm0, [edi]
mulps xmm1, [edi+16]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
addps xmm0, xmm1
movhlps xmm2, xmm0
addss xmm2, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm2, xmm0
STORE1( 0, xmm2, xmm3 )
}
return;
}
case 2: { // 6x2 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, [edi+0*4]
mulps xmm6, xmm0
movlps xmm1, [esi+2*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+4*4]
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm2, [esi+4*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+8*4]
mulps xmm7, xmm2
addps xmm6, xmm7
movhlps xmm3, xmm6
addps xmm3, xmm6
STORE2LO( 0, xmm3, xmm7 )
}
return;
}
case 3: { // 6x3 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+(0*3+2)*4]
movhps xmm0, [edi+(0*3+0)*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm6, [esi+0*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm1, [edi+(1*3+0)*4]
movhps xmm1, [edi+(1*3+1)*4]
movss xmm7, [esi+1*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm2, [edi+(2*3+2)*4]
movhps xmm2, [edi+(2*3+0)*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+2*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movss xmm3, [edi+(3*3+0)*4]
movhps xmm3, [edi+(3*3+1)*4]
movss xmm7, [esi+3*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movss xmm4, [edi+(4*3+2)*4]
movhps xmm4, [edi+(4*3+0)*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+4*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movss xmm5, [edi+(5*3+0)*4]
movhps xmm5, [edi+(5*3+1)*4]
movss xmm7, [esi+5*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE1( 0, xmm6, xmm7 )
STORE2HI( 4, xmm6, xmm7 )
}
return;
}
case 4: { // 6x4 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm3, [edi+(0*4+0)*4]
movhps xmm3, [edi+(0*4+2)*4]
movss xmm4, [esi+0*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*4+0)*4]
movhps xmm5, [edi+(1*4+2)*4]
movss xmm6, [esi+1*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*4+0)*4]
movhps xmm4, [edi+(2*4+2)*4]
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*4+0)*4]
movhps xmm5, [edi+(3*4+2)*4]
movss xmm6, [esi+3*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(4*4+0)*4]
movhps xmm4, [edi+(4*4+2)*4]
movss xmm6, [esi+4*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(5*4+0)*4]
movhps xmm5, [edi+(5*4+2)*4]
movss xmm6, [esi+5*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
}
return;
}
case 5: { // 6x5 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [edi+(0*5+0)*4]
movhps xmm6, [edi+(0*5+2)*4]
movss xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movlps xmm7, [edi+(1*5+0)*4]
movhps xmm7, [edi+(1*5+2)*4]
movss xmm1, [esi+1*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm7, [edi+(2*5+0)*4]
movhps xmm7, [edi+(2*5+2)*4]
movss xmm2, [esi+2*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movlps xmm7, [edi+(3*5+0)*4]
movhps xmm7, [edi+(3*5+2)*4]
movss xmm3, [esi+3*4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movlps xmm7, [edi+(4*5+0)*4]
movhps xmm7, [edi+(4*5+2)*4]
movss xmm4, [esi+4*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movlps xmm7, [edi+(5*5+0)*4]
movhps xmm7, [edi+(5*5+2)*4]
movss xmm5, [esi+5*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE4( 0, xmm6, xmm7 )
movss xmm6, [edi+(0*5+4)*4]
mulss xmm6, xmm0
movss xmm7, [edi+(1*5+4)*4]
mulss xmm7, xmm1
addss xmm6, xmm7
movss xmm7, [edi+(2*5+4)*4]
mulss xmm7, xmm2
addss xmm6, xmm7
movss xmm7, [edi+(3*5+4)*4]
mulss xmm7, xmm3
addss xmm6, xmm7
movss xmm7, [edi+(4*5+4)*4]
mulss xmm7, xmm4
addss xmm6, xmm7
movss xmm7, [edi+(5*5+4)*4]
mulss xmm7, xmm5
addss xmm6, xmm7
STORE1( 16, xmm6, xmm7 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movlps xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(4*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(5*6+0)*4]
movhps xmm5, [edi+(5*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
movhps xmm5, [edi+(5*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
mPtr++;
}
return;
}
}
break;
default:
int numRows = mat.GetNumRows();
for ( int i = 0; i < numColumns; i++ ) {
mPtr = mat.ToFloatPtr() + i;
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numRows; j++ ) {
mPtr += numColumns;
sum += mPtr[0] * vPtr[j];
}
dstPtr[i] STOREC sum;
}
break;
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
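/*
	for readers more comfortable with intrinsics: the core pattern of the
	transpose cases above - broadcast one vector element, multiply four matrix
	entries, accumulate - corresponds to the sketch below. it is an assumed
	equivalent for illustration only, kept disabled so it does not alter the build.
*/
#if 0
#include <xmmintrin.h>
static void TransposeMulAdd4( float *dst, const float *row, float s ) {
	__m128 acc = _mm_loadu_ps( dst );				// current destination values
	__m128 b   = _mm_set1_ps( s );					// broadcast, like shufps with R_SHUFFLEPS( 0, 0, 0, 0 )
	__m128 r   = _mm_loadu_ps( row );				// four matrix entries
	acc = _mm_add_ps( acc, _mm_mul_ps( r, b ) );	// acc += row * s
	_mm_storeu_ps( dst, acc );
}
#endif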
/*
============
idSIMD_SSE::MatX_TransposeMultiplySubVecX
optimizes the following matrix multiplications:
Nx6 * Nx1
6xN * 6x1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm subss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC -=
int numColumns;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumRows() );
assert( dst.GetSize() >= mat.GetNumColumns() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numColumns = mat.GetNumColumns();
switch( mat.GetNumRows() ) {
case 1:
switch( numColumns ) {
case 6: { // 1x6 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm3 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0];
mPtr++;
}
return;
}
}
break;
case 2:
switch( numColumns ) {
case 6: { // 2x6 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, [edi]
mulps xmm2, xmm0
movlps xmm3, [edi+24]
movhps xmm3, [edi+32]
mulps xmm3, xmm1
addps xmm2, xmm3
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+16]
movhps xmm4, [edi+40]
mulps xmm4, xmm0
movhlps xmm3, xmm4
addps xmm3, xmm4
STORE4( 0, xmm2, xmm5 )
STORE2LO( 16, xmm3, xmm6 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
mPtr++;
}
return;
}
}
break;
case 3:
switch( numColumns ) {
case 6: { // 3x6 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movss xmm1, [esi+2*4]
movlps xmm3, [edi+(0*6+0)*4]
movhps xmm3, [edi+(0*6+2)*4]
movaps xmm4, xmm0
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm1
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(2*6+4)*4]
mulps xmm5, xmm1
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
mPtr++;
}
return;
}
}
break;
case 4:
switch( numColumns ) {
case 6: { // 4x6 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3];
mPtr++;
}
return;
}
}
break;
case 5:
switch( numColumns ) {
case 6: { // 5x6 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movss xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm2
mulps xmm4, [edi+(4*6+0)*4]
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
mPtr++;
}
return;
}
}
break;
case 6:
switch( numColumns ) {
case 1: { // 6x1 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movhps xmm0, [esi+8]
movlps xmm1, [esi+16]
mulps xmm0, [edi]
mulps xmm1, [edi+16]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
addps xmm0, xmm1
movhlps xmm2, xmm0
addss xmm2, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm2, xmm0
STORE1( 0, xmm2, xmm3 )
}
return;
}
case 2: { // 6x2 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, [edi+0*4]
mulps xmm6, xmm0
movlps xmm1, [esi+2*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+4*4]
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm2, [esi+4*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+8*4]
mulps xmm7, xmm2
addps xmm6, xmm7
movhlps xmm3, xmm6
addps xmm3, xmm6
STORE2LO( 0, xmm3, xmm7 )
}
return;
}
case 3: { // 6x3 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+(0*3+2)*4]
movhps xmm0, [edi+(0*3+0)*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm6, [esi+0*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm1, [edi+(1*3+0)*4]
movhps xmm1, [edi+(1*3+1)*4]
movss xmm7, [esi+1*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm2, [edi+(2*3+2)*4]
movhps xmm2, [edi+(2*3+0)*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+2*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movss xmm3, [edi+(3*3+0)*4]
movhps xmm3, [edi+(3*3+1)*4]
movss xmm7, [esi+3*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movss xmm4, [edi+(4*3+2)*4]
movhps xmm4, [edi+(4*3+0)*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+4*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movss xmm5, [edi+(5*3+0)*4]
movhps xmm5, [edi+(5*3+1)*4]
movss xmm7, [esi+5*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE1( 0, xmm6, xmm7 )
STORE2HI( 4, xmm6, xmm7 )
}
return;
}
case 4: { // 6x4 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm3, [edi+(0*4+0)*4]
movhps xmm3, [edi+(0*4+2)*4]
movss xmm4, [esi+0*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*4+0)*4]
movhps xmm5, [edi+(1*4+2)*4]
movss xmm6, [esi+1*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*4+0)*4]
movhps xmm4, [edi+(2*4+2)*4]
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*4+0)*4]
movhps xmm5, [edi+(3*4+2)*4]
movss xmm6, [esi+3*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(4*4+0)*4]
movhps xmm4, [edi+(4*4+2)*4]
movss xmm6, [esi+4*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(5*4+0)*4]
movhps xmm5, [edi+(5*4+2)*4]
movss xmm6, [esi+5*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
}
return;
}
case 5: { // 6x5 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [edi+(0*5+0)*4]
movhps xmm6, [edi+(0*5+2)*4]
movss xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movlps xmm7, [edi+(1*5+0)*4]
movhps xmm7, [edi+(1*5+2)*4]
movss xmm1, [esi+1*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm7, [edi+(2*5+0)*4]
movhps xmm7, [edi+(2*5+2)*4]
movss xmm2, [esi+2*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movlps xmm7, [edi+(3*5+0)*4]
movhps xmm7, [edi+(3*5+2)*4]
movss xmm3, [esi+3*4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movlps xmm7, [edi+(4*5+0)*4]
movhps xmm7, [edi+(4*5+2)*4]
movss xmm4, [esi+4*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movlps xmm7, [edi+(5*5+0)*4]
movhps xmm7, [edi+(5*5+2)*4]
movss xmm5, [esi+5*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE4( 0, xmm6, xmm7 )
movss xmm6, [edi+(0*5+4)*4]
mulss xmm6, xmm0
movss xmm7, [edi+(1*5+4)*4]
mulss xmm7, xmm1
addss xmm6, xmm7
movss xmm7, [edi+(2*5+4)*4]
mulss xmm7, xmm2
addss xmm6, xmm7
movss xmm7, [edi+(3*5+4)*4]
mulss xmm7, xmm3
addss xmm6, xmm7
movss xmm7, [edi+(4*5+4)*4]
mulss xmm7, xmm4
addss xmm6, xmm7
movss xmm7, [edi+(5*5+4)*4]
mulss xmm7, xmm5
addss xmm6, xmm7
STORE1( 16, xmm6, xmm7 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movlps xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(4*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(5*6+0)*4]
movhps xmm5, [edi+(5*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
movhps xmm5, [edi+(5*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
mPtr++;
}
return;
}
}
break;
default:
int numRows = mat.GetNumRows();
for ( int i = 0; i < numColumns; i++ ) {
mPtr = mat.ToFloatPtr() + i;
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numRows; j++ ) {
mPtr += numColumns;
sum += mPtr[0] * vPtr[j];
}
dstPtr[i] STOREC sum;
}
break;
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
/*
============
idSIMD_SSE::MatX_MultiplyMatX
optimizes the following matrix multiplications:
NxN * Nx6
6xN * Nx6
Nx6 * 6xN
6x6 * 6xN
with N in the range [1-6].
The hot cache clock cycle counts are generally better for the SIMD version
than for the FPU version; at times up to 40% fewer clock cycles on a P3.
In practice, however, the results are poor, probably due to memory access.
============
*/
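// Scalar reference for all the cases below ( cf. the generic fall-back
// paths ):
//
//	dst[i][j] = sum( m1[i][n] * m2[n][j] ) over n = 0 .. m1.GetNumColumns()-1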
void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
int i, j, k, l, n;
float *dstPtr;
const float *m1Ptr, *m2Ptr;
double sum;
assert( m1.GetNumColumns() == m2.GetNumRows() );
dstPtr = dst.ToFloatPtr();
m1Ptr = m1.ToFloatPtr();
m2Ptr = m2.ToFloatPtr();
k = m1.GetNumRows();
l = m2.GetNumColumns();
n = m1.GetNumColumns();
switch( n ) {
case 1: {
if ( !(l^6) ) {
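// !(l^6) tests l == 6: the XOR is zero exactly when the operands are equal.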
switch( k ) {
case 1: { // 1x1 * 1x6, no precision loss compared to FPU version
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movss xmm0, [edi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, [esi]
mulps xmm1, xmm0
movaps [eax], xmm1
movlps xmm2, [esi+16]
mulps xmm2, xmm0
movlps [eax+16], xmm2
}
return;
}
case 6: { // 6x1 * 1x6, no precision loss compared to FPU version
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
xorps xmm1, xmm1
movaps xmm0, [edi]
movlps xmm1, [edi+16]
movlhps xmm1, xmm0
movhlps xmm2, xmm0
movlhps xmm2, xmm1
// row 0 and 1
movaps xmm3, [esi]
movaps xmm4, xmm3
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm5, xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, xmm3
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
movaps [eax], xmm4
movaps [eax+16], xmm5
movaps [eax+32], xmm6
// row 2 and 3
movaps xmm4, xmm3
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
movaps xmm5, xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm3, xmm2
movaps [eax+48], xmm4
movaps [eax+64], xmm5
movaps [eax+80], xmm3
// row 4 and 5
movlps xmm3, [esi+16]
movaps xmm4, xmm3
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm5, xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm3, xmm2
movaps [eax+96], xmm4
movaps [eax+112], xmm5
movaps [eax+128], xmm3
}
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
m2Ptr++;
}
m1Ptr++;
}
break;
}
case 2: {
if ( !(l^6) ) {
switch( k ) {
case 2: { // 2x2 * 2x6
#define MUL_Nx2_2x6_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movlps xmm1, [esi+16] \
__asm movhps xmm1, [esi+40] \
__asm movlps xmm2, [esi+24] \
__asm movhps xmm2, [esi+32]
#define MUL_Nx2_2x6_ROW2( row ) \
__asm movaps xmm3, [edi+row*16] \
__asm movaps xmm5, xmm0 \
__asm movaps xmm4, xmm3 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm4 \
__asm movaps xmm4, xmm3 \
__asm movaps xmm6, xmm2 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm5, xmm6 \
__asm movaps [eax+row*48], xmm5 \
__asm movaps xmm4, xmm3 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movaps xmm7, xmm1 \
__asm mulps xmm7, xmm4 \
__asm movaps xmm4, xmm3 \
__asm movaps xmm5, xmm0 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \
__asm mulps xmm5, xmm4 \
__asm movaps xmm4, xmm3 \
__asm movaps xmm6, xmm2 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm5, xmm6 \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm movaps xmm6, xmm1 \
__asm mulps xmm6, xmm3 \
__asm movaps xmm4, xmm7 \
__asm movlhps xmm7, xmm6 \
__asm movhlps xmm6, xmm4 \
__asm addps xmm6, xmm7 \
__asm movlps [eax+row*48+16], xmm6 \
__asm movlps [eax+row*48+24], xmm5 \
__asm movhps [eax+row*48+32], xmm5 \
__asm movhps [eax+row*48+40], xmm6
MUL_Nx2_2x6_INIT
MUL_Nx2_2x6_ROW2( 0 )
return;
}
case 6: { // 6x2 * 2x6
MUL_Nx2_2x6_INIT
MUL_Nx2_2x6_ROW2( 0 )
MUL_Nx2_2x6_ROW2( 1 )
MUL_Nx2_2x6_ROW2( 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
m2Ptr++;
}
m1Ptr += 2;
}
break;
}
case 3: {
if ( !(l^6) ) {
switch( k ) {
case 3: { // 3x3 * 3x6
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movaps xmm5, xmmword ptr [esi]
movlps xmm6, qword ptr [esi+24]
movhps xmm6, qword ptr [esi+32]
movaps xmm7, xmmword ptr [esi+48]
movss xmm0, dword ptr [edi]
shufps xmm0, xmm0, 0
mulps xmm0, xmm5
movss xmm1, dword ptr [edi+4]
shufps xmm1, xmm1, 0
mulps xmm1, xmm6
movss xmm2, dword ptr [edi+8]
shufps xmm2, xmm2, 0
mulps xmm2, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
movaps xmmword ptr [eax], xmm0
movss xmm3, dword ptr [edi+12]
shufps xmm3, xmm3, 0
mulps xmm3, xmm5
movss xmm4, dword ptr [edi+16]
shufps xmm4, xmm4, 0
mulps xmm4, xmm6
movss xmm0, dword ptr [edi+20]
shufps xmm0, xmm0, 0
mulps xmm0, xmm7
addps xmm3, xmm4
addps xmm0, xmm3
movlps qword ptr [eax+24], xmm0
movhps qword ptr [eax+32], xmm0
movss xmm1, dword ptr [edi+24]
shufps xmm1, xmm1, 0
mulps xmm1, xmm5
movss xmm2, dword ptr [edi+28]
shufps xmm2, xmm2, 0
mulps xmm2, xmm6
movss xmm3, dword ptr [edi+32]
shufps xmm3, xmm3, 0
mulps xmm3, xmm7
addps xmm1, xmm2
addps xmm1, xmm3
movaps xmmword ptr [eax+48], xmm1
movlps xmm5, qword ptr [esi+16]
movlps xmm6, qword ptr [esi+40]
movlps xmm7, qword ptr [esi+64]
shufps xmm5, xmm5, 0x44
shufps xmm6, xmm6, 0x44
shufps xmm7, xmm7, 0x44
movaps xmm3, xmmword ptr [edi]
movlps xmm4, qword ptr [edi+16]
movaps xmm0, xmm3
shufps xmm0, xmm0, 0xF0
mulps xmm0, xmm5
movaps xmm1, xmm3
shufps xmm1, xmm4, 0x05
mulps xmm1, xmm6
shufps xmm3, xmm4, 0x5A
mulps xmm3, xmm7
addps xmm1, xmm0
addps xmm1, xmm3
movlps qword ptr [eax+16], xmm1
movhps qword ptr [eax+40], xmm1
movss xmm0, dword ptr [edi+24]
shufps xmm0, xmm0, 0
mulps xmm0, xmm5
movss xmm2, dword ptr [edi+28]
shufps xmm2, xmm2, 0
mulps xmm2, xmm6
movss xmm4, dword ptr [edi+32]
shufps xmm4, xmm4, 0
mulps xmm4, xmm7
addps xmm0, xmm2
addps xmm0, xmm4
movlps qword ptr [eax+64], xmm0
}
return;
}
case 6: { // 6x3 * 3x6
#define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4]
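// xmm0-xmm2 hold columns 0-3 of the three 3x6 m2 rows; columns 4-5 are
// produced by the LAST2COLUMNS macro further down.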
#define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \
__asm movss xmm3, [edi+(row*3+0)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm3, xmm0 \
__asm movss xmm4, [edi+(row*3+1)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm4, xmm1 \
__asm addps xmm3, xmm4 \
__asm movss xmm5, [edi+(row*3+2)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm5 \
__asm movlps [eax+(row*6+0)*4], xmm3 \
__asm movhps [eax+(row*6+2)*4], xmm3
#define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm movlps xmm2, [esi+16*4] \
__asm shufps xmm0, xmm0, 0x44 \
__asm shufps xmm1, xmm1, 0x44 \
__asm shufps xmm2, xmm2, 0x44 \
__asm movlps xmm3, [edi+0*4] \
__asm movhps xmm3, [edi+2*4] \
__asm movaps xmm4, xmm3 \
__asm movaps xmm5, xmm3 \
__asm shufps xmm3, xmm3, 0xF0 \
__asm mulps xmm3, xmm0 \
__asm movlps xmm6, [edi+4*4] \
__asm movhps xmm6, [edi+6*4] \
__asm shufps xmm4, xmm6, 0x05 \
__asm mulps xmm4, xmm1 \
__asm addps xmm3, xmm4 \
__asm shufps xmm5, xmm6, 0x5A \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm5 \
__asm movlps [eax+4*4], xmm3 \
__asm movhps [eax+10*4], xmm3 \
__asm movaps xmm5, xmm6 \
__asm movlps xmm3, [edi+8*4] \
__asm movhps xmm3, [edi+10*4] \
__asm movaps xmm4, xmm3 \
__asm shufps xmm5, xmm3, 0x5A \
__asm mulps xmm5, xmm0 \
__asm shufps xmm6, xmm3, 0xAF \
__asm mulps xmm6, xmm1 \
__asm addps xmm5, xmm6 \
__asm shufps xmm4, xmm4, 0xF0 \
__asm mulps xmm4, xmm2 \
__asm addps xmm4, xmm5 \
__asm movlps [eax+16*4], xmm4 \
__asm movhps [eax+22*4], xmm4 \
__asm movlps xmm6, [edi+12*4] \
__asm movhps xmm6, [edi+14*4] \
__asm movaps xmm5, xmm6 \
__asm movaps xmm4, xmm6 \
__asm shufps xmm6, xmm6, 0xF0 \
__asm mulps xmm6, xmm0 \
__asm movlps xmm3, [edi+16*4] \
__asm shufps xmm5, xmm3, 0x05 \
__asm mulps xmm5, xmm1 \
__asm addps xmm5, xmm6 \
__asm shufps xmm4, xmm3, 0x5A \
__asm mulps xmm4, xmm2 \
__asm addps xmm4, xmm5 \
__asm movlps [eax+28*4], xmm4 \
__asm movhps [eax+34*4], xmm4
MUL_Nx3_3x6_FIRST4COLUMNS_INIT
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
MUL_Nx3_3x6_LAST2COLUMNS_ROW6
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
m2Ptr++;
}
m1Ptr += 3;
}
break;
}
case 4: {
if ( !(l^6) ) {
switch( k ) {
case 4: { // 4x4 * 4x6
#define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4] \
__asm movlps xmm3, [esi+18*4] \
__asm movhps xmm3, [esi+20*4]
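// xmm0-xmm3 hold columns 0-3 of the four 4x6 m2 rows; columns 4-5 are
// handled by the LAST2COLUMNS macros.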
#define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \
__asm movss xmm4, [edi+row*16+0*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm4, xmm0 \
__asm movss xmm5, [edi+row*16+1*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm4, xmm5 \
__asm movss xmm6, [edi+row*16+2*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm4, xmm6 \
__asm movss xmm7, [edi+row*16+3*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm3 \
__asm addps xmm4, xmm7 \
__asm movlps [eax+row*24+0], xmm4 \
__asm movhps [eax+row*24+8], xmm4
#define MUL_Nx4_4x6_LAST2COLUMNS_INIT \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm movlps xmm2, [esi+16*4] \
__asm movlps xmm3, [esi+22*4] \
__asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
#define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \
__asm movlps xmm7, [edi+row*32+ 0*4] \
__asm movhps xmm7, [edi+row*32+ 4*4] \
__asm movaps xmm6, xmm7 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \
__asm mulps xmm6, xmm0 \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \
__asm mulps xmm7, xmm1 \
__asm addps xmm6, xmm7 \
__asm movlps xmm4, [edi+row*32+ 2*4] \
__asm movhps xmm4, [edi+row*32+ 6*4] \
__asm movaps xmm5, xmm4 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \
__asm mulps xmm4, xmm3 \
__asm addps xmm6, xmm4 \
__asm movlps [eax+row*48+ 4*4], xmm6 \
__asm movhps [eax+row*48+10*4], xmm6
MUL_Nx4_4x6_FIRST4COLUMNS_INIT
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx4_4x6_LAST2COLUMNS_INIT
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
return;
}
case 6: { // 6x4 * 4x6
MUL_Nx4_4x6_FIRST4COLUMNS_INIT
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
MUL_Nx4_4x6_LAST2COLUMNS_INIT
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
m1Ptr[3] * m2Ptr[3*l];
m2Ptr++;
}
m1Ptr += 4;
}
break;
}
case 5: {
if ( !(l^6) ) {
switch( k ) {
case 5: { // 5x5 * 5x6
#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4] \
__asm movlps xmm3, [esi+18*4] \
__asm movhps xmm3, [esi+20*4] \
__asm movlps xmm4, [esi+24*4] \
__asm movhps xmm4, [esi+26*4]
#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \
__asm movss xmm6, [edi+row*20+0*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm0 \
__asm movss xmm5, [edi+row*20+1*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+row*20+2*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+row*20+3*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm3 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+row*20+4*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm movlps [eax+row*24+0], xmm6 \
__asm movhps [eax+row*24+8], xmm6
#define MUL_Nx5_5x6_LAST2COLUMNS_INIT \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm movlps xmm2, [esi+16*4] \
__asm movlps xmm3, [esi+22*4] \
__asm movlps xmm4, [esi+28*4] \
__asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \
__asm movlps xmm7, [edi+row*40+ 0*4] \
__asm movhps xmm7, [edi+row*40+ 6*4] \
__asm movaps xmm6, xmm7 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \
__asm mulps xmm6, xmm0 \
__asm movaps xmm5, xmm7 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm6, xmm5 \
__asm movlps xmm7, [edi+row*40+ 2*4] \
__asm movhps xmm7, [edi+row*40+ 8*4] \
__asm movaps xmm5, xmm7 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm movaps xmm5, xmm7 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
__asm mulps xmm5, xmm3 \
__asm addps xmm6, xmm5 \
__asm movlps xmm5, [edi+row*40+ 4*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm movlps [eax+row*48+ 4*4], xmm6 \
__asm movhps [eax+row*48+10*4], xmm6
#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \
__asm movlps xmm6, [edi+20*4+0*4] \
__asm unpcklps xmm6, xmm6 \
__asm mulps xmm6, xmm0 \
__asm movlps xmm5, [edi+20*4+2*4] \
__asm unpcklps xmm5, xmm5 \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+20*4+4*4] \
__asm unpcklps xmm5, xmm5 \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm movhlps xmm7, xmm6 \
__asm addps xmm6, xmm7 \
__asm movlps [eax+row*24+4*4], xmm6
MUL_Nx5_5x6_FIRST4COLUMNS_INIT
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx5_5x6_LAST2COLUMNS_INIT
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
return;
}
case 6: { // 6x5 * 5x6
MUL_Nx5_5x6_FIRST4COLUMNS_INIT
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
MUL_Nx5_5x6_LAST2COLUMNS_INIT
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
m2Ptr++;
}
m1Ptr += 5;
}
break;
}
case 6: {
switch( k ) {
case 1: {
if ( !(l^1) ) { // 1x6 * 6x1
dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
return;
}
break;
}
case 2: {
if ( !(l^2) ) { // 2x6 * 6x2
#define MUL_Nx6_6x2_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movaps xmm1, [esi+16] \
__asm movaps xmm2, [esi+32]
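// The three movaps above pull the entire 6x2 m2 ( 12 floats ) into
// xmm0-xmm2; each ROW2 invocation then emits two rows of dst at once.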
#define MUL_Nx6_6x2_ROW2( row ) \
__asm movaps xmm7, [edi+row*48+0*4] \
__asm movaps xmm6, xmm7 \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm7, xmm0 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movaps xmm6, [edi+row*48+4*4] \
__asm movaps xmm5, xmm6 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm mulps xmm5, xmm0 \
__asm movaps xmm6, [edi+row*48+24+2*4] \
__asm movaps xmm4, xmm6 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm5, xmm6 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm mulps xmm4, xmm2 \
__asm addps xmm5, xmm4 \
__asm movaps xmm4, xmm5 \
__asm movhlps xmm5, xmm7 \
__asm movlhps xmm7, xmm4 \
__asm addps xmm7, xmm5 \
__asm movaps [eax+row*16], xmm7
MUL_Nx6_6x2_INIT
MUL_Nx6_6x2_ROW2( 0 )
return;
}
break;
}
case 3: {
if ( !(l^3) ) { // 3x6 * 6x3
#define MUL_Nx6_6x3_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movss xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 1*4] \
__asm movss xmm1, [esi+ 3*4] \
__asm movhps xmm1, [esi+ 4*4] \
__asm movss xmm2, [esi+ 6*4] \
__asm movhps xmm2, [esi+ 7*4] \
__asm movss xmm3, [esi+ 9*4] \
__asm movhps xmm3, [esi+10*4] \
__asm movss xmm4, [esi+12*4] \
__asm movhps xmm4, [esi+13*4] \
__asm movss xmm5, [esi+15*4] \
__asm movhps xmm5, [esi+16*4]
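// Each movss/movhps pair above places one 3-wide m2 row into lanes 0, 2
// and 3 of a register ( lane 1 stays unused ), matching the movss/movhps
// store pattern at the end of the ROW macro.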
#define MUL_Nx6_6x3_ROW( row ) \
__asm movss xmm7, [edi+row*24+0] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+row*24+4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+8] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+12] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+16] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+20] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movss [eax+row*12+0], xmm7 \
__asm movhps [eax+row*12+4], xmm7
MUL_Nx6_6x3_INIT
MUL_Nx6_6x3_ROW( 0 )
MUL_Nx6_6x3_ROW( 1 )
MUL_Nx6_6x3_ROW( 2 )
return;
}
break;
}
case 4: {
if ( !(l^4) ) { // 4x6 * 6x4
#define MUL_Nx6_6x4_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movaps xmm1, [esi+16] \
__asm movaps xmm2, [esi+32] \
__asm movaps xmm3, [esi+48] \
__asm movaps xmm4, [esi+64] \
__asm movaps xmm5, [esi+80]
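// xmm0-xmm5 hold the six rows of the 6x4 m2, one row per register, so each
// dst row is the sum of six broadcast multiplies.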
#define MUL_Nx6_6x4_ROW( row ) \
__asm movss xmm7, [edi+row*24+0] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+row*24+4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+8] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+12] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+16] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+20] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movaps [eax+row*16], xmm7
MUL_Nx6_6x4_INIT
MUL_Nx6_6x4_ROW( 0 )
MUL_Nx6_6x4_ROW( 1 )
MUL_Nx6_6x4_ROW( 2 )
MUL_Nx6_6x4_ROW( 3 )
return;
}
break;
}
case 5: {
if ( !(l^5) ) { // 5x6 * 6x5
#define MUL_Nx6_6x5_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movlps xmm1, [esi+20] \
__asm movhps xmm1, [esi+28] \
__asm movlps xmm2, [esi+40] \
__asm movhps xmm2, [esi+48] \
__asm movlps xmm3, [esi+60] \
__asm movhps xmm3, [esi+68] \
__asm movaps xmm4, [esi+80] \
__asm movlps xmm5, [esi+100] \
__asm movhps xmm5, [esi+108]
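// xmm0-xmm5 hold the first four columns of each 6x5 m2 row; the fifth
// column does not fill an SSE register and is accumulated on the x87 stack
// ( the fld/fmul/faddp sequences interleaved in the ROW macro below ).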
#define MUL_Nx6_6x5_ROW( row ) \
__asm movss xmm7, [edi+row*24+0] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm fld dword ptr [edi+(row*6+0)*4] \
__asm fmul dword ptr [esi+(4+0*5)*4] \
__asm movss xmm6, [edi+row*24+4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+1)*4] \
__asm fmul dword ptr [esi+(4+1*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+8] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+2)*4] \
__asm fmul dword ptr [esi+(4+2*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+12] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+3)*4] \
__asm fmul dword ptr [esi+(4+3*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+16] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+4)*4] \
__asm fmul dword ptr [esi+(4+4*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+20] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+5)*4] \
__asm fmul dword ptr [esi+(4+5*5)*4] \
__asm faddp st(1),st \
__asm fstp dword ptr [eax+(row*5+4)*4] \
__asm movlps [eax+row*20], xmm7 \
__asm movhps [eax+row*20+8], xmm7
MUL_Nx6_6x5_INIT
MUL_Nx6_6x5_ROW( 0 )
MUL_Nx6_6x5_ROW( 1 )
MUL_Nx6_6x5_ROW( 2 )
MUL_Nx6_6x5_ROW( 3 )
MUL_Nx6_6x5_ROW( 4 )
return;
}
break;
}
case 6: {
switch( l ) {
case 1: { // 6x6 * 6x1
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
movlps [eax], xmm0
movhps [eax+8], xmm0
movlps [eax+16], xmm2
}
return;
}
case 2: { // 6x6 * 6x2
MUL_Nx6_6x2_INIT
MUL_Nx6_6x2_ROW2( 0 )
MUL_Nx6_6x2_ROW2( 1 )
MUL_Nx6_6x2_ROW2( 2 )
return;
}
case 3: { // 6x6 * 6x3
MUL_Nx6_6x3_INIT
MUL_Nx6_6x3_ROW( 0 )
MUL_Nx6_6x3_ROW( 1 )
MUL_Nx6_6x3_ROW( 2 )
MUL_Nx6_6x3_ROW( 3 )
MUL_Nx6_6x3_ROW( 4 )
MUL_Nx6_6x3_ROW( 5 )
return;
}
case 4: { // 6x6 * 6x4
MUL_Nx6_6x4_INIT
MUL_Nx6_6x4_ROW( 0 )
MUL_Nx6_6x4_ROW( 1 )
MUL_Nx6_6x4_ROW( 2 )
MUL_Nx6_6x4_ROW( 3 )
MUL_Nx6_6x4_ROW( 4 )
MUL_Nx6_6x4_ROW( 5 )
return;
}
case 5: { // 6x6 * 6x5
MUL_Nx6_6x5_INIT
MUL_Nx6_6x5_ROW( 0 )
MUL_Nx6_6x5_ROW( 1 )
MUL_Nx6_6x5_ROW( 2 )
MUL_Nx6_6x5_ROW( 3 )
MUL_Nx6_6x5_ROW( 4 )
MUL_Nx6_6x5_ROW( 5 )
return;
}
case 6: { // 6x6 * 6x6
__asm {
mov ecx, dword ptr m2Ptr
movlps xmm3, qword ptr [ecx+72]
mov edx, dword ptr m1Ptr
// Loading first 4 columns (upper 4 rows) of m2Ptr.
movaps xmm0, xmmword ptr [ecx]
movlps xmm1, qword ptr [ecx+24]
movhps xmm1, qword ptr [ecx+32]
movaps xmm2, xmmword ptr [ecx+48]
movhps xmm3, qword ptr [ecx+80]
// Calculating first 4 elements in the first row of the destination matrix.
movss xmm4, dword ptr [edx]
movss xmm5, dword ptr [edx+4]
mov eax, dword ptr dstPtr
shufps xmm4, xmm4, 0
movss xmm6, dword ptr [edx+8]
shufps xmm5, xmm5, 0
movss xmm7, dword ptr [edx+12]
mulps xmm4, xmm0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm5, xmm4
mulps xmm7, xmm3
addps xmm6, xmm5
addps xmm7, xmm6
movaps xmmword ptr [eax], xmm7
// Calculating first 4 elements in the second row of the destination matrix.
movss xmm4, dword ptr [edx+24]
shufps xmm4, xmm4, 0
mulps xmm4, xmm0
movss xmm5, dword ptr [edx+28]
shufps xmm5, xmm5, 0
mulps xmm5, xmm1
movss xmm6, dword ptr [edx+32]
shufps xmm6, xmm6, 0
movss xmm7, dword ptr [edx+36]
shufps xmm7, xmm7, 0
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm7, xmm6
addps xmm5, xmm4
addps xmm7, xmm5
// Calculating first 4 elements in the third row of the destination matrix.
movss xmm4, dword ptr [edx+48]
movss xmm5, dword ptr [edx+52]
movlps qword ptr [eax+24], xmm7 ; save 2nd
movhps qword ptr [eax+32], xmm7 ; row
movss xmm6, dword ptr [edx+56]
movss xmm7, dword ptr [edx+60]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm5, xmm4
addps xmm7, xmm6
addps xmm7, xmm5
movaps xmmword ptr [eax+48], xmm7
// Calculating first 4 elements in the fourth row of the destination matrix.
movss xmm4, dword ptr [edx+72]
movss xmm5, dword ptr [edx+76]
movss xmm6, dword ptr [edx+80]
movss xmm7, dword ptr [edx+84]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm6, xmm4
addps xmm7, xmm6
movlps qword ptr [eax+72], xmm7
movhps qword ptr [eax+80], xmm7
// Calculating first 4 elements in the fifth row of the destination matrix.
movss xmm4, dword ptr [edx+96]
movss xmm5, dword ptr [edx+100]
movss xmm6, dword ptr [edx+104]
movss xmm7, dword ptr [edx+108]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm5, xmm4
addps xmm7, xmm6
addps xmm7, xmm5
movaps xmmword ptr [eax+96], xmm7
// Calculating first 4 elements in the sixth row of the destination matrix.
movss xmm4, dword ptr [edx+120]
movss xmm5, dword ptr [edx+124]
movss xmm6, dword ptr [edx+128]
movss xmm7, dword ptr [edx+132]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm6, xmm4
addps xmm7, xmm6
movhps qword ptr [eax+128], xmm7
movlps qword ptr [eax+120], xmm7
// Loading first 4 columns (lower 2 rows) of m2Ptr.
movlps xmm0, qword ptr [ecx+96]
movhps xmm0, qword ptr [ecx+104]
movlps xmm1, qword ptr [ecx+120]
movhps xmm1, qword ptr [ecx+128]
// Calculating first 4 elements in the first row of the destination matrix.
movss xmm2, dword ptr [edx+16]
shufps xmm2, xmm2, 0
movss xmm4, dword ptr [edx+40]
movss xmm3, dword ptr [edx+20]
movss xmm5, dword ptr [edx+44]
movaps xmm6, xmmword ptr [eax]
movlps xmm7, qword ptr [eax+24]
shufps xmm3, xmm3, 0
shufps xmm5, xmm5, 0
movhps xmm7, qword ptr [eax+32]
shufps xmm4, xmm4, 0
mulps xmm5, xmm1
mulps xmm2, xmm0
mulps xmm3, xmm1
mulps xmm4, xmm0
addps xmm6, xmm2
addps xmm7, xmm4
addps xmm7, xmm5
addps xmm6, xmm3
movlps qword ptr [eax+24], xmm7
movaps xmmword ptr [eax], xmm6
movhps qword ptr [eax+32], xmm7
// Calculating first 4 elements in the third row of the destination matrix.
movss xmm2, dword ptr [edx+64]
movss xmm4, dword ptr [edx+88]
movss xmm5, dword ptr [edx+92]
movss xmm3, dword ptr [edx+68]
movaps xmm6, xmmword ptr [eax+48]
movlps xmm7, qword ptr [eax+72]
movhps xmm7, qword ptr [eax+80]
shufps xmm2, xmm2, 0
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm3, xmm3, 0
mulps xmm2, xmm0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm3, xmm1
addps xmm6, xmm2
addps xmm6, xmm3
addps xmm7, xmm4
addps xmm7, xmm5
movlps qword ptr [eax+72], xmm7
movaps xmmword ptr [eax+48], xmm6
movhps qword ptr [eax+80], xmm7
// Calculating first 4 elements in the fifth row of the destination matrix.
movss xmm2, dword ptr [edx+112]
movss xmm3, dword ptr [edx+116]
movaps xmm6, xmmword ptr [eax+96]
shufps xmm2, xmm2, 0
shufps xmm3, xmm3, 0
mulps xmm2, xmm0
mulps xmm3, xmm1
addps xmm6, xmm2
addps xmm6, xmm3
movaps xmmword ptr [eax+96], xmm6
// Calculating first 4 elements in the sixth row of the destination matrix.
movss xmm4, dword ptr [edx+136]
movss xmm5, dword ptr [edx+140]
movhps xmm7, qword ptr [eax+128]
movlps xmm7, qword ptr [eax+120]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
addps xmm7, xmm4
addps xmm7, xmm5
// Calculating last 2 columns of the destination matrix.
movlps xmm0, qword ptr [ecx+16]
movhps xmm0, qword ptr [ecx+40]
movhps qword ptr [eax+128], xmm7
movlps qword ptr [eax+120], xmm7
movlps xmm2, qword ptr [ecx+64]
movhps xmm2, qword ptr [ecx+88]
movaps xmm3, xmm2
shufps xmm3, xmm3, 4Eh
movlps xmm4, qword ptr [ecx+112]
movhps xmm4, qword ptr [ecx+136]
movaps xmm5, xmm4
shufps xmm5, xmm5, 4Eh
movlps xmm6, qword ptr [edx]
movhps xmm6, qword ptr [edx+24]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movlps xmm6, qword ptr [edx+8]
movhps xmm6, qword ptr [edx+32]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movhps xmm6, qword ptr [edx+40]
movlps xmm6, qword ptr [edx+16]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+16], xmm7
movhps qword ptr [eax+40], xmm7
movlps xmm6, qword ptr [edx+48]
movhps xmm6, qword ptr [edx+72]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movhps xmm6, qword ptr [edx+80]
movlps xmm6, qword ptr [edx+56]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movlps xmm6, qword ptr [edx+64]
movhps xmm6, qword ptr [edx+88]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+64], xmm7
movhps qword ptr [eax+88], xmm7
movlps xmm6, qword ptr [edx+96]
movhps xmm6, qword ptr [edx+120]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movlps xmm6, qword ptr [edx+104]
movhps xmm6, qword ptr [edx+128]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movlps xmm6, qword ptr [edx+112]
movhps xmm6, qword ptr [edx+136]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+112], xmm7
movhps qword ptr [eax+136], xmm7
}
return;
}
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
m2Ptr++;
}
m1Ptr += 6;
}
break;
}
default: {
for ( i = 0; i < k; i++ ) {
for ( j = 0; j < l; j++ ) {
m2Ptr = m2.ToFloatPtr() + j;
sum = m1Ptr[0] * m2Ptr[0];
for ( n = 1; n < m1.GetNumColumns(); n++ ) {
m2Ptr += l;
sum += m1Ptr[n] * m2Ptr[0];
}
*dstPtr++ = sum;
}
m1Ptr += m1.GetNumColumns();
}
break;
}
}
}
/*
============
idSIMD_SSE::MatX_TransposeMultiplyMatX
optimizes the following transpose matrix multiplications:
Nx6 * NxN
6xN * 6x6
with N in the range [1-6].
============
*/
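// Scalar reference ( m1 enters transposed, cf. the generic path at the end
// of the function ):
//
//	dst[i][j] = sum( m1[r][i] * m2[r][j] ) over r = 0 .. m1.GetNumRows()-1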
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
int i, j, k, l, n;
float *dstPtr;
const float *m1Ptr, *m2Ptr;
double sum;
assert( m1.GetNumRows() == m2.GetNumRows() );
m1Ptr = m1.ToFloatPtr();
m2Ptr = m2.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
k = m1.GetNumColumns();
l = m2.GetNumColumns();
switch( m1.GetNumRows() ) {
case 1:
if ( !((k^6)|(l^1)) ) { // 1x6 * 1x1
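// !((k^6)|(l^1)) folds both equality tests into one branch: the OR of the
// XORs is zero only when k == 6 and l == 1.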
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
movaps [eax], xmm0
movlps [eax+16], xmm1
}
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
m2Ptr++;
}
m1Ptr++;
}
break;
case 2:
if ( !((k^6)|(l^2)) ) { // 2x6 * 2x2
#define MUL_2xN_2x2_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi] \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm movlps xmm1, [esi+8] \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
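// xmm0 and xmm1 each hold one 2x2 m2 row duplicated into both halves, so
// every ROW2 invocation computes two rows of dst with a single movaps.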
#define MUL_2xN_2x2_ROW2( N, row ) \
__asm movlps xmm6, [edi+(row+0*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movlps xmm7, [edi+(row+1*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm0 \
__asm mulps xmm7, xmm1 \
__asm addps xmm6, xmm7 \
__asm movaps [eax+(row*2)*4], xmm6
MUL_2xN_2x2_INIT
MUL_2xN_2x2_ROW2( 6, 0 )
MUL_2xN_2x2_ROW2( 6, 2 )
MUL_2xN_2x2_ROW2( 6, 4 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 3:
if ( !((k^6)|(l^3)) ) { // 3x6 * 3x3
#define MUL_3xN_3x3_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movss xmm0, [esi+(0*3+0)*4] \
__asm movhps xmm0, [esi+(0*3+1)*4] \
__asm movss xmm1, [esi+(1*3+0)*4] \
__asm movhps xmm1, [esi+(1*3+1)*4] \
__asm movss xmm2, [esi+(2*3+0)*4] \
__asm movhps xmm2, [esi+(2*3+1)*4]
#define MUL_3xN_3x3_INIT_ROW4 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
#define MUL_3xN_3x3_ROW4( N, row ) \
__asm movlps xmm3, [edi+(row+0*N+0)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \
__asm movlps xmm4, [edi+(row+1*N+0)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 ) \
__asm movlps xmm5, [edi+(row+2*N+0)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movaps [eax+(row*3+0)*4], xmm3 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm movlps xmm3, [edi+(row+0*N+1)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movlps xmm4, [edi+(row+1*N+1)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movlps xmm5, [edi+(row+2*N+1)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movaps [eax+(row*3+4)*4], xmm3 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm movlps xmm3, [edi+(row+0*N+2)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 ) \
__asm movlps xmm4, [edi+(row+1*N+2)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 ) \
__asm movlps xmm5, [edi+(row+2*N+2)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movaps [eax+(row*3+8)*4], xmm3
#define MUL_3xN_3x3_INIT_ROW4_ROW4 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
#define MUL_3xN_3x3_INIT_ROW4_ROW \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
#define MUL_3xN_3x3_ROW( N, row ) \
__asm movss xmm3, [edi+(row+0*N)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm movss xmm4, [edi+(row+1*N)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm movss xmm5, [edi+(row+2*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movss [eax+(row*3+0)*4], xmm3 \
__asm movhps [eax+(row*3+1)*4], xmm3
MUL_3xN_3x3_INIT
MUL_3xN_3x3_INIT_ROW4
MUL_3xN_3x3_ROW4( 6, 0 )
MUL_3xN_3x3_INIT_ROW4_ROW
MUL_3xN_3x3_ROW( 6, 4 )
MUL_3xN_3x3_ROW( 6, 5 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 4:
if ( !((k^6)|(l^4)) ) { // 4x6 * 4x4
#define MUL_4xN_4x4_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movaps xmm1, [esi+16] \
__asm movaps xmm2, [esi+32] \
__asm movaps xmm3, [esi+48]
#define MUL_4xN_4x4_ROW( N, row ) \
__asm movss xmm7, [edi+(row+0*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+(row+1*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+2*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+3*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movaps [eax+row*16], xmm7
MUL_4xN_4x4_INIT
MUL_4xN_4x4_ROW( 6, 0 )
MUL_4xN_4x4_ROW( 6, 1 )
MUL_4xN_4x4_ROW( 6, 2 )
MUL_4xN_4x4_ROW( 6, 3 )
MUL_4xN_4x4_ROW( 6, 4 )
MUL_4xN_4x4_ROW( 6, 5 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
m1Ptr[3*k] * m2Ptr[3*l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 5:
if ( !((k^6)|(l^5)) ) { // 5x6 * 5x5
#define MUL_5xN_5x5_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 5*4] \
__asm movhps xmm1, [esi+ 7*4] \
__asm movlps xmm2, [esi+10*4] \
__asm movhps xmm2, [esi+12*4] \
__asm movlps xmm3, [esi+15*4] \
__asm movhps xmm3, [esi+17*4] \
__asm movlps xmm4, [esi+20*4] \
__asm movhps xmm4, [esi+22*4]
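// As in the 6x5 multiply above, the fifth m2 column is accumulated on the
// x87 stack inside the ROW macro and stored with fstp.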
#define MUL_5xN_5x5_ROW( N, row ) \
__asm movss xmm6, [edi+(row+0*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm0 \
__asm fld dword ptr [edi+(row+0*N)*4] \
__asm fmul dword ptr [esi+ 4*4] \
__asm movss xmm5, [edi+(row+1*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+1*N)*4] \
__asm fmul dword ptr [esi+ 9*4] \
__asm faddp st(1),st \
__asm movss xmm5, [edi+(row+2*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+2*N)*4] \
__asm fmul dword ptr [esi+14*4] \
__asm faddp st(1),st \
__asm movss xmm5, [edi+(row+3*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm3 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+3*N)*4] \
__asm fmul dword ptr [esi+19*4] \
__asm faddp st(1),st \
__asm movss xmm5, [edi+(row+4*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+4*N)*4] \
__asm fmul dword ptr [esi+24*4] \
__asm faddp st(1),st \
__asm fstp dword ptr [eax+(row*5+4)*4] \
__asm movlps [eax+(row*5+0)*4], xmm6 \
__asm movhps [eax+(row*5+2)*4], xmm6
MUL_5xN_5x5_INIT
MUL_5xN_5x5_ROW( 6, 0 )
MUL_5xN_5x5_ROW( 6, 1 )
MUL_5xN_5x5_ROW( 6, 2 )
MUL_5xN_5x5_ROW( 6, 3 )
MUL_5xN_5x5_ROW( 6, 4 )
MUL_5xN_5x5_ROW( 6, 5 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 6:
if ( !(l^6) ) {
switch( k ) {
case 1: { // 6x1 * 6x6
#define MUL_6xN_6x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4] \
__asm movlps xmm3, [esi+18*4] \
__asm movhps xmm3, [esi+20*4] \
__asm movlps xmm4, [esi+24*4] \
__asm movhps xmm4, [esi+26*4] \
__asm movlps xmm5, [esi+30*4] \
__asm movhps xmm5, [esi+32*4]
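// xmm0-xmm5 hold columns 0-3 of the six 6x6 m2 rows; LAST2COLUMNS_INIT
// reloads them with column pairs 4-5 duplicated into both register halves.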
#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \
__asm movss xmm7, [edi+(row+0*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+(row+1*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+2*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+3*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+4*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+5*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movlps [eax+(row*6+0)*4], xmm7 \
__asm movhps [eax+(row*6+2)*4], xmm7
#define MUL_6xN_6x6_LAST2COLUMNS_INIT \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm movlps xmm2, [esi+16*4] \
__asm movlps xmm3, [esi+22*4] \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm movlps xmm4, [esi+28*4] \
__asm movlps xmm5, [esi+34*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \
__asm movlps xmm7, [edi+(row*2+0*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm7, xmm0 \
__asm movlps xmm6, [edi+(row*2+1*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+2*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+3*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+4*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+5*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movlps [eax+(row*12+ 4)*4], xmm7 \
__asm movhps [eax+(row*12+10)*4], xmm7
#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \
__asm movss xmm7, [edi+(1*N-1)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+(2*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(3*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(4*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(5*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(6*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movlps [eax+(row*6+4)*4], xmm7
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
return;
}
case 2: { // 6x2 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
return;
}
case 3: { // 6x3 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
return;
}
case 4: { // 6x4 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
return;
}
case 5: { // 6x5 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
return;
}
case 6: { // 6x6 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
m2Ptr++;
}
m1Ptr++;
}
break;
default:
for ( i = 0; i < k; i++ ) {
for ( j = 0; j < l; j++ ) {
m1Ptr = m1.ToFloatPtr() + i;
m2Ptr = m2.ToFloatPtr() + j;
sum = m1Ptr[0] * m2Ptr[0];
for ( n = 1; n < m1.GetNumRows(); n++ ) {
m1Ptr += k;
m2Ptr += l;
sum += m1Ptr[0] * m2Ptr[0];
}
*dstPtr++ = sum;
}
}
break;
}
}
/*
============
idSIMD_SSE::MatX_LowerTriangularSolve
solves x in Lx = b for the n * n sub-matrix of L
if skip > 0 the first skip elements of x are assumed to be valid already
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
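/*
	For reference, a minimal scalar equivalent of this routine ( same semantics, no
	SIMD; nc is the row stride of L in floats ):

		for ( int i = skip; i < n; i++ ) {
			float sum = b[i];
			const float *row = L.ToFloatPtr() + i * nc;
			for ( int j = 0; j < i; j++ ) {
				sum -= row[j] * x[j];
			}
			x[i] = sum;		// the diagonal is an implicit 1.0f, so no divide
		}
*/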
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
int nc;
const float *lptr;
if ( skip >= n ) {
return;
}
lptr = L.ToFloatPtr();
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
#define NSKIP( n, s ) ((n<<3)|(s&7))
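// NSKIP packs ( n, skip ) into a single switch key: n in the upper bits, skip in the
// low three bits; the cases below deliberately fall through so that entering at
// ( n, skip ) computes x[skip..n-1]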
switch( NSKIP( n, skip ) ) {
case NSKIP( 1, 0 ): x[0] = b[0];
return;
case NSKIP( 2, 0 ): x[0] = b[0];
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
return;
case NSKIP( 3, 0 ): x[0] = b[0];
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
return;
case NSKIP( 4, 0 ): x[0] = b[0];
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
return;
case NSKIP( 5, 0 ): x[0] = b[0];
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
return;
case NSKIP( 6, 0 ): x[0] = b[0];
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
return;
case NSKIP( 7, 0 ): x[0] = b[0];
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
return;
}
return;
}
// process first 4 rows
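// ( deliberate fall-through: entering at case 'skip' completes rows skip..3, after
// which skip is set to 4 )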
switch( skip ) {
case 0: x[0] = b[0];
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
skip = 4;
}
lptr = L[skip];
// this code assumes n > 4
__asm {
push ebx
mov eax, skip // eax = i
shl eax, 2 // eax = i*4
mov edx, n // edx = n
shl edx, 2 // edx = n*4
mov esi, x // esi = x
mov edi, lptr // edi = lptr
add esi, eax
add edi, eax
mov ebx, b // ebx = b
// check for aligned memory
mov ecx, nc
shl ecx, 2
or ecx, esi
or ecx, edi
and ecx, 15
jnz loopurow
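// both the aligned and unaligned loops below compute
// dot = dot( L[i][0..i-1], x[0..i-1] ) in four- and eight-float blocks, finish the
// 0-3 float remainder with scalar loads, then store x[i] = b[i] - dot and advance
// to the next row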
// aligned
looprow:
mov ecx, eax
neg ecx
movaps xmm0, [esi+ecx]
mulps xmm0, [edi+ecx]
add ecx, 12*4
jg donedot8
dot8:
movaps xmm1, [esi+ecx-(8*4)]
mulps xmm1, [edi+ecx-(8*4)]
addps xmm0, xmm1
movaps xmm3, [esi+ecx-(4*4)]
mulps xmm3, [edi+ecx-(4*4)]
addps xmm0, xmm3
add ecx, 8*4
jle dot8
donedot8:
sub ecx, 4*4
jg donedot4
//dot4:
movaps xmm1, [esi+ecx-(4*4)]
mulps xmm1, [edi+ecx-(4*4)]
addps xmm0, xmm1
add ecx, 4*4
donedot4:
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
sub ecx, 4*4
jz dot0
add ecx, 4
jz dot1
add ecx, 4
jz dot2
//dot3:
movss xmm1, [esi-(3*4)]
mulss xmm1, [edi-(3*4)]
addss xmm0, xmm1
dot2:
movss xmm3, [esi-(2*4)]
mulss xmm3, [edi-(2*4)]
addss xmm0, xmm3
dot1:
movss xmm5, [esi-(1*4)]
mulss xmm5, [edi-(1*4)]
addss xmm0, xmm5
dot0:
movss xmm1, [ebx+eax]
subss xmm1, xmm0
movss [esi], xmm1
add eax, 4
cmp eax, edx
jge done
add esi, 4
mov ecx, nc
shl ecx, 2
add edi, ecx
add edi, 4
jmp looprow
// unaligned
loopurow:
mov ecx, eax
neg ecx
movups xmm0, [esi+ecx]
movups xmm1, [edi+ecx]
mulps xmm0, xmm1
add ecx, 12*4
jg doneudot8
udot8:
movups xmm1, [esi+ecx-(8*4)]
movups xmm2, [edi+ecx-(8*4)]
mulps xmm1, xmm2
addps xmm0, xmm1
movups xmm3, [esi+ecx-(4*4)]
movups xmm4, [edi+ecx-(4*4)]
mulps xmm3, xmm4
addps xmm0, xmm3
add ecx, 8*4
jle udot8
doneudot8:
sub ecx, 4*4
jg doneudot4
//udot4:
movups xmm1, [esi+ecx-(4*4)]
movups xmm2, [edi+ecx-(4*4)]
mulps xmm1, xmm2
addps xmm0, xmm1
add ecx, 4*4
doneudot4:
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
sub ecx, 4*4
jz udot0
add ecx, 4
jz udot1
add ecx, 4
jz udot2
//udot3:
movss xmm1, [esi-(3*4)]
movss xmm2, [edi-(3*4)]
mulss xmm1, xmm2
addss xmm0, xmm1
udot2:
movss xmm3, [esi-(2*4)]
movss xmm4, [edi-(2*4)]
mulss xmm3, xmm4
addss xmm0, xmm3
udot1:
movss xmm5, [esi-(1*4)]
movss xmm6, [edi-(1*4)]
mulss xmm5, xmm6
addss xmm0, xmm5
udot0:
movss xmm1, [ebx+eax]
subss xmm1, xmm0
movss [esi], xmm1
add eax, 4
cmp eax, edx
jge done
add esi, 4
mov ecx, nc
shl ecx, 2
add edi, ecx
add edi, 4
jmp loopurow
done:
pop ebx
}
}
/*
============
idSIMD_SSE::MatX_LowerTriangularSolveTranspose
solves x in L'x = b for the n * n sub-matrix of L
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
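/*
	For reference, a minimal scalar equivalent ( back substitution against L' with the
	implicit unit diagonal; nc is the row stride of L in floats ):

		for ( int i = n - 1; i >= 0; i-- ) {
			float sum = b[i];
			for ( int j = i + 1; j < n; j++ ) {
				sum -= L.ToFloatPtr()[j*nc+i] * x[j];
			}
			x[i] = sum;
		}
*/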
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
int nc;
const float *lptr;
lptr = L.ToFloatPtr();
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
switch( n ) {
case 0:
return;
case 1:
x[0] = b[0];
return;
case 2:
x[1] = b[1];
x[0] = b[0] - lptr[1*nc+0] * x[1];
return;
case 3:
x[2] = b[2];
x[1] = b[1] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 4:
x[3] = b[3];
x[2] = b[2] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 5:
x[4] = b[4];
x[3] = b[3] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 6:
x[5] = b[5];
x[4] = b[4] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 7:
x[6] = b[6];
x[5] = b[5] - lptr[6*nc+5] * x[6];
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
}
return;
}
#if 1
int i, j, m;
float *xptr;
double s0;
// if the number of columns is not a multiple of 2 we're screwed for alignment.
// however, if the number of columns is a multiple of 2 but the number of rows to be
// processed is not a multiple of 2 we can still run 8 byte aligned
m = n;
if ( m & 1 ) {
m--;
x[m] = b[m];
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows_1:
movlps xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
movhps xmm0, [ebx+eax*4-8] // load b[i-2], b[i-1]
xor ecx, ecx
sub eax, m
neg eax
jz done4x4_1
process4x4_1: // process 4x4 blocks
movlps xmm2, [edi+0]
movhps xmm2, [edi+8]
add edi, edx
movss xmm1, [esi+4*ecx+0]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm3, [edi+0]
movhps xmm3, [edi+8]
add edi, edx
mulps xmm1, xmm2
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+0]
movhps xmm4, [edi+8]
add edi, edx
mulps xmm1, xmm3
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+8]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm5, [edi+0]
movhps xmm5, [edi+8]
add edi, edx
mulps xmm1, xmm4
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+12]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
add ecx, 4
cmp ecx, eax
mulps xmm1, xmm5
subps xmm0, xmm1
jl process4x4_1
done4x4_1: // process the leftover of the 4 rows
movlps xmm2, [edi+0]
movhps xmm2, [edi+8]
movss xmm1, [esi+4*ecx]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm2
subps xmm0, xmm1
imul ecx, edx
sub edi, ecx
neg eax
add eax, m
sub eax, 4
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
sub edi, edx
movss [esi-4], xmm3 // xptr[-1] = s3
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
subss xmm2, xmm3
movss [esi-8], xmm2 // xptr[-2] = s2
movss xmm6, xmm2
sub edi, edx
subss xmm0, xmm5
subss xmm1, xmm4
mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
subss xmm1, xmm2
movss [esi-12], xmm1 // xptr[-3] = s1
subss xmm0, xmm6
sub edi, edx
cmp eax, 4
mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
subss xmm0, xmm1
movss [esi-16], xmm0 // xptr[-4] = s0
jl done4rows_1
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows_1
done4rows_1:
pop ebx
}
} else {
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows:
movlps xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
movhps xmm0, [ebx+eax*4-8] // load b[i-2], b[i-1]
sub eax, m
jz done4x4
neg eax
xor ecx, ecx
process4x4: // process 4x4 blocks
movlps xmm2, [edi+0]
movhps xmm2, [edi+8]
add edi, edx
movss xmm1, [esi+4*ecx+0]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm3, [edi+0]
movhps xmm3, [edi+8]
add edi, edx
mulps xmm1, xmm2
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+0]
movhps xmm4, [edi+8]
add edi, edx
mulps xmm1, xmm3
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+8]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm5, [edi+0]
movhps xmm5, [edi+8]
add edi, edx
mulps xmm1, xmm4
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+12]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
add ecx, 4
cmp ecx, eax
mulps xmm1, xmm5
subps xmm0, xmm1
jl process4x4
imul ecx, edx
sub edi, ecx
neg eax
done4x4: // process the leftover of the 4 rows
add eax, m
sub eax, 4
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
sub edi, edx
movss [esi-4], xmm3 // xptr[-1] = s3
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
subss xmm2, xmm3
movss [esi-8], xmm2 // xptr[-2] = s2
movss xmm6, xmm2
sub edi, edx
subss xmm0, xmm5
subss xmm1, xmm4
mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
subss xmm1, xmm2
movss [esi-12], xmm1 // xptr[-3] = s1
subss xmm0, xmm6
sub edi, edx
cmp eax, 4
mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
subss xmm0, xmm1
movss [esi-16], xmm0 // xptr[-4] = s0
jl done4rows
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows
done4rows:
pop ebx
}
}
// process leftover rows
for ( i = (m&3)-1; i >= 0; i-- ) {
s0 = b[i];
lptr = L[0] + i;
for ( j = i + 1; j < n; j++ ) {
s0 -= lptr[j*nc] * x[j];
}
x[i] = s0;
}
#else
int i, j, m;
double s0, s1, s2, s3, t;
const float *lptr2;
float *xptr, *xptr2;
m = n;
if ( m & 1 ) {
m--;
x[m] = b[m];
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
// process 4 rows at a time
for ( i = m; i >= 4; i -= 4 ) {
s0 = b[i-4];
s1 = b[i-3];
s2 = b[i-2];
s3 = b[i-1];
// process 4x4 blocks
xptr2 = xptr; // x + i;
lptr2 = lptr; // ptr = L[i] + i - 4;
for ( j = 0; j < m-i; j += 4 ) {
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
}
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
// process the leftover of the 4 rows
lptr -= nc;
s0 -= lptr[0] * s3;
s1 -= lptr[1] * s3;
s2 -= lptr[2] * s3;
lptr -= nc;
s0 -= lptr[0] * s2;
s1 -= lptr[1] * s2;
lptr -= nc;
s0 -= lptr[0] * s1;
lptr -= nc;
// store result
xptr[-4] = s0;
xptr[-3] = s1;
xptr[-2] = s2;
xptr[-1] = s3;
// update pointers for next four rows
lptr -= 4;
xptr -= 4;
}
} else {
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
// process 4 rows at a time
for ( i = m; i >= 4; i -= 4 ) {
s0 = b[i-4];
s1 = b[i-3];
s2 = b[i-2];
s3 = b[i-1];
// process 4x4 blocks
xptr2 = xptr; // x + i;
lptr2 = lptr; // ptr = L[i] + i - 4;
for ( j = 0; j < m-i; j += 4 ) {
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
}
// process the leftover of the 4 rows
lptr -= nc;
s0 -= lptr[0] * s3;
s1 -= lptr[1] * s3;
s2 -= lptr[2] * s3;
lptr -= nc;
s0 -= lptr[0] * s2;
s1 -= lptr[1] * s2;
lptr -= nc;
s0 -= lptr[0] * s1;
lptr -= nc;
// store result
xptr[-4] = s0;
xptr[-3] = s1;
xptr[-2] = s2;
xptr[-1] = s3;
// update pointers for next four rows
lptr -= 4;
xptr -= 4;
}
}
// process leftover rows
for ( i--; i >= 0; i-- ) {
s0 = b[i];
lptr = L[0] + i;
for ( j = i + 1; j < m; j++ ) {
s0 -= lptr[j*nc] * x[j];
}
x[i] = s0;
}
#endif
}
/*
============
idSIMD_SSE::MatX_LDLTFactor
in-place factorization LDL' of the n * n sub-matrix of mat
the reciprocals of the diagonal elements are stored in invDiag
currently assumes the number of columns of mat is a multiple of 4
============
*/
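/*
	For reference, a minimal scalar sketch of the factorization ( the unrolled and SIMD
	code below computes the same thing blockwise; v and diag are scratch arrays ):

		for ( int i = 0; i < n; i++ ) {
			float *row = mat[i];
			double sum = row[i];
			for ( int k = 0; k < i; k++ ) {
				v[k] = diag[k] * row[k];			// v = D * L[i][0..i-1]
				sum -= v[k] * row[k];
			}
			if ( sum == 0.0f ) {
				return false;
			}
			diag[i] = row[i] = sum;
			invDiag[i] = d = 1.0f / sum;
			for ( int j = i + 1; j < n; j++ ) {		// update column i below the diagonal
				double s = mat[j][i];
				for ( int k = 0; k < i; k++ ) {
					s -= mat[j][k] * v[k];
				}
				mat[j][i] = s * d;
			}
		}
*/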
bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
#if 1
int j, nc;
float *v, *diag, *invDiagPtr, *mptr;
double s0, s1, s2, sum, d;
v = (float *) _alloca16( n * sizeof( float ) );
diag = (float *) _alloca16( n * sizeof( float ) );
invDiagPtr = invDiag.ToFloatPtr();
nc = mat.GetNumColumns();
assert( ( nc & 3 ) == 0 );
if ( n <= 0 ) {
return true;
}
mptr = mat[0];
sum = mptr[0];
if ( sum == 0.0f ) {
return false;
}
diag[0] = sum;
invDiagPtr[0] = d = 1.0f / sum;
if ( n <= 1 ) {
return true;
}
mptr = mat[0];
for ( j = 1; j < n; j++ ) {
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
}
mptr = mat[1];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
sum = mptr[1] - s0;
if ( sum == 0.0f ) {
return false;
}
mat[1][1] = sum;
diag[1] = sum;
invDiagPtr[1] = d = 1.0f / sum;
if ( n <= 2 ) {
return true;
}
mptr = mat[0];
for ( j = 2; j < n; j++ ) {
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
}
mptr = mat[2];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
sum = mptr[2] - s0 - s1;
if ( sum == 0.0f ) {
return false;
}
mat[2][2] = sum;
diag[2] = sum;
invDiagPtr[2] = d = 1.0f / sum;
if ( n <= 3 ) {
return true;
}
mptr = mat[0];
for ( j = 3; j < n; j++ ) {
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
}
mptr = mat[3];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
sum = mptr[3] - s0 - s1 - s2;
if ( sum == 0.0f ) {
return false;
}
mat[3][3] = sum;
diag[3] = sum;
invDiagPtr[3] = d = 1.0f / sum;
if ( n <= 4 ) {
return true;
}
mptr = mat[0];
for ( j = 4; j < n; j++ ) {
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
}
int ncf = nc * sizeof( float );
mptr = mat[0];
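// for each row i >= 4 the loop below computes v[k] = diag[k] * mat[i][k] together
// with the dot product sum = dot( v[0..i-1], mat[i][0..i-1] ), stores the new
// diagonal mat[i][i] = diag[i] = mat[i][i] - sum along with its refined reciprocal,
// and then scales the remainder of column i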
__asm {
xorps xmm2, xmm2
xorps xmm3, xmm3
xorps xmm4, xmm4
push ebx
mov ebx, 4
loopRow:
cmp ebx, n
jge done
mov ecx, ebx // ecx = i
shl ecx, 2 // ecx = i * 4
mov edx, diag // edx = diag
add edx, ecx // edx = &diag[i]
mov edi, ebx // edi = i
imul edi, ncf // edi = i * nc * sizeof( float )
add edi, mptr // edi = mat[i]
add edi, ecx // edi = &mat[i][i]
mov esi, v // esi = v
add esi, ecx // esi = &v[i]
mov eax, invDiagPtr // eax = invDiagPtr
add eax, ecx // eax = &invDiagPtr[i]
neg ecx
movaps xmm0, [edx+ecx]
mulps xmm0, [edi+ecx]
movaps [esi+ecx], xmm0
mulps xmm0, [edi+ecx]
add ecx, 12*4
jg doneDot8
dot8:
movaps xmm1, [edx+ecx-(8*4)]
mulps xmm1, [edi+ecx-(8*4)]
movaps [esi+ecx-(8*4)], xmm1
mulps xmm1, [edi+ecx-(8*4)]
addps xmm0, xmm1
movaps xmm2, [edx+ecx-(4*4)]
mulps xmm2, [edi+ecx-(4*4)]
movaps [esi+ecx-(4*4)], xmm2
mulps xmm2, [edi+ecx-(4*4)]
addps xmm0, xmm2
add ecx, 8*4
jle dot8
doneDot8:
sub ecx, 4*4
jg doneDot4
movaps xmm1, [edx+ecx-(4*4)]
mulps xmm1, [edi+ecx-(4*4)]
movaps [esi+ecx-(4*4)], xmm1
mulps xmm1, [edi+ecx-(4*4)]
addps xmm0, xmm1
add ecx, 4*4
doneDot4:
sub ecx, 2*4
jg doneDot2
movlps xmm3, [edx+ecx-(2*4)]
movlps xmm4, [edi+ecx-(2*4)]
mulps xmm3, xmm4
movlps [esi+ecx-(2*4)], xmm3
mulps xmm3, xmm4
addps xmm0, xmm3
add ecx, 2*4
doneDot2:
sub ecx, 1*4
jg doneDot1
movss xmm3, [edx+ecx-(1*4)]
movss xmm4, [edi+ecx-(1*4)]
mulss xmm3, xmm4
movss [esi+ecx-(1*4)], xmm3
mulss xmm3, xmm4
addss xmm0, xmm3
doneDot1:
movhlps xmm2, xmm0
addps xmm0, xmm2
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm2
movss xmm1, [edi]
subss xmm1, xmm0
movss [edi], xmm1 // mptr[i] = sum;
movss [edx], xmm1 // diag[i] = sum;
// guard the division: if sum == 0.0f substitute a tiny number ( the scalar reference returns false here )
movaps xmm2, xmm1
cmpeqss xmm2, SIMD_SP_zero
andps xmm2, SIMD_SP_tiny
orps xmm1, xmm2
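// refine the rcpss estimate with one Newton-Raphson step: x1 = x0 * ( 2 - sum * x0 )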
rcpss xmm7, xmm1
mulss xmm1, xmm7
mulss xmm1, xmm7
addss xmm7, xmm7
subss xmm7, xmm1
movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum;
mov edx, n // edx = n
sub edx, ebx // edx = n - i
dec edx // edx = n - i - 1
jle doneSubRow // if ( i + 1 >= n ) return true;
mov eax, ebx // eax = i
shl eax, 2 // eax = i * 4
neg eax
loopSubRow:
add edi, ncf
mov ecx, eax
movaps xmm0, [esi+ecx]
mulps xmm0, [edi+ecx]
add ecx, 12*4
jg doneSubDot8
subDot8:
movaps xmm1, [esi+ecx-(8*4)]
mulps xmm1, [edi+ecx-(8*4)]
addps xmm0, xmm1
movaps xmm2, [esi+ecx-(4*4)]
mulps xmm2, [edi+ecx-(4*4)]
addps xmm0, xmm2
add ecx, 8*4
jle subDot8
doneSubDot8:
sub ecx, 4*4
jg doneSubDot4
movaps xmm1, [esi+ecx-(4*4)]
mulps xmm1, [edi+ecx-(4*4)]
addps xmm0, xmm1
add ecx, 4*4
doneSubDot4:
sub ecx, 2*4
jg doneSubDot2
movlps xmm3, [esi+ecx-(2*4)]
movlps xmm4, [edi+ecx-(2*4)]
mulps xmm3, xmm4
addps xmm0, xmm3
add ecx, 2*4
doneSubDot2:
sub ecx, 1*4
jg doneSubDot1
movss xmm3, [esi+ecx-(1*4)]
movss xmm4, [edi+ecx-(1*4)]
mulss xmm3, xmm4
addss xmm0, xmm3
doneSubDot1:
movhlps xmm2, xmm0
addps xmm0, xmm2
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm2
movss xmm1, [edi]
subss xmm1, xmm0
mulss xmm1, xmm7
movss [edi], xmm1
dec edx
jg loopSubRow
doneSubRow:
inc ebx
jmp loopRow
done:
pop ebx
}
return true;
#else
int i, j, k, nc;
float *v, *diag, *mptr;
double s0, s1, s2, s3, sum, d;
v = (float *) _alloca16( n * sizeof( float ) );
diag = (float *) _alloca16( n * sizeof( float ) );
nc = mat.GetNumColumns();
if ( n <= 0 ) {
return true;
}
mptr = mat[0];
sum = mptr[0];
if ( sum == 0.0f ) {
return false;
}
diag[0] = sum;
invDiag[0] = d = 1.0f / sum;
if ( n <= 1 ) {
return true;
}
mptr = mat[0];
for ( j = 1; j < n; j++ ) {
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
}
mptr = mat[1];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
sum = mptr[1] - s0;
if ( sum == 0.0f ) {
return false;
}
mat[1][1] = sum;
diag[1] = sum;
invDiag[1] = d = 1.0f / sum;
if ( n <= 2 ) {
return true;
}
mptr = mat[0];
for ( j = 2; j < n; j++ ) {
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
}
mptr = mat[2];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
sum = mptr[2] - s0 - s1;
if ( sum == 0.0f ) {
return false;
}
mat[2][2] = sum;
diag[2] = sum;
invDiag[2] = d = 1.0f / sum;
if ( n <= 3 ) {
return true;
}
mptr = mat[0];
for ( j = 3; j < n; j++ ) {
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
}
mptr = mat[3];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
sum = mptr[3] - s0 - s1 - s2;
if ( sum == 0.0f ) {
return false;
}
mat[3][3] = sum;
diag[3] = sum;
invDiag[3] = d = 1.0f / sum;
if ( n <= 4 ) {
return true;
}
mptr = mat[0];
for ( j = 4; j < n; j++ ) {
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
}
for ( i = 4; i < n; i++ ) {
mptr = mat[i];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
for ( k = 4; k < i-3; k += 4 ) {
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
}
switch( i - k ) {
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
}
sum = s3;
sum += s2;
sum += s1;
sum += s0;
sum = mptr[i] - sum;
if ( sum == 0.0f ) {
return false;
}
mat[i][i] = sum;
diag[i] = sum;
invDiag[i] = d = 1.0f / sum;
if ( i + 1 >= n ) {
return true;
}
mptr = mat[i+1];
for ( j = i+1; j < n; j++ ) {
s0 = mptr[0] * v[0];
s1 = mptr[1] * v[1];
s2 = mptr[2] * v[2];
s3 = mptr[3] * v[3];
for ( k = 4; k < i-7; k += 8 ) {
s0 += mptr[k+0] * v[k+0];
s1 += mptr[k+1] * v[k+1];
s2 += mptr[k+2] * v[k+2];
s3 += mptr[k+3] * v[k+3];
s0 += mptr[k+4] * v[k+4];
s1 += mptr[k+5] * v[k+5];
s2 += mptr[k+6] * v[k+6];
s3 += mptr[k+7] * v[k+7];
}
switch( i - k ) {
case 7: s0 += mptr[k+6] * v[k+6];
case 6: s1 += mptr[k+5] * v[k+5];
case 5: s2 += mptr[k+4] * v[k+4];
case 4: s3 += mptr[k+3] * v[k+3];
case 3: s0 += mptr[k+2] * v[k+2];
case 2: s1 += mptr[k+1] * v[k+1];
case 1: s2 += mptr[k+0] * v[k+0];
}
sum = s3;
sum += s2;
sum += s1;
sum += s0;
mptr[i] = ( mptr[i] - sum ) * d;
mptr += nc;
}
}
return true;
#endif
}
/*
============
idSIMD_SSE::BlendJoints
============
*/
#define REFINE_BLENDJOINTS_RECIPROCAL
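// The quaternion blend below is a standard shortest-arc slerp:
//   cosom  = dot( q0, q1 )  ( q1 negated via the sign bit if cosom < 0 )
//   omega  = atan2( sqrt( 1 - cosom * cosom ), cosom )
//   result = ( q0 * sin( ( 1 - lerp ) * omega ) + q1 * sin( lerp * omega ) ) / sin( omega )
// The SIMD path processes four joints at a time using polynomial approximations for
// atan and sin; the scalar tail loop at the end of this function shows the same math
// with idMath calls.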
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
int i;
if ( lerp <= 0.0f ) {
return;
} else if ( lerp >= 1.0f ) {
for ( i = 0; i < numJoints; i++ ) {
int j = index[i];
joints[j] = blendJoints[j];
}
return;
}
for ( i = 0; i <= numJoints - 4; i += 4 ) {
ALIGN16( float jointVert0[4] );
ALIGN16( float jointVert1[4] );
ALIGN16( float jointVert2[4] );
ALIGN16( float blendVert0[4] );
ALIGN16( float blendVert1[4] );
ALIGN16( float blendVert2[4] );
ALIGN16( float jointQuat0[4] );
ALIGN16( float jointQuat1[4] );
ALIGN16( float jointQuat2[4] );
ALIGN16( float jointQuat3[4] );
ALIGN16( float blendQuat0[4] );
ALIGN16( float blendQuat1[4] );
ALIGN16( float blendQuat2[4] );
ALIGN16( float blendQuat3[4] );
for ( int j = 0; j < 4; j++ ) {
int n = index[i+j];
jointVert0[j] = joints[n].t[0];
jointVert1[j] = joints[n].t[1];
jointVert2[j] = joints[n].t[2];
blendVert0[j] = blendJoints[n].t[0];
blendVert1[j] = blendJoints[n].t[1];
blendVert2[j] = blendJoints[n].t[2];
jointQuat0[j] = joints[n].q[0];
jointQuat1[j] = joints[n].q[1];
jointQuat2[j] = joints[n].q[2];
jointQuat3[j] = joints[n].q[3];
blendQuat0[j] = blendJoints[n].q[0];
blendQuat1[j] = blendJoints[n].q[1];
blendQuat2[j] = blendJoints[n].q[2];
blendQuat3[j] = blendJoints[n].q[3];
}
#if 1
__asm {
// lerp translation
movss xmm7, lerp
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, blendVert0
subps xmm0, jointVert0
mulps xmm0, xmm7
addps xmm0, jointVert0
movaps jointVert0, xmm0
movaps xmm1, blendVert1
subps xmm1, jointVert1
mulps xmm1, xmm7
addps xmm1, jointVert1
movaps jointVert1, xmm1
movaps xmm2, blendVert2
subps xmm2, jointVert2
mulps xmm2, xmm7
addps xmm2, jointVert2
movaps jointVert2, xmm2
// lerp quaternions
movaps xmm0, jointQuat0
mulps xmm0, blendQuat0
movaps xmm1, jointQuat1
mulps xmm1, blendQuat1
addps xmm0, xmm1
movaps xmm2, jointQuat2
mulps xmm2, blendQuat2
addps xmm0, xmm2
movaps xmm3, jointQuat3
mulps xmm3, blendQuat3
addps xmm0, xmm3 // xmm0 = cosom
movaps xmm1, xmm0
movaps xmm2, xmm0
andps xmm1, SIMD_SP_signBitMask // xmm1 = signBit
xorps xmm0, xmm1
mulps xmm2, xmm2
xorps xmm4, xmm4
movaps xmm3, SIMD_SP_one
subps xmm3, xmm2 // xmm3 = scale0
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
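// when REFINE_BLENDJOINTS_RECIPROCAL is defined, the rsqrtps estimate is refined
// with one Newton-Raphson step: x1 = x0 * ( 3 - scale0 * x0 * x0 ) * 0.5
// ( assuming SIMD_SP_rsqrt_c0 and SIMD_SP_rsqrt_c1 hold 3.0f and -0.5f )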
#ifdef REFINE_BLENDJOINTS_RECIPROCAL
movaps xmm2, xmm3
rsqrtps xmm4, xmm2
mulps xmm2, xmm4
mulps xmm2, xmm4
subps xmm2, SIMD_SP_rsqrt_c0
mulps xmm4, SIMD_SP_rsqrt_c1
mulps xmm2, xmm4
#else
rsqrtps xmm2, xmm3 // xmm2 = sinom
#endif
mulps xmm3, xmm2 // xmm3 = sqrt( scale0 )
// omega0 = atan2( xmm3, xmm0 )
movaps xmm4, xmm0
minps xmm0, xmm3
maxps xmm3, xmm4
cmpeqps xmm4, xmm0
#ifdef REFINE_BLENDJOINTS_RECIPROCAL
rcpps xmm5, xmm3
mulps xmm3, xmm5
mulps xmm3, xmm5
addps xmm5, xmm5
subps xmm5, xmm3 // xmm5 = 1 / y or 1 / x
mulps xmm0, xmm5 // xmm0 = x / y or y / x
#else
rcpps xmm3, xmm3 // xmm3 = 1 / y or 1 / x
mulps xmm0, xmm3 // xmm0 = x / y or y / x
#endif
movaps xmm3, xmm4
andps xmm3, SIMD_SP_signBitMask
xorps xmm0, xmm3 // xmm0 = -x / y or y / x
andps xmm4, SIMD_SP_halfPI // xmm4 = HALF_PI or 0.0f
movaps xmm3, xmm0
mulps xmm3, xmm3 // xmm3 = s
movaps xmm5, SIMD_SP_atan_c0
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c1
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c2
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c3
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c4
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c5
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c6
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c7
mulps xmm5, xmm3
addps xmm5, SIMD_SP_one
mulps xmm5, xmm0
addps xmm5, xmm4 // xmm5 = omega0
movaps xmm6, xmm7 // xmm6 = lerp
mulps xmm6, xmm5 // xmm6 = omega1
subps xmm5, xmm6 // xmm5 = omega0
// scale0 = sin( xmm5 ) * xmm2
// scale1 = sin( xmm6 ) * xmm2
movaps xmm3, xmm5
movaps xmm7, xmm6
mulps xmm3, xmm3
mulps xmm7, xmm7
movaps xmm4, SIMD_SP_sin_c0
movaps xmm0, SIMD_SP_sin_c0
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c1
addps xmm0, SIMD_SP_sin_c1
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c2
addps xmm0, SIMD_SP_sin_c2
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c3
addps xmm0, SIMD_SP_sin_c3
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c4
addps xmm0, SIMD_SP_sin_c4
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_one
addps xmm0, SIMD_SP_one
mulps xmm5, xmm4
mulps xmm6, xmm0
mulps xmm5, xmm2 // xmm5 = scale0
mulps xmm6, xmm2 // xmm6 = scale1
xorps xmm6, xmm1
movaps xmm0, jointQuat0
mulps xmm0, xmm5
movaps xmm1, blendQuat0
mulps xmm1, xmm6
addps xmm0, xmm1
movaps jointQuat0, xmm0
movaps xmm1, jointQuat1
mulps xmm1, xmm5
movaps xmm2, blendQuat1
mulps xmm2, xmm6
addps xmm1, xmm2
movaps jointQuat1, xmm1
movaps xmm2, jointQuat2
mulps xmm2, xmm5
movaps xmm3, blendQuat2
mulps xmm3, xmm6
addps xmm2, xmm3
movaps jointQuat2, xmm2
movaps xmm3, jointQuat3
mulps xmm3, xmm5
movaps xmm4, blendQuat3
mulps xmm4, xmm6
addps xmm3, xmm4
movaps jointQuat3, xmm3
}
#else
jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );
jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );
jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );
ALIGN16( float cosom[4] );
ALIGN16( float sinom[4] );
ALIGN16( float omega0[4] );
ALIGN16( float omega1[4] );
ALIGN16( float scale0[4] );
ALIGN16( float scale1[4] );
ALIGN16( unsigned long signBit[4] );
cosom[0] = jointQuat0[0] * blendQuat0[0];
cosom[1] = jointQuat0[1] * blendQuat0[1];
cosom[2] = jointQuat0[2] * blendQuat0[2];
cosom[3] = jointQuat0[3] * blendQuat0[3];
cosom[0] += jointQuat1[0] * blendQuat1[0];
cosom[1] += jointQuat1[1] * blendQuat1[1];
cosom[2] += jointQuat1[2] * blendQuat1[2];
cosom[3] += jointQuat1[3] * blendQuat1[3];
cosom[0] += jointQuat2[0] * blendQuat2[0];
cosom[1] += jointQuat2[1] * blendQuat2[1];
cosom[2] += jointQuat2[2] * blendQuat2[2];
cosom[3] += jointQuat2[3] * blendQuat2[3];
cosom[0] += jointQuat3[0] * blendQuat3[0];
cosom[1] += jointQuat3[1] * blendQuat3[1];
cosom[2] += jointQuat3[2] * blendQuat3[2];
cosom[3] += jointQuat3[3] * blendQuat3[3];
signBit[0] = (*(unsigned long *)&cosom[0]) & ( 1 << 31 );
signBit[1] = (*(unsigned long *)&cosom[1]) & ( 1 << 31 );
signBit[2] = (*(unsigned long *)&cosom[2]) & ( 1 << 31 );
signBit[3] = (*(unsigned long *)&cosom[3]) & ( 1 << 31 );
(*(unsigned long *)&cosom[0]) ^= signBit[0];
(*(unsigned long *)&cosom[1]) ^= signBit[1];
(*(unsigned long *)&cosom[2]) ^= signBit[2];
(*(unsigned long *)&cosom[3]) ^= signBit[3];
scale0[0] = 1.0f - cosom[0] * cosom[0];
scale0[1] = 1.0f - cosom[1] * cosom[1];
scale0[2] = 1.0f - cosom[2] * cosom[2];
scale0[3] = 1.0f - cosom[3] * cosom[3];
scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];
sinom[0] = idMath::RSqrt( scale0[0] );
sinom[1] = idMath::RSqrt( scale0[1] );
sinom[2] = idMath::RSqrt( scale0[2] );
sinom[3] = idMath::RSqrt( scale0[3] );
scale0[0] *= sinom[0];
scale0[1] *= sinom[1];
scale0[2] *= sinom[2];
scale0[3] *= sinom[3];
omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );
omega1[0] = lerp * omega0[0];
omega1[1] = lerp * omega0[1];
omega1[2] = lerp * omega0[2];
omega1[3] = lerp * omega0[3];
omega0[0] -= omega1[0];
omega0[1] -= omega1[1];
omega0[2] -= omega1[2];
omega0[3] -= omega1[3];
scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];
scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];
(*(unsigned long *)&scale1[0]) ^= signBit[0];
(*(unsigned long *)&scale1[1]) ^= signBit[1];
(*(unsigned long *)&scale1[2]) ^= signBit[2];
(*(unsigned long *)&scale1[3]) ^= signBit[3];
jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];
jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];
jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];
jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];
#endif
for ( int j = 0; j < 4; j++ ) {
int n = index[i+j];
joints[n].t[0] = jointVert0[j];
joints[n].t[1] = jointVert1[j];
joints[n].t[2] = jointVert2[j];
joints[n].q[0] = jointQuat0[j];
joints[n].q[1] = jointQuat1[j];
joints[n].q[2] = jointQuat2[j];
joints[n].q[3] = jointQuat3[j];
}
}
for ( ; i < numJoints; i++ ) {
int n = index[i];
idVec3 &jointVert = joints[n].t;
const idVec3 &blendVert = blendJoints[n].t;
jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
idQuat &jointQuat = joints[n].q;
const idQuat &blendQuat = blendJoints[n].q;
float cosom;
float sinom;
float omega;
float scale0;
float scale1;
unsigned long signBit;
cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );
(*(unsigned long *)&cosom) ^= signBit;
scale0 = 1.0f - cosom * cosom;
scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
sinom = idMath::InvSqrt( scale0 );
omega = idMath::ATan16( scale0 * sinom, cosom );
scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
scale1 = idMath::Sin16( lerp * omega ) * sinom;
(*(unsigned long *)&scale1) ^= signBit;
jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
}
}
/*
============
idSIMD_SSE::ConvertJointQuatsToJointMats
============
*/
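// A unit quaternion q = ( x, y, z, w ) maps to the rotation matrix written out by the
// loop below ( the standard identity, shown here for reference ):
//
//   | 1-2(yy+zz)   2(xy+wz)    2(xz-wy)   |
//   | 2(xy-wz)     1-2(xx+zz)  2(yz+wx)   |
//   | 2(xz+wy)     2(yz-wx)    1-2(xx+yy) |
//
// with the translation t stored in the fourth column.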
void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
for ( int i = 0; i < numJoints; i++ ) {
const float *q = jointQuats[i].q.ToFloatPtr();
float *m = jointMats[i].ToFloatPtr();
m[0*4+3] = q[4];
m[1*4+3] = q[5];
m[2*4+3] = q[6];
float x2 = q[0] + q[0];
float y2 = q[1] + q[1];
float z2 = q[2] + q[2];
{
float xx = q[0] * x2;
float yy = q[1] * y2;
float zz = q[2] * z2;
m[0*4+0] = 1.0f - yy - zz;
m[1*4+1] = 1.0f - xx - zz;
m[2*4+2] = 1.0f - xx - yy;
}
{
float yz = q[1] * z2;
float wx = q[3] * x2;
m[2*4+1] = yz - wx;
m[1*4+2] = yz + wx;
}
{
float xy = q[0] * y2;
float wz = q[3] * z2;
m[1*4+0] = xy - wz;
m[0*4+1] = xy + wz;
}
{
float xz = q[0] * z2;
float wy = q[3] * y2;
m[0*4+2] = xz - wy;
m[2*4+0] = xz + wy;
}
}
}
/*
============
idSIMD_SSE::ConvertJointMatsToJointQuats
============
*/
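// The conversion below uses the standard trace-based method: pick the largest of the
// four quaternion components ( from the signs of the diagonal sums ) to avoid
// catastrophic cancellation, take its square root, and derive the other three
// components from off-diagonal sums and differences. The scalar reference
// implementations in the #elif branches below spell out the four cases.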
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
#if 1
ALIGN16( byte shuffle[16] );
__asm {
mov eax, numJoints
mov esi, jointMats
mov edi, jointQuats
and eax, ~3
jz done4
imul eax, JOINTMAT_SIZE
add esi, eax
neg eax
loopMat4:
movss xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
movss xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
movss xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
movss xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
movss xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]
movss xmm5, xmm0
movss xmm6, xmm1
movss xmm7, xmm2
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
movss xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
movss xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]
movss xmm5, xmm0
movss xmm6, xmm1
movss xmm7, xmm2
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
movss xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
movss xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
movss xmm5, xmm0
movss xmm6, xmm1
movss xmm7, xmm2
// -------------------
movaps xmm0, xmm5
addps xmm0, xmm6
addps xmm0, xmm7
cmpnltps xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f
movaps xmm1, xmm5
movaps xmm2, xmm5
cmpnltps xmm1, xmm6
cmpnltps xmm2, xmm7
andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]
movaps xmm4, xmm6
cmpnltps xmm4, xmm7 // xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]
movaps xmm1, xmm0
andnps xmm1, xmm2
orps xmm2, xmm0
movaps xmm3, xmm2
andnps xmm2, xmm4
orps xmm3, xmm2
xorps xmm3, SIMD_SP_not
andps xmm0, SIMD_DW_mat2quatShuffle0
movaps xmm4, xmm1
andps xmm4, SIMD_DW_mat2quatShuffle1
orps xmm0, xmm4
movaps xmm4, xmm2
andps xmm4, SIMD_DW_mat2quatShuffle2
orps xmm0, xmm4
movaps xmm4, xmm3
andps xmm4, SIMD_DW_mat2quatShuffle3
orps xmm4, xmm0
movaps shuffle, xmm4
movaps xmm0, xmm2
orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0
orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2
orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1
andps xmm0, SIMD_SP_signBitMask
andps xmm1, SIMD_SP_signBitMask
andps xmm2, SIMD_SP_signBitMask
xorps xmm5, xmm0
xorps xmm6, xmm1
xorps xmm7, xmm2
addps xmm5, xmm6
addps xmm7, SIMD_SP_one
addps xmm5, xmm7 // xmm5 = t
movaps xmm7, xmm5 // xmm7 = t
rsqrtps xmm6, xmm5
mulps xmm5, xmm6
mulps xmm5, xmm6
subps xmm5, SIMD_SP_rsqrt_c0
mulps xmm6, SIMD_SP_mat2quat_rsqrt_c1
mulps xmm6, xmm5 // xmm6 = s
mulps xmm7, xmm6 // xmm7 = s * t
xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s
// -------------------
add edi, 4*JOINTQUAT_SIZE
movzx ecx, byte ptr shuffle[0*4+0] // ecx = k0
movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[0*4+1] // edx = k1
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[0*4+2] // ecx = k2
movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[0*4+3] // edx = k3
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
mov [edi-4*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
mov [edi-4*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
mov [edi-4*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movzx ecx, byte ptr shuffle[1*4+0] // ecx = k0
movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[1*4+1] // edx = k1
movss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[1*4+2] // ecx = k2
movss xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[1*4+3] // edx = k3
movss xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
mov [edi-3*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
mov [edi-3*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
mov [edi-3*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movzx ecx, byte ptr shuffle[2*4+0] // ecx = k0
movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[2*4+1] // edx = k1
movss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[2*4+2] // ecx = k2
movss xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[2*4+3] // edx = k3
movss xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
mov [edi-2*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
mov [edi-2*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
mov [edi-2*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movzx ecx, byte ptr shuffle[3*4+0] // ecx = k0
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[3*4+1] // edx = k1
movss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[3*4+2] // ecx = k2
movss xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[3*4+3] // edx = k3
movss xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
add eax, 4*JOINTMAT_SIZE
jl loopMat4
done4:
mov eax, numJoints
and eax, 3
jz done1
imul eax, JOINTMAT_SIZE
add esi, eax
neg eax
loopMat1:
movss xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
movss xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
movss xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
// -------------------
movaps xmm0, xmm5
addss xmm0, xmm6
addss xmm0, xmm7
cmpnltss xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f
movaps xmm1, xmm5
movaps xmm2, xmm5
cmpnltss xmm1, xmm6
cmpnltss xmm2, xmm7
andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]
movaps xmm4, xmm6
cmpnltss xmm4, xmm7 // xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]
movaps xmm1, xmm0
andnps xmm1, xmm2
orps xmm2, xmm0
movaps xmm3, xmm2
andnps xmm2, xmm4
orps xmm3, xmm2
xorps xmm3, SIMD_SP_not
andps xmm0, SIMD_DW_mat2quatShuffle0
movaps xmm4, xmm1
andps xmm4, SIMD_DW_mat2quatShuffle1
orps xmm0, xmm4
movaps xmm4, xmm2
andps xmm4, SIMD_DW_mat2quatShuffle2
orps xmm0, xmm4
movaps xmm4, xmm3
andps xmm4, SIMD_DW_mat2quatShuffle3
orps xmm4, xmm0
movss shuffle, xmm4
movaps xmm0, xmm2
orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0
orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2
orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1
andps xmm0, SIMD_SP_signBitMask
andps xmm1, SIMD_SP_signBitMask
andps xmm2, SIMD_SP_signBitMask
xorps xmm5, xmm0
xorps xmm6, xmm1
xorps xmm7, xmm2
addss xmm5, xmm6
addss xmm7, SIMD_SP_one
addss xmm5, xmm7 // xmm5 = t
movss xmm7, xmm5 // xmm7 = t
rsqrtss xmm6, xmm5
mulss xmm5, xmm6
mulss xmm5, xmm6
subss xmm5, SIMD_SP_rsqrt_c0
mulss xmm6, SIMD_SP_mat2quat_rsqrt_c1
mulss xmm6, xmm5 // xmm6 = s
mulss xmm7, xmm6 // xmm7 = s * t
xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s
// -------------------
movzx ecx, byte ptr shuffle[0] // ecx = k0
add edi, JOINTQUAT_SIZE
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[1] // edx = k1
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[2] // ecx = k2
movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[3] // edx = k3
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
add eax, JOINTMAT_SIZE
jl loopMat1
done1:
}
#elif 0
for ( int i = 0; i < numJoints; i++ ) {
float s0, s1, s2;
int k0, k1, k2, k3;
float *q = jointQuats[i].q.ToFloatPtr();
const float *m = jointMats[i].ToFloatPtr();
if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
k0 = 3;
k1 = 2;
k2 = 1;
k3 = 0;
s0 = 1.0f;
s1 = 1.0f;
s2 = 1.0f;
} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
k0 = 0;
k1 = 1;
k2 = 2;
k3 = 3;
s0 = 1.0f;
s1 = -1.0f;
s2 = -1.0f;
} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
k0 = 1;
k1 = 0;
k2 = 3;
k3 = 2;
s0 = -1.0f;
s1 = 1.0f;
s2 = -1.0f;
} else {
k0 = 2;
k1 = 3;
k2 = 0;
k3 = 1;
s0 = -1.0f;
s1 = -1.0f;
s2 = 1.0f;
}
float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[k0] = s * t;
q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
q[4] = m[0 * 4 + 3];
q[5] = m[1 * 4 + 3];
q[6] = m[2 * 4 + 3];
}
#elif 1
for ( int i = 0; i < numJoints; i++ ) {
float *q = jointQuats[i].q.ToFloatPtr();
const float *m = jointMats[i].ToFloatPtr();
if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[3] = s * t;
q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[0] = s * t;
q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[1] = s * t;
q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
} else {
float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[2] = s * t;
q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
}
q[4] = m[0 * 4 + 3];
q[5] = m[1 * 4 + 3];
q[6] = m[2 * 4 + 3];
}
#endif
}
/*
============
idSIMD_SSE::TransformJoints
============
*/
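// idJointMat is laid out as three rows of four floats ( 48 bytes, see the
// JOINTMAT_SIZE assert ): [ m0 m1 m2 t0 | m3 m4 m5 t1 | m6 m7 m8 t2 ], i.e. a 3x4
// row-major rotation with the translation in the last column. The loop below
// concatenates each joint with its parent: jointMats[i] *= jointMats[parents[i]].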
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
__asm {
mov ecx, firstJoint
mov eax, lastJoint
sub eax, ecx
jl done
imul ecx, 4
mov edi, parents
add edi, ecx
imul ecx, 12
mov esi, jointMats
imul eax, 4
add edi, eax
neg eax
loopJoint:
movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
mov edx, [edi+eax]
movaps xmm1, [esi+ecx+16] // xmm1 = m3, m4, m5, t1
imul edx, JOINTMAT_SIZE
movaps xmm2, [esi+ecx+32] // xmm2 = m6, m7, m8, t2
movss xmm4, [esi+edx+ 0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm0
movss xmm5, [esi+edx+ 4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm1
addps xmm4, xmm5
movss xmm6, [esi+edx+ 8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm2
addps xmm4, xmm6
movss xmm5, [esi+edx+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm0
movss xmm7, [esi+edx+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
addps xmm4, xmm7
movaps [esi+ecx+ 0], xmm4
movss xmm6, [esi+edx+20]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm1
addps xmm5, xmm6
movss xmm7, [esi+edx+24]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm5, xmm7
movss xmm6, [esi+edx+32]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm3, [esi+edx+28]
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
addps xmm5, xmm3
movaps [esi+ecx+16], xmm5
movss xmm7, [esi+edx+36]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm3, [esi+edx+40]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm2
addps xmm6, xmm3
movss xmm7, [esi+edx+44]
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
addps xmm6, xmm7
movaps [esi+ecx+32], xmm6
add ecx, JOINTMAT_SIZE
add eax, 4
jle loopJoint
done:
}
#else
int i;
for( i = firstJoint; i <= lastJoint; i++ ) {
assert( parents[i] < i );
jointMats[i] *= jointMats[parents[i]];
}
#endif
}
/*
============
idSIMD_SSE::UntransformJoints
============
*/
void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
__asm {
mov edx, firstJoint
mov eax, lastJoint
mov ecx, eax
sub eax, edx
jl done
mov esi, jointMats
imul ecx, JOINTMAT_SIZE
imul edx, 4
mov edi, parents
add edi, edx
imul eax, 4
loopJoint:
movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
mov edx, [edi+eax]
movaps xmm1, [esi+ecx+16] // xmm1 = m3, m4, m5, t1
imul edx, JOINTMAT_SIZE
movaps xmm2, [esi+ecx+32] // xmm2 = m6, m7, m8, t2
movss xmm6, [esi+edx+12]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
subps xmm0, xmm6
movss xmm7, [esi+edx+28]
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
subps xmm1, xmm7
movss xmm3, [esi+edx+44]
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
subps xmm2, xmm3
movss xmm4, [esi+edx+ 0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm0
movss xmm5, [esi+edx+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm1
addps xmm4, xmm5
movss xmm6, [esi+edx+32]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm2
addps xmm4, xmm6
movaps [esi+ecx+ 0], xmm4
movss xmm5, [esi+edx+ 4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm0
movss xmm6, [esi+edx+20]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm1
addps xmm5, xmm6
movss xmm7, [esi+edx+36]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm5, xmm7
movaps [esi+ecx+16], xmm5
movss xmm6, [esi+edx+ 8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm7, [esi+edx+24]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm3, [esi+edx+40]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm2
addps xmm6, xmm3
movaps [esi+ecx+32], xmm6
sub ecx, JOINTMAT_SIZE
sub eax, 4
jge loopJoint
done:
}
#else
int i;
for( i = lastJoint; i >= firstJoint; i-- ) {
assert( parents[i] < i );
jointMats[i] /= jointMats[parents[i]];
}
#endif
}
/*
============
idSIMD_SSE::TransformVerts
============
*/
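// The index array stores two ints per weight: index[j*2+0] is a byte offset into the
// joints array ( premultiplied by JOINTMAT_SIZE ) and index[j*2+1] is non-zero for
// the last weight of a vertex; the scalar reference in the #else branch below shows
// the same traversal.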
void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
__asm
{
mov eax, numVerts
test eax, eax
jz done
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, index
mov esi, weights
mov edi, joints
add ecx, eax
neg eax
loopVert:
mov ebx, [edx]
movaps xmm2, [esi]
add edx, 8
movaps xmm0, xmm2
add esi, JOINTWEIGHT_SIZE
movaps xmm1, xmm2
mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
cmp dword ptr [edx-4], 0
jne doneWeight
loopWeight:
mov ebx, [edx]
movaps xmm5, [esi]
add edx, 8
movaps xmm3, xmm5
add esi, JOINTWEIGHT_SIZE
movaps xmm4, xmm5
mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
cmp dword ptr [edx-4], 0
addps xmm0, xmm3
addps xmm1, xmm4
addps xmm2, xmm5
je loopWeight
doneWeight:
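// horizontally add the three row dot products into the x, y, z result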
add eax, DRAWVERT_SIZE
movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
unpckhps xmm0, xmm1 // xmm0 = m2, m5, t0, t1
addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+0], xmm6
movaps xmm5, xmm6 // xmm5 = m6+m8, m7+t2
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8
addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+8], xmm5
jl loopVert
done:
}
#else
int i, j;
const byte *jointsPtr = (byte *)joints;
for( j = i = 0; i < numVerts; i++ ) {
idVec3 v;
v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
while( index[j*2+1] == 0 ) {
j++;
v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
}
j++;
verts[i].xyz = v;
}
#endif
}
/*
============
idSIMD_SSE::TracePointCull
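
computes the distance of each vertex to the four planes; bit j of
cullBits[i] is set if the distance to planes[j] is greater than -radius,
bit j+4 if it is less than radius; totalOr receives the OR of all bytes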
============
*/
void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
push ebx
mov eax, numVerts
test eax, eax
jz done
mov edi, planes
movlps xmm1, [edi] // xmm1 = 0, 1, X, X
movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5
movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X
movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7
movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X
movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13
movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X
movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15
movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
movss xmm7, radius
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
xor edx, edx
mov esi, verts
mov edi, cullBits
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert:
movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
mulps xmm4, xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulps xmm5, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
addps xmm4, xmm5
mulps xmm6, xmm2
addps xmm4, xmm3
addps xmm4, xmm6
movaps xmm5, xmm4
xorps xmm5, SIMD_SP_signBitMask
cmpltps xmm4, xmm7
movmskps ecx, xmm4
cmpltps xmm5, xmm7
movmskps ebx, xmm5
shl cx, 4
or cl, bl
inc edi
or dl, cl
add eax, DRAWVERT_SIZE
mov byte ptr [edi-1], cl
jl loopVert
done:
mov esi, totalOr
mov byte ptr [esi], dl
pop ebx
}
#else
int i;
byte tOr;
tOr = 0;
for ( i = 0; i < numVerts; i++ ) {
byte bits;
float d0, d1, d2, d3, t;
const idVec3 &v = verts[i].xyz;
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
t = d0 + radius;
bits = FLOATSIGNBITSET( t ) << 0;
t = d1 + radius;
bits |= FLOATSIGNBITSET( t ) << 1;
t = d2 + radius;
bits |= FLOATSIGNBITSET( t ) << 2;
t = d3 + radius;
bits |= FLOATSIGNBITSET( t ) << 3;
t = d0 - radius;
bits |= FLOATSIGNBITSET( t ) << 4;
t = d1 - radius;
bits |= FLOATSIGNBITSET( t ) << 5;
t = d2 - radius;
bits |= FLOATSIGNBITSET( t ) << 6;
t = d3 - radius;
bits |= FLOATSIGNBITSET( t ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
totalOr = tOr;
#endif
}
/*
============
idSIMD_SSE::DecalPointCull
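
bit j of cullBits[i] is set if verts[i].xyz is at the front side of
planes[j] (distance >= 0), for the six decal boundary planes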
============
*/
void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
ALIGN16( float p0[4] );
ALIGN16( float p1[4] );
ALIGN16( float p2[4] );
ALIGN16( float p3[4] );
ALIGN16( float p4[4] );
ALIGN16( float p5[4] );
ALIGN16( float p6[4] );
ALIGN16( float p7[4] );
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
mov ecx, planes
movlps xmm1, [ecx] // xmm1 = 0, 1, X, X
movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5
movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X
movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7
movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X
movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13
movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X
movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15
movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
movaps p0, xmm0
movaps p1, xmm1
movaps p2, xmm2
movaps p3, xmm3
movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X
movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51
movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51
movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X
movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53
movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53
movaps p4, xmm4
movaps p5, xmm5
movaps p6, xmm6
movaps p7, xmm7
mov esi, verts
mov edi, cullBits
mov eax, numVerts
and eax, ~1
jz done2
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert2:
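// two vertices per iteration: the distances to planes 0-3 give the low
// four bits of each cull byte, the interleaved planes 4-5 (p4-p7) bits 4-5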
movaps xmm6, p0
movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movaps xmm7, p1
movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movaps xmm7, p2
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
addps xmm6, p3
cmpnltps xmm6, SIMD_SP_zero
movmskps ecx, xmm6
movaps xmm6, p0
movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm3
movaps xmm7, p1
movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movaps xmm7, p2
movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
addps xmm6, p3
cmpnltps xmm6, SIMD_SP_zero
movmskps edx, xmm6
mov ch, dl
shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, p4
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, p5
addps xmm0, xmm1
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm2, p6
addps xmm0, xmm2
addps xmm0, p7
cmpnltps xmm0, SIMD_SP_zero
movmskps edx, xmm0
add edi, 2
mov dh, dl
shl dl, 4
shl dh, 2
and edx, (3<<4)|(3<<12)
or ecx, edx
add eax, 2*DRAWVERT_SIZE
mov word ptr [edi-2], cx
jl loopVert2
done2:
mov eax, numVerts
and eax, 1
jz done
movaps xmm6, p0
movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movaps xmm7, p1
movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movaps xmm7, p2
movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
addps xmm6, p3
cmpnltps xmm6, SIMD_SP_zero
movmskps ecx, xmm6
mulps xmm0, p4
mulps xmm1, p5
addps xmm0, xmm1
mulps xmm2, p6
addps xmm0, xmm2
addps xmm0, p7
cmpnltps xmm0, SIMD_SP_zero
movmskps edx, xmm0
and edx, 3
shl edx, 4
or ecx, edx
mov byte ptr [edi], cl
done:
}
#else
int i;
for ( i = 0; i < numVerts; i += 2 ) {
unsigned short bits0, bits1;
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
const idVec3 &v0 = verts[i+0].xyz;
const idVec3 &v1 = verts[i+1].xyz;
d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
bits0 = FLOATSIGNBITSET( d0 ) << (0+0);
bits0 |= FLOATSIGNBITSET( d1 ) << (0+1);
bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
bits0 |= FLOATSIGNBITSET( d3 ) << (0+3);
bits0 |= FLOATSIGNBITSET( d4 ) << (0+4);
bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
bits1 = FLOATSIGNBITSET( d6 ) << (8+0);
bits1 |= FLOATSIGNBITSET( d7 ) << (8+1);
bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
bits1 |= FLOATSIGNBITSET( d9 ) << (8+3);
bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
*(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
}
if ( numVerts & 1 ) {
byte bits;
float d0, d1, d2, d3, d4, d5;
const idVec3 &v = verts[numVerts - 1].xyz;
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 2;
bits |= FLOATSIGNBITSET( d3 ) << 3;
bits |= FLOATSIGNBITSET( d4 ) << 4;
bits |= FLOATSIGNBITSET( d5 ) << 5;
cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits
}
#endif
}
/*
============
idSIMD_SSE::OverlayPointCull
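
texCoords[i] = distances of verts[i].xyz to planes[0] and planes[1];
bits 0-1 of cullBits[i] are the signs of those distances, bits 2-3 the
signs of one minus the distances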
============
*/
void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
mov eax, numVerts
mov edx, verts
mov esi, texCoords
mov edi, cullBits
mov ecx, planes
movss xmm4, [ecx+ 0]
movss xmm5, [ecx+16]
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
movss xmm5, [ecx+ 4]
movss xmm6, [ecx+20]
shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
movss xmm6, [ecx+ 8]
movss xmm7, [ecx+24]
shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
movss xmm7, [ecx+12]
movss xmm0, [ecx+28]
shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
and eax, ~1
jz done2
add edi, eax
neg eax
loopVert2:
movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, xmm4
movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm5
movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm7
movaps [esi], xmm0
movaps xmm1, xmm0
movaps xmm2, SIMD_SP_one
subps xmm2, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
add edx, 2*DRAWVERT_SIZE
movmskps ecx, xmm0
mov byte ptr [edi+eax+0], cl
add esi, 4*4
movmskps ecx, xmm1
mov byte ptr [edi+eax+1], cl
add eax, 2
jl loopVert2
done2:
mov eax, numVerts
and eax, 1
jz done
movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, xmm4
movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm5
movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm7
movlps [esi], xmm0
movaps xmm1, xmm0
movaps xmm2, SIMD_SP_one
subps xmm2, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
movmskps ecx, xmm0
mov byte ptr [edi], cl
done:
}
#else
const idPlane &p0 = planes[0];
const idPlane &p1 = planes[1];
for ( int i = 0; i < numVerts - 1; i += 2 ) {
unsigned short bits;
float d0, d1, d2, d3;
const idVec3 &v0 = verts[i+0].xyz;
const idVec3 &v1 = verts[i+1].xyz;
d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
texCoords[i+0][0] = d0;
texCoords[i+0][1] = d1;
texCoords[i+1][0] = d2;
texCoords[i+1][1] = d3;
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 8;
bits |= FLOATSIGNBITSET( d3 ) << 9;
d0 = 1.0f - d0;
d1 = 1.0f - d1;
d2 = 1.0f - d2;
d3 = 1.0f - d3;
bits |= FLOATSIGNBITSET( d0 ) << 2;
bits |= FLOATSIGNBITSET( d1 ) << 3;
bits |= FLOATSIGNBITSET( d2 ) << 10;
bits |= FLOATSIGNBITSET( d3 ) << 11;
*(unsigned short *)(cullBits + i) = bits;
}
if ( numVerts & 1 ) {
byte bits;
float d0, d1;
const idPlane &p0 = planes[0];
const idPlane &p1 = planes[1];
const idVec3 &v0 = verts[numVerts - 1].xyz;
d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
texCoords[numVerts - 1][0] = d0;
texCoords[numVerts - 1][1] = d1;
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
d0 = 1.0f - d0;
d1 = 1.0f - d1;
bits |= FLOATSIGNBITSET( d0 ) << 2;
bits |= FLOATSIGNBITSET( d1 ) << 3;
cullBits[numVerts - 1] = bits;
}
#endif
}
/*
============
idSIMD_SSE::DeriveTriPlanes
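
planes[i] = plane with unit-length normal through the three vertices of
the i-th triangle in the index list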
============
*/
void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
mov eax, numIndexes
shl eax, 2
mov esi, verts
mov edi, indexes
mov edx, planes
add edi, eax
neg eax
add eax, 4*12
jge done4
loopPlane4:
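// four triangles per iteration: gather the two edge vectors one lane at a
// time, take their cross product, normalize with rsqrtps and store the
// plane distance as -(normal . first vertex)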
mov ebx, [edi+eax-4*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-4*12+0]
imul ecx, DRAWVERT_SIZE
movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
mov ebx, [edi+eax-4*12+8]
imul ebx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
mov ebx, [edi+eax-3*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-3*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm0, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm1, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm2, xmm6
mov ebx, [edi+eax-3*12+8]
imul ebx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm3, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm4, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm5, xmm7
mov ebx, [edi+eax-2*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-2*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm0, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm1, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm2, xmm6
mov ebx, [edi+eax-2*12+8]
imul ebx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm3, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm4, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm5, xmm7
mov ebx, [edi+eax-1*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-1*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm0, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm1, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm2, xmm6
mov ebx, [edi+eax-1*12+8]
imul ebx, DRAWVERT_SIZE
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm3, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm4, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm5, xmm7
movaps xmm6, xmm4
mulps xmm6, xmm2
movaps xmm7, xmm5
mulps xmm7, xmm1
subps xmm6, xmm7
mulps xmm5, xmm0
mulps xmm2, xmm3
subps xmm5, xmm2
mulps xmm3, xmm1
mulps xmm4, xmm0
subps xmm3, xmm4
movaps xmm0, xmm6
mulps xmm6, xmm6
movaps xmm1, xmm5
mulps xmm5, xmm5
movaps xmm2, xmm3
mulps xmm3, xmm3
addps xmm3, xmm5
addps xmm3, xmm6
rsqrtps xmm3, xmm3
add edx, 4*16
mov ecx, [edi+eax-1*12+0]
imul ecx, DRAWVERT_SIZE
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movss [edx-1*16+0], xmm0
movss [edx-1*16+4], xmm1
movss [edx-1*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-1*16+12], xmm0
mov ecx, [edi+eax-2*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [edx-2*16+0], xmm0
movss [edx-2*16+4], xmm1
movss [edx-2*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-2*16+12], xmm0
mov ecx, [edi+eax-3*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [edx-3*16+0], xmm0
movss [edx-3*16+4], xmm1
movss [edx-3*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-3*16+12], xmm0
mov ecx, [edi+eax-4*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [edx-4*16+0], xmm0
movss [edx-4*16+4], xmm1
movss [edx-4*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-4*16+12], xmm0
add eax, 4*12
jle loopPlane4
done4:
sub eax, 4*12
jge done
loopPlane1:
mov ebx, [edi+eax+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax+0]
imul ecx, DRAWVERT_SIZE
movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
mov ebx, [edi+eax+8]
imul ebx, DRAWVERT_SIZE
movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm6, xmm4
mulss xmm6, xmm2
movss xmm7, xmm5
mulss xmm7, xmm1
subss xmm6, xmm7
mulss xmm5, xmm0
mulss xmm2, xmm3
subss xmm5, xmm2
mulss xmm3, xmm1
mulss xmm4, xmm0
subss xmm3, xmm4
movss xmm0, xmm6
mulss xmm6, xmm6
movss xmm1, xmm5
mulss xmm5, xmm5
movss xmm2, xmm3
mulss xmm3, xmm3
addss xmm3, xmm5
addss xmm3, xmm6
rsqrtss xmm3, xmm3
add edx, 1*16
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [edx-1*16+0], xmm0
movss [edx-1*16+4], xmm1
movss [edx-1*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-1*16+12], xmm0
add eax, 1*12
jl loopPlane1
done:
}
#else
int i, j;
for ( i = 0; i <= numIndexes - 12; i += 12 ) {
ALIGN16( float d0[4] );
ALIGN16( float d1[4] );
ALIGN16( float d2[4] );
ALIGN16( float d3[4] );
ALIGN16( float d4[4] );
ALIGN16( float d5[4] );
ALIGN16( float n0[4] );
ALIGN16( float n1[4] );
ALIGN16( float n2[4] );
for ( j = 0; j < 4; j++ ) {
const idDrawVert *a, *b, *c;
a = verts + indexes[i + j * 3 + 0];
b = verts + indexes[i + j * 3 + 1];
c = verts + indexes[i + j * 3 + 2];
d0[j] = b->xyz[0] - a->xyz[0];
d1[j] = b->xyz[1] - a->xyz[1];
d2[j] = b->xyz[2] - a->xyz[2];
d3[j] = c->xyz[0] - a->xyz[0];
d4[j] = c->xyz[1] - a->xyz[1];
d5[j] = c->xyz[2] - a->xyz[2];
}
ALIGN16( float tmp[4] );
n0[0] = d4[0] * d2[0];
n0[1] = d4[1] * d2[1];
n0[2] = d4[2] * d2[2];
n0[3] = d4[3] * d2[3];
n0[0] -= d5[0] * d1[0];
n0[1] -= d5[1] * d1[1];
n0[2] -= d5[2] * d1[2];
n0[3] -= d5[3] * d1[3];
n1[0] = d5[0] * d0[0];
n1[1] = d5[1] * d0[1];
n1[2] = d5[2] * d0[2];
n1[3] = d5[3] * d0[3];
n1[0] -= d3[0] * d2[0];
n1[1] -= d3[1] * d2[1];
n1[2] -= d3[2] * d2[2];
n1[3] -= d3[3] * d2[3];
n2[0] = d3[0] * d1[0];
n2[1] = d3[1] * d1[1];
n2[2] = d3[2] * d1[2];
n2[3] = d3[3] * d1[3];
n2[0] -= d4[0] * d0[0];
n2[1] -= d4[1] * d0[1];
n2[2] -= d4[2] * d0[2];
n2[3] -= d4[3] * d0[3];
tmp[0] = n0[0] * n0[0];
tmp[1] = n0[1] * n0[1];
tmp[2] = n0[2] * n0[2];
tmp[3] = n0[3] * n0[3];
tmp[0] += n1[0] * n1[0];
tmp[1] += n1[1] * n1[1];
tmp[2] += n1[2] * n1[2];
tmp[3] += n1[3] * n1[3];
tmp[0] += n2[0] * n2[0];
tmp[1] += n2[1] * n2[1];
tmp[2] += n2[2] * n2[2];
tmp[3] += n2[3] * n2[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
n0[0] *= tmp[0];
n0[1] *= tmp[1];
n0[2] *= tmp[2];
n0[3] *= tmp[3];
n1[0] *= tmp[0];
n1[1] *= tmp[1];
n1[2] *= tmp[2];
n1[3] *= tmp[3];
n2[0] *= tmp[0];
n2[1] *= tmp[1];
n2[2] *= tmp[2];
n2[3] *= tmp[3];
for ( j = 0; j < 4; j++ ) {
const idDrawVert *a;
a = verts + indexes[i + j * 3];
planes->Normal()[0] = n0[j];
planes->Normal()[1] = n1[j];
planes->Normal()[2] = n2[j];
planes->FitThroughPoint( a->xyz );
planes++;
}
}
for ( ; i < numIndexes; i += 3 ) {
const idDrawVert *a, *b, *c;
float d0, d1, d2, d3, d4, d5;
float n0, n1, n2;
a = verts + indexes[i + 0];
b = verts + indexes[i + 1];
c = verts + indexes[i + 2];
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
d3 = c->xyz[0] - a->xyz[0];
d4 = c->xyz[1] - a->xyz[1];
d5 = c->xyz[2] - a->xyz[2];
float tmp;
n0 = d4 * d2 - d5 * d1;
n1 = d5 * d0 - d3 * d2;
n2 = d3 * d1 - d4 * d0;
tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
n0 *= tmp;
n1 *= tmp;
n2 *= tmp;
planes->Normal()[0] = n0;
planes->Normal()[1] = n1;
planes->Normal()[2] = n2;
planes->FitThroughPoint( a->xyz );
planes++;
}
#endif
}
/*
============
idSIMD_SSE::DeriveTangents
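
computes one plane per triangle and accumulates the triangle normal and
the two texture-space tangents at every vertex the triangle references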
============
*/
//#define REFINE_TANGENT_SQUAREROOT
#define FIX_DEGENERATE_TANGENT
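// REFINE_TANGENT_SQUAREROOT refines each rsqrt estimate with one extra
// Newton-Raphson style step using the SIMD_SP_rsqrt_c0/c1 constants;
// FIX_DEGENERATE_TANGENT replaces a squared length of zero with a tiny
// positive value so the reciprocal square root stays finite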
void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
int i;
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
assert( planes != NULL );
assert( verts != NULL );
assert( numVerts >= 0 );
#ifdef REFINE_TANGENT_SQUAREROOT
__asm {
movaps xmm6, SIMD_SP_rsqrt_c0
movaps xmm7, SIMD_SP_rsqrt_c1
}
#endif
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
for ( i = 0; i <= numIndexes - 12; i += 12 ) {
idDrawVert *a, *b, *c;
ALIGN16( unsigned long signBit[4] );
ALIGN16( float d0[4] );
ALIGN16( float d1[4] );
ALIGN16( float d2[4] );
ALIGN16( float d3[4] );
ALIGN16( float d4[4] );
ALIGN16( float d5[4] );
ALIGN16( float d6[4] );
ALIGN16( float d7[4] );
ALIGN16( float d8[4] );
ALIGN16( float d9[4] );
ALIGN16( float n0[4] );
ALIGN16( float n1[4] );
ALIGN16( float n2[4] );
ALIGN16( float t0[4] );
ALIGN16( float t1[4] );
ALIGN16( float t2[4] );
ALIGN16( float t3[4] );
ALIGN16( float t4[4] );
ALIGN16( float t5[4] );
for ( int j = 0; j < 4; j++ ) {
a = verts + indexes[i + j * 3 + 0];
b = verts + indexes[i + j * 3 + 1];
c = verts + indexes[i + j * 3 + 2];
d0[j] = b->xyz[0] - a->xyz[0];
d1[j] = b->xyz[1] - a->xyz[1];
d2[j] = b->xyz[2] - a->xyz[2];
d3[j] = b->st[0] - a->st[0];
d4[j] = b->st[1] - a->st[1];
d5[j] = c->xyz[0] - a->xyz[0];
d6[j] = c->xyz[1] - a->xyz[1];
d7[j] = c->xyz[2] - a->xyz[2];
d8[j] = c->st[0] - a->st[0];
d9[j] = c->st[1] - a->st[1];
}
#if 1
__asm {
// normal
movaps xmm0, d6
mulps xmm0, d2
movaps xmm1, d7
mulps xmm1, d1
subps xmm0, xmm1
movaps xmm1, d7
mulps xmm1, d0
movaps xmm2, d5
mulps xmm2, d2
subps xmm1, xmm2
movaps xmm2, d5
mulps xmm2, d1
movaps xmm3, d6
mulps xmm3, d0
subps xmm2, xmm3
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
movaps n0, xmm0
mulps xmm1, xmm3
movaps n1, xmm1
mulps xmm2, xmm3
movaps n2, xmm2
// area sign bit
movaps xmm0, d3
mulps xmm0, d9
movaps xmm1, d4
mulps xmm1, d8
subps xmm0, xmm1
andps xmm0, SIMD_SP_signBitMask
movaps signBit, xmm0
// first tangent
movaps xmm0, d0
mulps xmm0, d9
movaps xmm1, d4
mulps xmm1, d5
subps xmm0, xmm1
movaps xmm1, d1
mulps xmm1, d9
movaps xmm2, d4
mulps xmm2, d6
subps xmm1, xmm2
movaps xmm2, d2
mulps xmm2, d9
movaps xmm3, d4
mulps xmm3, d7
subps xmm2, xmm3
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
xorps xmm3, signBit
mulps xmm0, xmm3
movaps t0, xmm0
mulps xmm1, xmm3
movaps t1, xmm1
mulps xmm2, xmm3
movaps t2, xmm2
// second tangent
movaps xmm0, d3
mulps xmm0, d5
movaps xmm1, d0
mulps xmm1, d8
subps xmm0, xmm1
movaps xmm1, d3
mulps xmm1, d6
movaps xmm2, d1
mulps xmm2, d8
subps xmm1, xmm2
movaps xmm2, d3
mulps xmm2, d7
movaps xmm3, d2
mulps xmm3, d8
subps xmm2, xmm3
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
xorps xmm3, signBit
mulps xmm0, xmm3
movaps t3, xmm0
mulps xmm1, xmm3
movaps t4, xmm1
mulps xmm2, xmm3
movaps t5, xmm2
}
#else
ALIGN16( float tmp[4] );
// normal
n0[0] = d6[0] * d2[0];
n0[1] = d6[1] * d2[1];
n0[2] = d6[2] * d2[2];
n0[3] = d6[3] * d2[3];
n0[0] -= d7[0] * d1[0];
n0[1] -= d7[1] * d1[1];
n0[2] -= d7[2] * d1[2];
n0[3] -= d7[3] * d1[3];
n1[0] = d7[0] * d0[0];
n1[1] = d7[1] * d0[1];
n1[2] = d7[2] * d0[2];
n1[3] = d7[3] * d0[3];
n1[0] -= d5[0] * d2[0];
n1[1] -= d5[1] * d2[1];
n1[2] -= d5[2] * d2[2];
n1[3] -= d5[3] * d2[3];
n2[0] = d5[0] * d1[0];
n2[1] = d5[1] * d1[1];
n2[2] = d5[2] * d1[2];
n2[3] = d5[3] * d1[3];
n2[0] -= d6[0] * d0[0];
n2[1] -= d6[1] * d0[1];
n2[2] -= d6[2] * d0[2];
n2[3] -= d6[3] * d0[3];
tmp[0] = n0[0] * n0[0];
tmp[1] = n0[1] * n0[1];
tmp[2] = n0[2] * n0[2];
tmp[3] = n0[3] * n0[3];
tmp[0] += n1[0] * n1[0];
tmp[1] += n1[1] * n1[1];
tmp[2] += n1[2] * n1[2];
tmp[3] += n1[3] * n1[3];
tmp[0] += n2[0] * n2[0];
tmp[1] += n2[1] * n2[1];
tmp[2] += n2[2] * n2[2];
tmp[3] += n2[3] * n2[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
n0[0] *= tmp[0];
n0[1] *= tmp[1];
n0[2] *= tmp[2];
n0[3] *= tmp[3];
n1[0] *= tmp[0];
n1[1] *= tmp[1];
n1[2] *= tmp[2];
n1[3] *= tmp[3];
n2[0] *= tmp[0];
n2[1] *= tmp[1];
n2[2] *= tmp[2];
n2[3] *= tmp[3];
// area sign bit
tmp[0] = d3[0] * d9[0];
tmp[1] = d3[1] * d9[1];
tmp[2] = d3[2] * d9[2];
tmp[3] = d3[3] * d9[3];
tmp[0] -= d4[0] * d8[0];
tmp[1] -= d4[1] * d8[1];
tmp[2] -= d4[2] * d8[2];
tmp[3] -= d4[3] * d8[3];
signBit[0] = ( *(unsigned long *)&tmp[0] ) & ( 1 << 31 );
signBit[1] = ( *(unsigned long *)&tmp[1] ) & ( 1 << 31 );
signBit[2] = ( *(unsigned long *)&tmp[2] ) & ( 1 << 31 );
signBit[3] = ( *(unsigned long *)&tmp[3] ) & ( 1 << 31 );
// first tangent
t0[0] = d0[0] * d9[0];
t0[1] = d0[1] * d9[1];
t0[2] = d0[2] * d9[2];
t0[3] = d0[3] * d9[3];
t0[0] -= d4[0] * d5[0];
t0[1] -= d4[1] * d5[1];
t0[2] -= d4[2] * d5[2];
t0[3] -= d4[3] * d5[3];
t1[0] = d1[0] * d9[0];
t1[1] = d1[1] * d9[1];
t1[2] = d1[2] * d9[2];
t1[3] = d1[3] * d9[3];
t1[0] -= d4[0] * d6[0];
t1[1] -= d4[1] * d6[1];
t1[2] -= d4[2] * d6[2];
t1[3] -= d4[3] * d6[3];
t2[0] = d2[0] * d9[0];
t2[1] = d2[1] * d9[1];
t2[2] = d2[2] * d9[2];
t2[3] = d2[3] * d9[3];
t2[0] -= d4[0] * d7[0];
t2[1] -= d4[1] * d7[1];
t2[2] -= d4[2] * d7[2];
t2[3] -= d4[3] * d7[3];
tmp[0] = t0[0] * t0[0];
tmp[1] = t0[1] * t0[1];
tmp[2] = t0[2] * t0[2];
tmp[3] = t0[3] * t0[3];
tmp[0] += t1[0] * t1[0];
tmp[1] += t1[1] * t1[1];
tmp[2] += t1[2] * t1[2];
tmp[3] += t1[3] * t1[3];
tmp[0] += t2[0] * t2[0];
tmp[1] += t2[1] * t2[1];
tmp[2] += t2[2] * t2[2];
tmp[3] += t2[3] * t2[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
*(unsigned long *)&tmp[0] ^= signBit[0];
*(unsigned long *)&tmp[1] ^= signBit[1];
*(unsigned long *)&tmp[2] ^= signBit[2];
*(unsigned long *)&tmp[3] ^= signBit[3];
t0[0] *= tmp[0];
t0[1] *= tmp[1];
t0[2] *= tmp[2];
t0[3] *= tmp[3];
t1[0] *= tmp[0];
t1[1] *= tmp[1];
t1[2] *= tmp[2];
t1[3] *= tmp[3];
t2[0] *= tmp[0];
t2[1] *= tmp[1];
t2[2] *= tmp[2];
t2[3] *= tmp[3];
// second tangent
t3[0] = d3[0] * d5[0];
t3[1] = d3[1] * d5[1];
t3[2] = d3[2] * d5[2];
t3[3] = d3[3] * d5[3];
t3[0] -= d0[0] * d8[0];
t3[1] -= d0[1] * d8[1];
t3[2] -= d0[2] * d8[2];
t3[3] -= d0[3] * d8[3];
t4[0] = d3[0] * d6[0];
t4[1] = d3[1] * d6[1];
t4[2] = d3[2] * d6[2];
t4[3] = d3[3] * d6[3];
t4[0] -= d1[0] * d8[0];
t4[1] -= d1[1] * d8[1];
t4[2] -= d1[2] * d8[2];
t4[3] -= d1[3] * d8[3];
t5[0] = d3[0] * d7[0];
t5[1] = d3[1] * d7[1];
t5[2] = d3[2] * d7[2];
t5[3] = d3[3] * d7[3];
t5[0] -= d2[0] * d8[0];
t5[1] -= d2[1] * d8[1];
t5[2] -= d2[2] * d8[2];
t5[3] -= d2[3] * d8[3];
tmp[0] = t3[0] * t3[0];
tmp[1] = t3[1] * t3[1];
tmp[2] = t3[2] * t3[2];
tmp[3] = t3[3] * t3[3];
tmp[0] += t4[0] * t4[0];
tmp[1] += t4[1] * t4[1];
tmp[2] += t4[2] * t4[2];
tmp[3] += t4[3] * t4[3];
tmp[0] += t5[0] * t5[0];
tmp[1] += t5[1] * t5[1];
tmp[2] += t5[2] * t5[2];
tmp[3] += t5[3] * t5[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
*(unsigned long *)&tmp[0] ^= signBit[0];
*(unsigned long *)&tmp[1] ^= signBit[1];
*(unsigned long *)&tmp[2] ^= signBit[2];
*(unsigned long *)&tmp[3] ^= signBit[3];
t3[0] *= tmp[0];
t3[1] *= tmp[1];
t3[2] *= tmp[2];
t3[3] *= tmp[3];
t4[0] *= tmp[0];
t4[1] *= tmp[1];
t4[2] *= tmp[2];
t4[3] *= tmp[3];
t5[0] *= tmp[0];
t5[1] *= tmp[1];
t5[2] *= tmp[2];
t5[3] *= tmp[3];
#endif
for ( int j = 0; j < 4; j++ ) {
const int v0 = indexes[i + j * 3 + 0];
const int v1 = indexes[i + j * 3 + 1];
const int v2 = indexes[i + j * 3 + 2];
a = verts + v0;
b = verts + v1;
c = verts + v2;
planes->Normal()[0] = n0[j];
planes->Normal()[1] = n1[j];
planes->Normal()[2] = n2[j];
planes->FitThroughPoint( a->xyz );
planes++;
if ( used[v0] ) {
a->normal[0] += n0[j];
a->normal[1] += n1[j];
a->normal[2] += n2[j];
a->tangents[0][0] += t0[j];
a->tangents[0][1] += t1[j];
a->tangents[0][2] += t2[j];
a->tangents[1][0] += t3[j];
a->tangents[1][1] += t4[j];
a->tangents[1][2] += t5[j];
} else {
a->normal[0] = n0[j];
a->normal[1] = n1[j];
a->normal[2] = n2[j];
a->tangents[0][0] = t0[j];
a->tangents[0][1] = t1[j];
a->tangents[0][2] = t2[j];
a->tangents[1][0] = t3[j];
a->tangents[1][1] = t4[j];
a->tangents[1][2] = t5[j];
used[v0] = true;
}
if ( used[v1] ) {
b->normal[0] += n0[j];
b->normal[1] += n1[j];
b->normal[2] += n2[j];
b->tangents[0][0] += t0[j];
b->tangents[0][1] += t1[j];
b->tangents[0][2] += t2[j];
b->tangents[1][0] += t3[j];
b->tangents[1][1] += t4[j];
b->tangents[1][2] += t5[j];
} else {
b->normal[0] = n0[j];
b->normal[1] = n1[j];
b->normal[2] = n2[j];
b->tangents[0][0] = t0[j];
b->tangents[0][1] = t1[j];
b->tangents[0][2] = t2[j];
b->tangents[1][0] = t3[j];
b->tangents[1][1] = t4[j];
b->tangents[1][2] = t5[j];
used[v1] = true;
}
if ( used[v2] ) {
c->normal[0] += n0[j];
c->normal[1] += n1[j];
c->normal[2] += n2[j];
c->tangents[0][0] += t0[j];
c->tangents[0][1] += t1[j];
c->tangents[0][2] += t2[j];
c->tangents[1][0] += t3[j];
c->tangents[1][1] += t4[j];
c->tangents[1][2] += t5[j];
} else {
c->normal[0] = n0[j];
c->normal[1] = n1[j];
c->normal[2] = n2[j];
c->tangents[0][0] = t0[j];
c->tangents[0][1] = t1[j];
c->tangents[0][2] = t2[j];
c->tangents[1][0] = t3[j];
c->tangents[1][1] = t4[j];
c->tangents[1][2] = t5[j];
used[v2] = true;
}
}
}
for ( ; i < numIndexes; i += 3 ) {
idDrawVert *a, *b, *c;
ALIGN16( unsigned long signBit[4] );
float d0, d1, d2, d3, d4;
float d5, d6, d7, d8, d9;
float n0, n1, n2;
float t0, t1, t2;
float t3, t4, t5;
const int v0 = indexes[i + 0];
const int v1 = indexes[i + 1];
const int v2 = indexes[i + 2];
a = verts + v0;
b = verts + v1;
c = verts + v2;
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
d3 = b->st[0] - a->st[0];
d4 = b->st[1] - a->st[1];
d5 = c->xyz[0] - a->xyz[0];
d6 = c->xyz[1] - a->xyz[1];
d7 = c->xyz[2] - a->xyz[2];
d8 = c->st[0] - a->st[0];
d9 = c->st[1] - a->st[1];
#if 1
__asm {
// normal
movss xmm0, d6
mulss xmm0, d2
movss xmm1, d7
mulss xmm1, d1
subss xmm0, xmm1
movss xmm1, d7
mulss xmm1, d0
movss xmm2, d5
mulss xmm2, d2
subss xmm1, xmm2
movss xmm2, d5
mulss xmm2, d1
movss xmm3, d6
mulss xmm3, d0
subss xmm2, xmm3
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
movss n0, xmm0
mulss xmm1, xmm3
movss n1, xmm1
mulss xmm2, xmm3
movss n2, xmm2
// area sign bit
movss xmm0, d3
mulss xmm0, d9
movss xmm1, d4
mulss xmm1, d8
subss xmm0, xmm1
andps xmm0, SIMD_SP_signBitMask
movaps signBit, xmm0
// first tangent
movss xmm0, d0
mulss xmm0, d9
movss xmm1, d4
mulss xmm1, d5
subss xmm0, xmm1
movss xmm1, d1
mulss xmm1, d9
movss xmm2, d4
mulss xmm2, d6
subss xmm1, xmm2
movss xmm2, d2
mulss xmm2, d9
movss xmm3, d4
mulss xmm3, d7
subss xmm2, xmm3
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
xorps xmm3, signBit
mulss xmm0, xmm3
movss t0, xmm0
mulss xmm1, xmm3
movss t1, xmm1
mulss xmm2, xmm3
movss t2, xmm2
// second tangent
movss xmm0, d3
mulss xmm0, d5
movss xmm1, d0
mulss xmm1, d8
subss xmm0, xmm1
movss xmm1, d3
mulss xmm1, d6
movss xmm2, d1
mulss xmm2, d8
subss xmm1, xmm2
movss xmm2, d3
mulss xmm2, d7
movss xmm3, d2
mulss xmm3, d8
subss xmm2, xmm3
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
xorps xmm3, signBit
mulss xmm0, xmm3
movss t3, xmm0
mulss xmm1, xmm3
movss t4, xmm1
mulss xmm2, xmm3
movss t5, xmm2
}
#else
float tmp;
// normal
n0 = d6 * d2 - d7 * d1;
n1 = d7 * d0 - d5 * d2;
n2 = d5 * d1 - d6 * d0;
tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
n0 *= tmp;
n1 *= tmp;
n2 *= tmp;
// area sign bit
tmp = d3 * d9 - d4 * d8;
signBit[0] = ( *(unsigned long *)&tmp ) & ( 1 << 31 );
// first tangent
t0 = d0 * d9 - d4 * d5;
t1 = d1 * d9 - d4 * d6;
t2 = d2 * d9 - d4 * d7;
tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
*(unsigned long *)&tmp ^= signBit[0];
t0 *= tmp;
t1 *= tmp;
t2 *= tmp;
// second tangent
t3 = d3 * d5 - d0 * d8;
t4 = d3 * d6 - d1 * d8;
t5 = d3 * d7 - d2 * d8;
tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
*(unsigned long *)&tmp ^= signBit[0];
t3 *= tmp;
t4 *= tmp;
t5 *= tmp;
#endif
planes->Normal()[0] = n0;
planes->Normal()[1] = n1;
planes->Normal()[2] = n2;
planes->FitThroughPoint( a->xyz );
planes++;
if ( used[v0] ) {
a->normal[0] += n0;
a->normal[1] += n1;
a->normal[2] += n2;
a->tangents[0][0] += t0;
a->tangents[0][1] += t1;
a->tangents[0][2] += t2;
a->tangents[1][0] += t3;
a->tangents[1][1] += t4;
a->tangents[1][2] += t5;
} else {
a->normal[0] = n0;
a->normal[1] = n1;
a->normal[2] = n2;
a->tangents[0][0] = t0;
a->tangents[0][1] = t1;
a->tangents[0][2] = t2;
a->tangents[1][0] = t3;
a->tangents[1][1] = t4;
a->tangents[1][2] = t5;
used[v0] = true;
}
if ( used[v1] ) {
b->normal[0] += n0;
b->normal[1] += n1;
b->normal[2] += n2;
b->tangents[0][0] += t0;
b->tangents[0][1] += t1;
b->tangents[0][2] += t2;
b->tangents[1][0] += t3;
b->tangents[1][1] += t4;
b->tangents[1][2] += t5;
} else {
b->normal[0] = n0;
b->normal[1] = n1;
b->normal[2] = n2;
b->tangents[0][0] = t0;
b->tangents[0][1] = t1;
b->tangents[0][2] = t2;
b->tangents[1][0] = t3;
b->tangents[1][1] = t4;
b->tangents[1][2] = t5;
used[v1] = true;
}
if ( used[v2] ) {
c->normal[0] += n0;
c->normal[1] += n1;
c->normal[2] += n2;
c->tangents[0][0] += t0;
c->tangents[0][1] += t1;
c->tangents[0][2] += t2;
c->tangents[1][0] += t3;
c->tangents[1][1] += t4;
c->tangents[1][2] += t5;
} else {
c->normal[0] = n0;
c->normal[1] = n1;
c->normal[2] = n2;
c->tangents[0][0] = t0;
c->tangents[0][1] = t1;
c->tangents[0][2] = t2;
c->tangents[1][0] = t3;
c->tangents[1][1] = t4;
c->tangents[1][2] = t5;
used[v2] = true;
}
}
}
/*
============
idSIMD_SSE::DeriveUnsmoothedTangents
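
verts[i] receives the normal and tangents of its dominant triangle, scaled
by the precomputed dominantTris[i].normalizationScale factors instead of
being normalized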
============
*/
#define DERIVE_UNSMOOTHED_BITANGENT
void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
int i, j;
for ( i = 0; i <= numVerts - 4; i += 4 ) {
ALIGN16( float s0[4] );
ALIGN16( float s1[4] );
ALIGN16( float s2[4] );
ALIGN16( float d0[4] );
ALIGN16( float d1[4] );
ALIGN16( float d2[4] );
ALIGN16( float d3[4] );
ALIGN16( float d4[4] );
ALIGN16( float d5[4] );
ALIGN16( float d6[4] );
ALIGN16( float d7[4] );
ALIGN16( float d8[4] );
ALIGN16( float d9[4] );
ALIGN16( float n0[4] );
ALIGN16( float n1[4] );
ALIGN16( float n2[4] );
ALIGN16( float t0[4] );
ALIGN16( float t1[4] );
ALIGN16( float t2[4] );
ALIGN16( float t3[4] );
ALIGN16( float t4[4] );
ALIGN16( float t5[4] );
for ( j = 0; j < 4; j++ ) {
const idDrawVert *a, *b, *c;
const dominantTri_s &dt = dominantTris[i+j];
s0[j] = dt.normalizationScale[0];
s1[j] = dt.normalizationScale[1];
s2[j] = dt.normalizationScale[2];
a = verts + i + j;
b = verts + dt.v2;
c = verts + dt.v3;
d0[j] = b->xyz[0] - a->xyz[0];
d1[j] = b->xyz[1] - a->xyz[1];
d2[j] = b->xyz[2] - a->xyz[2];
d3[j] = b->st[0] - a->st[0];
d4[j] = b->st[1] - a->st[1];
d5[j] = c->xyz[0] - a->xyz[0];
d6[j] = c->xyz[1] - a->xyz[1];
d7[j] = c->xyz[2] - a->xyz[2];
d8[j] = c->st[0] - a->st[0];
d9[j] = c->st[1] - a->st[1];
}
#if 1
__asm {
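// normal = s2 * ( c - a ).Cross( b - a ); the first tangent is scaled by
// s0 and the second tangent (bitangent) by s1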
movaps xmm0, d6
mulps xmm0, d2
movaps xmm1, d7
mulps xmm1, d1
movaps xmm2, d7
mulps xmm2, d0
movaps xmm3, d5
mulps xmm3, d2
movaps xmm4, d5
mulps xmm4, d1
movaps xmm5, d6
mulps xmm5, d0
subps xmm0, xmm1
subps xmm2, xmm3
movaps xmm7, s2
subps xmm4, xmm5
mulps xmm0, xmm7
movaps n0, xmm0
mulps xmm2, xmm7
movaps n1, xmm2
mulps xmm4, xmm7
movaps n2, xmm4
movaps xmm0, d0
mulps xmm0, d9
movaps xmm1, d4
mulps xmm1, d5
movaps xmm2, d1
mulps xmm2, d9
movaps xmm3, d4
mulps xmm3, d6
movaps xmm4, d2
mulps xmm4, d9
movaps xmm5, d4
mulps xmm5, d7
subps xmm0, xmm1
subps xmm2, xmm3
movaps xmm7, s0
subps xmm4, xmm5
mulps xmm0, xmm7
movaps t0, xmm0
mulps xmm2, xmm7
movaps t1, xmm2
mulps xmm4, xmm7
movaps t2, xmm4
#ifndef DERIVE_UNSMOOTHED_BITANGENT
movaps xmm0, d3
mulps xmm0, d5
movaps xmm1, d0
mulps xmm1, d8
movaps xmm2, d3
mulps xmm2, d6
movaps xmm3, d1
mulps xmm3, d8
movaps xmm4, d3
mulps xmm4, d7
movaps xmm5, d2
mulps xmm5, d8
#else
movaps xmm0, n2
mulps xmm0, t1
movaps xmm1, n1
mulps xmm1, t2
movaps xmm2, n0
mulps xmm2, t2
movaps xmm3, n2
mulps xmm3, t0
movaps xmm4, n1
mulps xmm4, t0
movaps xmm5, n0
mulps xmm5, t1
#endif
subps xmm0, xmm1
subps xmm2, xmm3
movaps xmm7, s1
subps xmm4, xmm5
mulps xmm0, xmm7
movaps t3, xmm0
mulps xmm2, xmm7
movaps t4, xmm2
mulps xmm4, xmm7
movaps t5, xmm4
}
#else
n0[0] = d6[0] * d2[0];
n0[1] = d6[1] * d2[1];
n0[2] = d6[2] * d2[2];
n0[3] = d6[3] * d2[3];
n1[0] = d7[0] * d0[0];
n1[1] = d7[1] * d0[1];
n1[2] = d7[2] * d0[2];
n1[3] = d7[3] * d0[3];
n2[0] = d5[0] * d1[0];
n2[1] = d5[1] * d1[1];
n2[2] = d5[2] * d1[2];
n2[3] = d5[3] * d1[3];
n0[0] -= d7[0] * d1[0];
n0[1] -= d7[1] * d1[1];
n0[2] -= d7[2] * d1[2];
n0[3] -= d7[3] * d1[3];
n1[0] -= d5[0] * d2[0];
n1[1] -= d5[1] * d2[1];
n1[2] -= d5[2] * d2[2];
n1[3] -= d5[3] * d2[3];
n2[0] -= d6[0] * d0[0];
n2[1] -= d6[1] * d0[1];
n2[2] -= d6[2] * d0[2];
n2[3] -= d6[3] * d0[3];
n0[0] *= s2[0];
n0[1] *= s2[1];
n0[2] *= s2[2];
n0[3] *= s2[3];
n1[0] *= s2[0];
n1[1] *= s2[1];
n1[2] *= s2[2];
n1[3] *= s2[3];
n2[0] *= s2[0];
n2[1] *= s2[1];
n2[2] *= s2[2];
n2[3] *= s2[3];
t0[0] = d0[0] * d9[0];
t0[1] = d0[1] * d9[1];
t0[2] = d0[2] * d9[2];
t0[3] = d0[3] * d9[3];
t1[0] = d1[0] * d9[0];
t1[1] = d1[1] * d9[1];
t1[2] = d1[2] * d9[2];
t1[3] = d1[3] * d9[3];
t2[0] = d2[0] * d9[0];
t2[1] = d2[1] * d9[1];
t2[2] = d2[2] * d9[2];
t2[3] = d2[3] * d9[3];
t0[0] -= d4[0] * d5[0];
t0[1] -= d4[1] * d5[1];
t0[2] -= d4[2] * d5[2];
t0[3] -= d4[3] * d5[3];
t1[0] -= d4[0] * d6[0];
t1[1] -= d4[1] * d6[1];
t1[2] -= d4[2] * d6[2];
t1[3] -= d4[3] * d6[3];
t2[0] -= d4[0] * d7[0];
t2[1] -= d4[1] * d7[1];
t2[2] -= d4[2] * d7[2];
t2[3] -= d4[3] * d7[3];
t0[0] *= s0[0];
t0[1] *= s0[1];
t0[2] *= s0[2];
t0[3] *= s0[3];
t1[0] *= s0[0];
t1[1] *= s0[1];
t1[2] *= s0[2];
t1[3] *= s0[3];
t2[0] *= s0[0];
t2[1] *= s0[1];
t2[2] *= s0[2];
t2[3] *= s0[3];
#ifndef DERIVE_UNSMOOTHED_BITANGENT
t3[0] = d3[0] * d5[0];
t3[1] = d3[1] * d5[1];
t3[2] = d3[2] * d5[2];
t3[3] = d3[3] * d5[3];
t4[0] = d3[0] * d6[0];
t4[1] = d3[1] * d6[1];
t4[2] = d3[2] * d6[2];
t4[3] = d3[3] * d6[3];
t5[0] = d3[0] * d7[0];
t5[1] = d3[1] * d7[1];
t5[2] = d3[2] * d7[2];
t5[3] = d3[3] * d7[3];
t3[0] -= d0[0] * d8[0];
t3[1] -= d0[1] * d8[1];
t3[2] -= d0[2] * d8[2];
t3[3] -= d0[3] * d8[3];
t4[0] -= d1[0] * d8[0];
t4[1] -= d1[1] * d8[1];
t4[2] -= d1[2] * d8[2];
t4[3] -= d1[3] * d8[3];
t5[0] -= d2[0] * d8[0];
t5[1] -= d2[1] * d8[1];
t5[2] -= d2[2] * d8[2];
t5[3] -= d2[3] * d8[3];
#else
t3[0] = n2[0] * t1[0];
t3[1] = n2[1] * t1[1];
t3[2] = n2[2] * t1[2];
t3[3] = n2[3] * t1[3];
t4[0] = n0[0] * t2[0];
t4[1] = n0[1] * t2[1];
t4[2] = n0[2] * t2[2];
t4[3] = n0[3] * t2[3];
t5[0] = n1[0] * t0[0];
t5[1] = n1[1] * t0[1];
t5[2] = n1[2] * t0[2];
t5[3] = n1[3] * t0[3];
t3[0] -= n1[0] * t2[0];
t3[1] -= n1[1] * t2[1];
t3[2] -= n1[2] * t2[2];
t3[3] -= n1[3] * t2[3];
t4[0] -= n2[0] * t0[0];
t4[1] -= n2[1] * t0[1];
t4[2] -= n2[2] * t0[2];
t4[3] -= n2[3] * t0[3];
t5[0] -= n0[0] * t1[0];
t5[1] -= n0[1] * t1[1];
t5[2] -= n0[2] * t1[2];
t5[3] -= n0[3] * t1[3];
#endif
t3[0] *= s1[0];
t3[1] *= s1[1];
t3[2] *= s1[2];
t3[3] *= s1[3];
t4[0] *= s1[0];
t4[1] *= s1[1];
t4[2] *= s1[2];
t4[3] *= s1[3];
t5[0] *= s1[0];
t5[1] *= s1[1];
t5[2] *= s1[2];
t5[3] *= s1[3];
#endif
for ( j = 0; j < 4; j++ ) {
idDrawVert *a;
a = verts + i + j;
a->normal[0] = n0[j];
a->normal[1] = n1[j];
a->normal[2] = n2[j];
a->tangents[0][0] = t0[j];
a->tangents[0][1] = t1[j];
a->tangents[0][2] = t2[j];
a->tangents[1][0] = t3[j];
a->tangents[1][1] = t4[j];
a->tangents[1][2] = t5[j];
}
}
for ( ; i < numVerts; i++ ) {
idDrawVert *a, *b, *c;
float d0, d1, d2, d3, d4;
float d5, d6, d7, d8, d9;
float s0, s1, s2;
float n0, n1, n2;
float t0, t1, t2;
float t3, t4, t5;
const dominantTri_s &dt = dominantTris[i];
s0 = dt.normalizationScale[0];
s1 = dt.normalizationScale[1];
s2 = dt.normalizationScale[2];
a = verts + i;
b = verts + dt.v2;
c = verts + dt.v3;
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
d3 = b->st[0] - a->st[0];
d4 = b->st[1] - a->st[1];
d5 = c->xyz[0] - a->xyz[0];
d6 = c->xyz[1] - a->xyz[1];
d7 = c->xyz[2] - a->xyz[2];
d8 = c->st[0] - a->st[0];
d9 = c->st[1] - a->st[1];
#if 1
__asm {
movss xmm0, d6
mulss xmm0, d2
movss xmm1, d7
mulss xmm1, d1
movss xmm2, d7
mulss xmm2, d0
movss xmm3, d5
mulss xmm3, d2
movss xmm4, d5
mulss xmm4, d1
movss xmm5, d6
mulss xmm5, d0
subss xmm0, xmm1
subss xmm2, xmm3
movss xmm7, s2
subss xmm4, xmm5
mulss xmm0, xmm7
movss n0, xmm0
mulss xmm2, xmm7
movss n1, xmm2
mulss xmm4, xmm7
movss n2, xmm4
movss xmm0, d0
mulss xmm0, d9
movss xmm1, d4
mulss xmm1, d5
movss xmm2, d1
mulss xmm2, d9
movss xmm3, d4
mulss xmm3, d6
movss xmm4, d2
mulss xmm4, d9
movss xmm5, d4
mulss xmm5, d7
subss xmm0, xmm1
subss xmm2, xmm3
movss xmm7, s0
subss xmm4, xmm5
mulss xmm0, xmm7
movss t0, xmm0
mulss xmm2, xmm7
movss t1, xmm2
mulss xmm4, xmm7
movss t2, xmm4
#ifndef DERIVE_UNSMOOTHED_BITANGENT
movss xmm0, d3
mulss xmm0, d5
movss xmm1, d0
mulss xmm1, d8
movss xmm2, d3
mulss xmm2, d6
movss xmm3, d1
mulss xmm3, d8
movss xmm4, d3
mulss xmm4, d7
movss xmm5, d2
mulss xmm5, d8
#else
movss xmm0, n2
mulss xmm0, t1
movss xmm1, n1
mulss xmm1, t2
movss xmm2, n0
mulss xmm2, t2
movss xmm3, n2
mulss xmm3, t0
movss xmm4, n1
mulss xmm4, t0
movss xmm5, n0
mulss xmm5, t1
#endif
subss xmm0, xmm1
subss xmm2, xmm3
movss xmm7, s1
subss xmm4, xmm5
mulss xmm0, xmm7
movss t3, xmm0
mulss xmm2, xmm7
movss t4, xmm2
mulss xmm4, xmm7
movss t5, xmm4
}
#else
n0 = s2 * ( d6 * d2 - d7 * d1 );
n1 = s2 * ( d7 * d0 - d5 * d2 );
n2 = s2 * ( d5 * d1 - d6 * d0 );
t0 = s0 * ( d0 * d9 - d4 * d5 );
t1 = s0 * ( d1 * d9 - d4 * d6 );
t2 = s0 * ( d2 * d9 - d4 * d7 );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
t3 = s1 * ( d3 * d5 - d0 * d8 );
t4 = s1 * ( d3 * d6 - d1 * d8 );
t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
t3 = s1 * ( n2 * t1 - n1 * t2 );
t4 = s1 * ( n0 * t2 - n2 * t0 );
t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif
#endif
a->normal[0] = n0;
a->normal[1] = n1;
a->normal[2] = n2;
a->tangents[0][0] = t0;
a->tangents[0][1] = t1;
a->tangents[0][2] = t2;
a->tangents[1][0] = t3;
a->tangents[1][1] = t4;
a->tangents[1][2] = t5;
}
}
/*
============
idSIMD_SSE::NormalizeTangents
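
normalizes the vertex normals and projects the tangents onto the plane
orthogonal to the vertex normal before normalizing them as well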
============
*/
void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
ALIGN16( float normal[12] );
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
assert( verts != NULL );
assert( numVerts >= 0 );
__asm {
mov eax, numVerts
test eax, eax
jz done
#ifdef REFINE_TANGENT_SQUAREROOT
movaps xmm6, SIMD_SP_rsqrt_c0
movaps xmm7, SIMD_SP_rsqrt_c1
#endif
mov esi, verts
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
add eax, DRAWVERT_SIZE*4
jle loopVert4
sub eax, DRAWVERT_SIZE*4
jl loopVert1
loopVert4:
sub eax, DRAWVERT_SIZE*4
// normalize 4 idDrawVert::normal
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0] // 0, X, X, X
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0] // 0, X, 3, 4
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8] // 5, X, X, X
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4] // 5, X, 1, 2
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0] // 6, X, X, X
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0] // 6, X, 9, 10
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8] // 11, X, X, X
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4] // 11, X, 7, 8
movaps xmm1, xmm0
movaps xmm5, xmm2
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
// save the 4 idDrawVert::normal to project the tangents
movaps [normal+ 0], xmm0
movaps [normal+16], xmm1
movaps [normal+32], xmm2
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2
// project and normalize 4 idDrawVert::tangent[0]
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, X, X
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, 3, 4
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8] // 5, X, X, X
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4] // 5, X, 1, 2
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, X, X
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, 9, 10
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8] // 11, X, X, X
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4] // 11, X, 7, 8
movaps xmm1, xmm0
movaps xmm5, xmm2
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
addps xmm3, xmm4
addps xmm3, xmm5
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
subps xmm0, xmm3
subps xmm1, xmm4
subps xmm2, xmm5
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2
// project and normalize 4 idDrawVert::tangent[1]
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, X, X
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, 3, 4
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8] // 5, X, X, X
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4] // 5, X, 1, 2
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, X, X
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, 9, 10
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8] // 11, X, X, X
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4] // 11, X, 7, 8
movaps xmm1, xmm0
movaps xmm5, xmm2
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
addps xmm3, xmm4
addps xmm3, xmm5
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
subps xmm0, xmm3
subps xmm1, xmm4
subps xmm2, xmm5
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2
add eax, DRAWVERT_SIZE*8
jle loopVert4
sub eax, DRAWVERT_SIZE*4
jge done
loopVert1:
// normalize one idDrawVert::normal
movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2
// project and normalize one idDrawVert::tangent[0]
movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
addss xmm3, xmm4
addss xmm3, xmm5
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
subss xmm0, xmm3
subss xmm1, xmm4
subss xmm2, xmm5
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2
// project and normalize one idDrawVert::tangent[1]
movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
addss xmm3, xmm4
addss xmm3, xmm5
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
subss xmm0, xmm3
subss xmm1, xmm4
subss xmm2, xmm5
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2
add eax, DRAWVERT_SIZE
jl loopVert1
done:
}
}
/*
============
idSIMD_SSE::CreateTextureSpaceLightVectors
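
  For every vertex referenced by the indexes, computes the vector from the
  vertex to the light origin and transforms it into the vertex's texture
  (tangent) space: ( tangent[0], tangent[1], normal ).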
============
*/
void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
for ( int i = numIndexes - 1; i >= 0; i-- ) {
used[indexes[i]] = true;
}
#if 0
__asm {
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
mov ecx, lightOrigin
movss xmm7, [ecx+0]
movhps xmm7, [ecx+4]
mov ecx, lightVectors
sub ecx, 3*4
loopVert:
inc eax
jge done
add edi, DRAWVERT_SIZE
add ecx, 3*4
cmp byte ptr [esi+eax], 0
je loopVert
movaps xmm0, xmm7
movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
subps xmm0, xmm1
// 0, X, 1, 2
// 3, X, 4, 5
// 6, X, 7, 8
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
mulps xmm2, xmm0
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
mulps xmm3, xmm0
movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
mulps xmm4, xmm0
movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
addps xmm5, xmm4
addps xmm5, xmm2
movlps [ecx+0], xmm5
shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
movss [ecx+8], xmm5
jmp loopVert
done:
}
#elif 1
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
idVec3 lightDir;
lightDir[0] = lightOrigin[0] - v->xyz[0];
lightDir[1] = lightOrigin[1] - v->xyz[1];
lightDir[2] = lightOrigin[2] - v->xyz[2];
lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
}
#elif 1
ALIGN16( int usedVertNums[4] );
ALIGN16( float lightDir0[4] );
ALIGN16( float lightDir1[4] );
ALIGN16( float lightDir2[4] );
ALIGN16( float normal0[4] );
ALIGN16( float normal1[4] );
ALIGN16( float normal2[4] );
ALIGN16( float tangent0[4] );
ALIGN16( float tangent1[4] );
ALIGN16( float tangent2[4] );
ALIGN16( float tangent3[4] );
ALIGN16( float tangent4[4] );
ALIGN16( float tangent5[4] );
idVec3 localLightOrigin = lightOrigin;
__asm {
xor ecx, ecx
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
loopVert4:
inc eax
jge done4
add edi, DRAWVERT_SIZE
cmp byte ptr [esi+eax], 0
je loopVert4
mov usedVertNums[ecx*4], eax
inc ecx
cmp ecx, 4
movss xmm0, localLightOrigin[0]
movss xmm1, localLightOrigin[4]
movss xmm2, localLightOrigin[8]
subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
movss lightDir0[ecx*4-4], xmm0
movss lightDir1[ecx*4-4], xmm1
movss lightDir2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
movss normal0[ecx*4-4], xmm3
movss normal1[ecx*4-4], xmm4
movss normal2[ecx*4-4], xmm5
movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
movss tangent0[ecx*4-4], xmm0
movss tangent1[ecx*4-4], xmm1
movss tangent2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
movss tangent3[ecx*4-4], xmm3
movss tangent4[ecx*4-4], xmm4
movss tangent5[ecx*4-4], xmm5
jl loopVert4
movaps xmm0, lightDir0
movaps xmm1, lightDir1
movaps xmm2, lightDir2
movaps xmm3, tangent0
mulps xmm3, xmm0
movaps xmm4, tangent1
mulps xmm4, xmm1
movaps xmm5, tangent2
mulps xmm5, xmm2
addps xmm3, xmm4
addps xmm5, xmm3
movaps xmm3, tangent3
mulps xmm3, xmm0
movaps xmm4, tangent4
mulps xmm4, xmm1
movaps xmm6, tangent5
mulps xmm6, xmm2
addps xmm3, xmm4
addps xmm6, xmm3
mulps xmm0, normal0
mulps xmm1, normal1
mulps xmm2, normal2
addps xmm0, xmm1
addps xmm0, xmm2
mov ecx, numVerts
imul ecx, 12
mov edx, usedVertNums[0]
add ecx, lightVectors
imul edx, 12
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[4]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
imul edx, 12
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[8]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
imul edx, 12
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[12]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
imul edx, 12
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
xor ecx, ecx
jmp loopVert4
done4:
test ecx, ecx
jz done
xor eax, eax
mov edi, numVerts
imul edi, 12
add edi, lightVectors
loopVert1:
movss xmm0, lightDir0[eax*4]
movss xmm1, lightDir1[eax*4]
movss xmm2, lightDir2[eax*4]
mov edx, usedVertNums[eax*4]
imul edx, 12
movss xmm3, tangent0[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent1[eax*4]
mulss xmm4, xmm1
movss xmm5, tangent2[eax*4]
mulss xmm5, xmm2
addss xmm3, xmm4
addss xmm5, xmm3
movss [edi+edx+0], xmm5
movss xmm3, tangent3[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent4[eax*4]
mulss xmm4, xmm1
movss xmm6, tangent5[eax*4]
mulss xmm6, xmm2
addss xmm3, xmm4
addss xmm6, xmm3
movss [edi+edx+4], xmm6
mulss xmm0, normal0[eax*4]
mulss xmm1, normal1[eax*4]
mulss xmm2, normal2[eax*4]
addss xmm0, xmm1
addss xmm0, xmm2
movss [edi+edx+8], xmm0
inc eax
dec ecx
jg loopVert1
done:
}
#else
	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	ALIGN16( float lightVectors0[4] );
	ALIGN16( float lightVectors1[4] );
	ALIGN16( float lightVectors2[4] );
int numUsedVerts = 0;
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
normal0[numUsedVerts] = v->normal[0];
normal1[numUsedVerts] = v->normal[1];
normal2[numUsedVerts] = v->normal[2];
tangent0[numUsedVerts] = v->tangents[0][0];
tangent1[numUsedVerts] = v->tangents[0][1];
tangent2[numUsedVerts] = v->tangents[0][2];
tangent3[numUsedVerts] = v->tangents[1][0];
tangent4[numUsedVerts] = v->tangents[1][1];
tangent5[numUsedVerts] = v->tangents[1][2];
usedVertNums[numUsedVerts++] = i;
if ( numUsedVerts < 4 ) {
continue;
}
lightVectors0[0] = lightDir0[0] * tangent0[0];
lightVectors0[1] = lightDir0[1] * tangent0[1];
lightVectors0[2] = lightDir0[2] * tangent0[2];
lightVectors0[3] = lightDir0[3] * tangent0[3];
lightVectors0[0] += lightDir1[0] * tangent1[0];
lightVectors0[1] += lightDir1[1] * tangent1[1];
lightVectors0[2] += lightDir1[2] * tangent1[2];
lightVectors0[3] += lightDir1[3] * tangent1[3];
lightVectors0[0] += lightDir2[0] * tangent2[0];
lightVectors0[1] += lightDir2[1] * tangent2[1];
lightVectors0[2] += lightDir2[2] * tangent2[2];
lightVectors0[3] += lightDir2[3] * tangent2[3];
lightVectors1[0] = lightDir0[0] * tangent3[0];
lightVectors1[1] = lightDir0[1] * tangent3[1];
lightVectors1[2] = lightDir0[2] * tangent3[2];
lightVectors1[3] = lightDir0[3] * tangent3[3];
lightVectors1[0] += lightDir1[0] * tangent4[0];
lightVectors1[1] += lightDir1[1] * tangent4[1];
lightVectors1[2] += lightDir1[2] * tangent4[2];
lightVectors1[3] += lightDir1[3] * tangent4[3];
lightVectors1[0] += lightDir2[0] * tangent5[0];
lightVectors1[1] += lightDir2[1] * tangent5[1];
lightVectors1[2] += lightDir2[2] * tangent5[2];
lightVectors1[3] += lightDir2[3] * tangent5[3];
lightVectors2[0] = lightDir0[0] * normal0[0];
lightVectors2[1] = lightDir0[1] * normal0[1];
lightVectors2[2] = lightDir0[2] * normal0[2];
lightVectors2[3] = lightDir0[3] * normal0[3];
lightVectors2[0] += lightDir1[0] * normal1[0];
lightVectors2[1] += lightDir1[1] * normal1[1];
lightVectors2[2] += lightDir1[2] * normal1[2];
lightVectors2[3] += lightDir1[3] * normal1[3];
lightVectors2[0] += lightDir2[0] * normal2[0];
lightVectors2[1] += lightDir2[1] * normal2[1];
lightVectors2[2] += lightDir2[2] * normal2[2];
lightVectors2[3] += lightDir2[3] * normal2[3];
for ( int j = 0; j < 4; j++ ) {
int n = usedVertNums[j];
lightVectors[n][0] = lightVectors0[j];
lightVectors[n][1] = lightVectors1[j];
lightVectors[n][2] = lightVectors2[j];
}
numUsedVerts = 0;
}
for ( int i = 0; i < numUsedVerts; i++ ) {
lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
int n = usedVertNums[i];
lightVectors[n][0] = lightVectors0[i];
lightVectors[n][1] = lightVectors1[i];
lightVectors[n][2] = lightVectors2[i];
}
#endif
}
/*
============
idSIMD_SSE::CreateSpecularTextureCoords
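
  For every vertex referenced by the indexes, adds the normalized
  vertex-to-light and vertex-to-view directions to get a half angle vector
  and transforms it into the vertex's texture (tangent) space.
  The w coordinate is set to 1.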
============
*/
void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
for ( int i = numIndexes - 1; i >= 0; i-- ) {
used[indexes[i]] = true;
}
#if 0
__asm {
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
mov ecx, viewOrigin
movss xmm6, [ecx+0]
movhps xmm6, [ecx+4]
mov ecx, lightOrigin
movss xmm7, [ecx+0]
movhps xmm7, [ecx+4]
mov ecx, texCoords
sub ecx, 4*4
loopVert:
inc eax
jge done
add edi, DRAWVERT_SIZE
add ecx, 4*4
cmp byte ptr [esi+eax], 0
je loopVert
movaps xmm0, xmm7
movaps xmm1, xmm6
movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
subps xmm0, xmm2
subps xmm1, xmm2
movaps xmm3, xmm0
movaps xmm4, xmm1
mulps xmm3, xmm3
mulps xmm4, xmm4
// 0, X, 1, 2
// 3, X, 4, 5
movaps xmm5, xmm3 // xmm5 = 0, X, 1, 2
unpcklps xmm5, xmm4 // xmm5 = 0, 3, X, X
unpckhps xmm3, xmm4 // xmm3 = 1, 4, 2, 5
movhlps xmm4, xmm3 // xmm4 = 2, 5, 4, 5
addps xmm5, xmm3
addps xmm5, xmm4
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
rsqrtps xmm5, xmm5
movaps xmm4, xmm5
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
mulps xmm2, xmm0
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
mulps xmm3, xmm0
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
mulps xmm4, xmm0
movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
movaps xmm3, SIMD_SP_one
addps xmm5, xmm4
addps xmm5, xmm2
movaps [ecx+0], xmm5
movss [ecx+12], xmm3
jmp loopVert
done:
}
#elif 0
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
idVec3 lightDir = lightOrigin - v->xyz;
idVec3 viewDir = viewOrigin - v->xyz;
float ilength;
ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
lightDir[0] *= ilength;
lightDir[1] *= ilength;
lightDir[2] *= ilength;
ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
viewDir[0] *= ilength;
viewDir[1] *= ilength;
viewDir[2] *= ilength;
lightDir += viewDir;
texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
texCoords[i][3] = 1.0f;
}
#elif 1
ALIGN16( int usedVertNums[4] );
ALIGN16( float lightDir0[4] );
ALIGN16( float lightDir1[4] );
ALIGN16( float lightDir2[4] );
ALIGN16( float viewDir0[4] );
ALIGN16( float viewDir1[4] );
ALIGN16( float viewDir2[4] );
ALIGN16( float normal0[4] );
ALIGN16( float normal1[4] );
ALIGN16( float normal2[4] );
ALIGN16( float tangent0[4] );
ALIGN16( float tangent1[4] );
ALIGN16( float tangent2[4] );
ALIGN16( float tangent3[4] );
ALIGN16( float tangent4[4] );
ALIGN16( float tangent5[4] );
idVec3 localLightOrigin = lightOrigin;
idVec3 localViewOrigin = viewOrigin;
__asm {
xor ecx, ecx
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
loopVert4:
inc eax
jge done4
add edi, DRAWVERT_SIZE
cmp byte ptr [esi+eax], 0
je loopVert4
mov usedVertNums[ecx*4], eax
inc ecx
cmp ecx, 4
movss xmm3, localLightOrigin[0]
movss xmm4, localLightOrigin[4]
movss xmm5, localLightOrigin[8]
subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
movss lightDir0[ecx*4-4], xmm3
movss lightDir1[ecx*4-4], xmm4
movss lightDir2[ecx*4-4], xmm5
movss xmm0, localViewOrigin[0]
movss xmm1, localViewOrigin[4]
movss xmm2, localViewOrigin[8]
subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
movss viewDir0[ecx*4-4], xmm0
movss viewDir1[ecx*4-4], xmm1
movss viewDir2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
movss normal0[ecx*4-4], xmm3
movss normal1[ecx*4-4], xmm4
movss normal2[ecx*4-4], xmm5
movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
movss tangent0[ecx*4-4], xmm0
movss tangent1[ecx*4-4], xmm1
movss tangent2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
movss tangent3[ecx*4-4], xmm3
movss tangent4[ecx*4-4], xmm4
movss tangent5[ecx*4-4], xmm5
jl loopVert4
movaps xmm6, lightDir0
movaps xmm0, xmm6
mulps xmm6, xmm6
movaps xmm7, lightDir1
movaps xmm1, xmm7
mulps xmm7, xmm7
addps xmm6, xmm7
movaps xmm5, lightDir2
movaps xmm2, xmm5
mulps xmm5, xmm5
addps xmm6, xmm5
rsqrtps xmm6, xmm6
mulps xmm0, xmm6
mulps xmm1, xmm6
mulps xmm2, xmm6
movaps xmm3, viewDir0
movaps xmm7, xmm3
mulps xmm7, xmm7
movaps xmm4, viewDir1
movaps xmm6, xmm4
mulps xmm6, xmm6
addps xmm7, xmm6
movaps xmm5, viewDir2
movaps xmm6, xmm5
mulps xmm6, xmm6
addps xmm7, xmm6
rsqrtps xmm7, xmm7
mulps xmm3, xmm7
addps xmm0, xmm3
mulps xmm4, xmm7
addps xmm1, xmm4
mulps xmm5, xmm7
addps xmm2, xmm5
movaps xmm3, tangent0
mulps xmm3, xmm0
movaps xmm4, tangent1
mulps xmm4, xmm1
addps xmm3, xmm4
movaps xmm5, tangent2
mulps xmm5, xmm2
addps xmm5, xmm3
movaps xmm3, tangent3
mulps xmm3, xmm0
movaps xmm4, tangent4
mulps xmm4, xmm1
addps xmm3, xmm4
movaps xmm6, tangent5
mulps xmm6, xmm2
addps xmm6, xmm3
mulps xmm0, normal0
mulps xmm1, normal1
addps xmm0, xmm1
mulps xmm2, normal2
addps xmm0, xmm2
mov ecx, numVerts
shl ecx, 4
mov edx, usedVertNums[0]
add ecx, texCoords
shl edx, 4
movss xmm3, SIMD_SP_one
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[4]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shl edx, 4
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[8]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shl edx, 4
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[12]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shl edx, 4
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
xor ecx, ecx
jmp loopVert4
done4:
test ecx, ecx
jz done
xor eax, eax
mov edi, numVerts
shl edi, 4
add edi, texCoords
loopVert1:
movss xmm6, lightDir0[eax*4]
movss xmm0, xmm6
mulss xmm6, xmm6
movss xmm7, lightDir1[eax*4]
movss xmm1, xmm7
mulss xmm7, xmm7
addss xmm6, xmm7
movss xmm5, lightDir2[eax*4]
movss xmm2, xmm5
mulss xmm5, xmm5
addss xmm6, xmm5
rsqrtss xmm6, xmm6
mulss xmm0, xmm6
mulss xmm1, xmm6
mulss xmm2, xmm6
movss xmm3, viewDir0[eax*4]
movss xmm7, xmm3
mulss xmm7, xmm7
movss xmm4, viewDir1[eax*4]
movss xmm6, xmm4
mulss xmm6, xmm6
addss xmm7, xmm6
movss xmm5, viewDir2[eax*4]
movss xmm6, xmm5
mulss xmm6, xmm6
addss xmm7, xmm6
rsqrtss xmm7, xmm7
mulss xmm3, xmm7
addss xmm0, xmm3
mulss xmm4, xmm7
addss xmm1, xmm4
mulss xmm5, xmm7
addss xmm2, xmm5
mov edx, usedVertNums[eax*4]
shl edx, 4
movss xmm3, tangent0[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent1[eax*4]
mulss xmm4, xmm1
addss xmm3, xmm4
movss xmm5, tangent2[eax*4]
mulss xmm5, xmm2
addss xmm5, xmm3
movss [edi+edx+0], xmm5
movss xmm3, tangent3[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent4[eax*4]
mulss xmm4, xmm1
addss xmm3, xmm4
movss xmm6, tangent5[eax*4]
mulss xmm6, xmm2
addss xmm6, xmm3
movss [edi+edx+4], xmm6
mulss xmm0, normal0[eax*4]
mulss xmm1, normal1[eax*4]
addss xmm0, xmm1
mulss xmm2, normal2[eax*4]
addss xmm0, xmm2
movss [edi+edx+8], xmm0
movss xmm3, SIMD_SP_one
movss [edi+edx+12], xmm3
inc eax
dec ecx
jg loopVert1
done:
}
#else
ALIGN16( int usedVertNums[4] );
ALIGN16( float lightDir0[4] );
ALIGN16( float lightDir1[4] );
ALIGN16( float lightDir2[4] );
ALIGN16( float viewDir0[4] );
ALIGN16( float viewDir1[4] );
ALIGN16( float viewDir2[4] );
ALIGN16( float normal0[4] );
ALIGN16( float normal1[4] );
ALIGN16( float normal2[4] );
ALIGN16( float tangent0[4] );
ALIGN16( float tangent1[4] );
ALIGN16( float tangent2[4] );
ALIGN16( float tangent3[4] );
ALIGN16( float tangent4[4] );
ALIGN16( float tangent5[4] );
ALIGN16( float texCoords0[4] );
ALIGN16( float texCoords1[4] );
ALIGN16( float texCoords2[4] );
idVec3 localLightOrigin = lightOrigin;
idVec3 localViewOrigin = viewOrigin;
int numUsedVerts = 0;
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];
viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];
normal0[numUsedVerts] = v->normal[0];
normal1[numUsedVerts] = v->normal[1];
normal2[numUsedVerts] = v->normal[2];
tangent0[numUsedVerts] = v->tangents[0][0];
tangent1[numUsedVerts] = v->tangents[0][1];
tangent2[numUsedVerts] = v->tangents[0][2];
tangent3[numUsedVerts] = v->tangents[1][0];
tangent4[numUsedVerts] = v->tangents[1][1];
tangent5[numUsedVerts] = v->tangents[1][2];
usedVertNums[numUsedVerts++] = i;
if ( numUsedVerts < 4 ) {
continue;
}
ALIGN16( float temp[4] );
temp[0] = lightDir0[0] * lightDir0[0];
temp[1] = lightDir0[1] * lightDir0[1];
temp[2] = lightDir0[2] * lightDir0[2];
temp[3] = lightDir0[3] * lightDir0[3];
temp[0] += lightDir1[0] * lightDir1[0];
temp[1] += lightDir1[1] * lightDir1[1];
temp[2] += lightDir1[2] * lightDir1[2];
temp[3] += lightDir1[3] * lightDir1[3];
temp[0] += lightDir2[0] * lightDir2[0];
temp[1] += lightDir2[1] * lightDir2[1];
temp[2] += lightDir2[2] * lightDir2[2];
temp[3] += lightDir2[3] * lightDir2[3];
temp[0] = idMath::RSqrt( temp[0] );
temp[1] = idMath::RSqrt( temp[1] );
temp[2] = idMath::RSqrt( temp[2] );
temp[3] = idMath::RSqrt( temp[3] );
lightDir0[0] *= temp[0];
lightDir0[1] *= temp[1];
lightDir0[2] *= temp[2];
lightDir0[3] *= temp[3];
lightDir1[0] *= temp[0];
lightDir1[1] *= temp[1];
lightDir1[2] *= temp[2];
lightDir1[3] *= temp[3];
lightDir2[0] *= temp[0];
lightDir2[1] *= temp[1];
lightDir2[2] *= temp[2];
lightDir2[3] *= temp[3];
temp[0] = viewDir0[0] * viewDir0[0];
temp[1] = viewDir0[1] * viewDir0[1];
temp[2] = viewDir0[2] * viewDir0[2];
temp[3] = viewDir0[3] * viewDir0[3];
temp[0] += viewDir1[0] * viewDir1[0];
temp[1] += viewDir1[1] * viewDir1[1];
temp[2] += viewDir1[2] * viewDir1[2];
temp[3] += viewDir1[3] * viewDir1[3];
temp[0] += viewDir2[0] * viewDir2[0];
temp[1] += viewDir2[1] * viewDir2[1];
temp[2] += viewDir2[2] * viewDir2[2];
temp[3] += viewDir2[3] * viewDir2[3];
temp[0] = idMath::RSqrt( temp[0] );
temp[1] = idMath::RSqrt( temp[1] );
temp[2] = idMath::RSqrt( temp[2] );
temp[3] = idMath::RSqrt( temp[3] );
viewDir0[0] *= temp[0];
viewDir0[1] *= temp[1];
viewDir0[2] *= temp[2];
viewDir0[3] *= temp[3];
viewDir1[0] *= temp[0];
viewDir1[1] *= temp[1];
viewDir1[2] *= temp[2];
viewDir1[3] *= temp[3];
viewDir2[0] *= temp[0];
viewDir2[1] *= temp[1];
viewDir2[2] *= temp[2];
viewDir2[3] *= temp[3];
lightDir0[0] += viewDir0[0];
lightDir0[1] += viewDir0[1];
lightDir0[2] += viewDir0[2];
lightDir0[3] += viewDir0[3];
lightDir1[0] += viewDir1[0];
lightDir1[1] += viewDir1[1];
lightDir1[2] += viewDir1[2];
lightDir1[3] += viewDir1[3];
lightDir2[0] += viewDir2[0];
lightDir2[1] += viewDir2[1];
lightDir2[2] += viewDir2[2];
lightDir2[3] += viewDir2[3];
texCoords0[0] = lightDir0[0] * tangent0[0];
texCoords0[1] = lightDir0[1] * tangent0[1];
texCoords0[2] = lightDir0[2] * tangent0[2];
texCoords0[3] = lightDir0[3] * tangent0[3];
texCoords0[0] += lightDir1[0] * tangent1[0];
texCoords0[1] += lightDir1[1] * tangent1[1];
texCoords0[2] += lightDir1[2] * tangent1[2];
texCoords0[3] += lightDir1[3] * tangent1[3];
texCoords0[0] += lightDir2[0] * tangent2[0];
texCoords0[1] += lightDir2[1] * tangent2[1];
texCoords0[2] += lightDir2[2] * tangent2[2];
texCoords0[3] += lightDir2[3] * tangent2[3];
texCoords1[0] = lightDir0[0] * tangent3[0];
texCoords1[1] = lightDir0[1] * tangent3[1];
texCoords1[2] = lightDir0[2] * tangent3[2];
texCoords1[3] = lightDir0[3] * tangent3[3];
texCoords1[0] += lightDir1[0] * tangent4[0];
texCoords1[1] += lightDir1[1] * tangent4[1];
texCoords1[2] += lightDir1[2] * tangent4[2];
texCoords1[3] += lightDir1[3] * tangent4[3];
texCoords1[0] += lightDir2[0] * tangent5[0];
texCoords1[1] += lightDir2[1] * tangent5[1];
texCoords1[2] += lightDir2[2] * tangent5[2];
texCoords1[3] += lightDir2[3] * tangent5[3];
texCoords2[0] = lightDir0[0] * normal0[0];
texCoords2[1] = lightDir0[1] * normal0[1];
texCoords2[2] = lightDir0[2] * normal0[2];
texCoords2[3] = lightDir0[3] * normal0[3];
texCoords2[0] += lightDir1[0] * normal1[0];
texCoords2[1] += lightDir1[1] * normal1[1];
texCoords2[2] += lightDir1[2] * normal1[2];
texCoords2[3] += lightDir1[3] * normal1[3];
texCoords2[0] += lightDir2[0] * normal2[0];
texCoords2[1] += lightDir2[1] * normal2[1];
texCoords2[2] += lightDir2[2] * normal2[2];
texCoords2[3] += lightDir2[3] * normal2[3];
for ( int j = 0; j < 4; j++ ) {
int n = usedVertNums[j];
texCoords[n][0] = texCoords0[j];
texCoords[n][1] = texCoords1[j];
texCoords[n][2] = texCoords2[j];
texCoords[n][3] = 1.0f;
}
numUsedVerts = 0;
}
for ( int i = 0; i < numUsedVerts; i++ ) {
float temp;
temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
temp = idMath::RSqrt( temp );
lightDir0[i] *= temp;
lightDir1[i] *= temp;
lightDir2[i] *= temp;
temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
temp = idMath::RSqrt( temp );
viewDir0[i] *= temp;
viewDir1[i] *= temp;
viewDir2[i] *= temp;
lightDir0[i] += viewDir0[i];
lightDir1[i] += viewDir1[i];
lightDir2[i] += viewDir2[i];
texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
int n = usedVertNums[i];
		texCoords[n][0] = texCoords0[i];
		texCoords[n][1] = texCoords1[i];
		texCoords[n][2] = texCoords2[i];
texCoords[n][3] = 1.0f;
}
#endif
}
/*
============
idSIMD_SSE::CreateShadowCache
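
  For every vertex with a zero vertRemap entry, writes two entries to the
  vertex cache: the vertex itself with w = 1 and the vertex minus the light
  origin with w = 0, and stores the cache index in vertRemap.
  Returns the number of cache entries written.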
============
*/
int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
#if 1
int outVerts;
__asm {
push ebx
mov esi, lightOrigin
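		// SIMD_SP_lastOne = ( 0, 0, 0, 1.0f ); OR'ing it in sets the w component to 1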
movaps xmm5, SIMD_SP_lastOne
movss xmm6, [esi+0]
movhps xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
orps xmm6, SIMD_SP_lastOne
movaps xmm7, xmm6
xor ebx, ebx
xor ecx, ecx
mov edx, vertRemap
mov esi, verts
mov edi, vertexCache
mov eax, numVerts
and eax, ~3
jz done4
shl eax, 2
add edx, eax
neg eax
loop4:
prefetchnta [edx+128]
prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
cmp dword ptr [edx+eax+0], ebx
jne skip1
mov dword ptr [edx+eax+0], ecx
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
add ecx, 2
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
orps xmm0, xmm5
movaps [edi+0*16], xmm0
subps xmm0, xmm6
movaps [edi+1*16], xmm0
add edi, 2*16
skip1:
cmp dword ptr [edx+eax+4], ebx
jne skip2
mov dword ptr [edx+eax+4], ecx
movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
add ecx, 2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
orps xmm1, xmm5
movaps [edi+0*16], xmm1
subps xmm1, xmm7
movaps [edi+1*16], xmm1
add edi, 2*16
skip2:
cmp dword ptr [edx+eax+8], ebx
jne skip3
mov dword ptr [edx+eax+8], ecx
movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
add ecx, 2
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
orps xmm2, xmm5
movaps [edi+0*16], xmm2
subps xmm2, xmm6
movaps [edi+1*16], xmm2
add edi, 2*16
skip3:
cmp dword ptr [edx+eax+12], ebx
jne skip4
mov dword ptr [edx+eax+12], ecx
movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
add ecx, 2
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
orps xmm3, xmm5
movaps [edi+0*16], xmm3
subps xmm3, xmm7
movaps [edi+1*16], xmm3
add edi, 2*16
skip4:
add esi, 4*DRAWVERT_SIZE
add eax, 4*4
jl loop4
done4:
mov eax, numVerts
and eax, 3
jz done1
shl eax, 2
add edx, eax
neg eax
loop1:
cmp dword ptr [edx+eax+0], ebx
jne skip0
mov dword ptr [edx+eax+0], ecx
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
add ecx, 2
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
orps xmm0, xmm5
movaps [edi+0*16], xmm0
subps xmm0, xmm6
movaps [edi+1*16], xmm0
add edi, 2*16
skip0:
add esi, DRAWVERT_SIZE
add eax, 4
jl loop1
done1:
pop ebx
mov outVerts, ecx
}
return outVerts;
#else
int outVerts = 0;
for ( int i = 0; i < numVerts; i++ ) {
if ( vertRemap[i] ) {
continue;
}
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[outVerts+0][0] = v[0];
vertexCache[outVerts+0][1] = v[1];
vertexCache[outVerts+0][2] = v[2];
vertexCache[outVerts+0][3] = 1.0f;
// R_SetupProjection() builds the projection matrix with a slight crunch
// for depth, which keeps this w=0 division from rasterizing right at the
// wrap around point and causing depth fighting with the rear caps
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
vertexCache[outVerts+1][3] = 0.0f;
vertRemap[i] = outVerts;
outVerts += 2;
}
return outVerts;
#endif
}
/*
============
idSIMD_SSE::CreateVertexProgramShadowCache
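
  Writes every vertex to the cache twice: once with w = 1 and once with w = 0,
  leaving the projection away from the light to a vertex program.
  Returns numVerts * 2.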
============
*/
int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
#if 1
__asm {
movaps xmm4, SIMD_SP_lastOne
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
mov esi, verts
mov edi, vertexCache
mov eax, numVerts
and eax, ~3
jz done4
shl eax, 5
add edi, eax
neg eax
loop4:
prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps [edi+eax+1*16], xmm0
orps xmm0, xmm4
movaps [edi+eax+0*16], xmm0
movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
movaps [edi+eax+3*16], xmm1
orps xmm1, xmm5
movaps [edi+eax+2*16], xmm1
movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps [edi+eax+5*16], xmm2
orps xmm2, xmm6
movaps [edi+eax+4*16], xmm2
movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
movaps [edi+eax+7*16], xmm3
orps xmm3, xmm7
movaps [edi+eax+6*16], xmm3
add esi, 4*DRAWVERT_SIZE
add eax, 4*8*4
jl loop4
done4:
mov eax, numVerts
and eax, 3
jz done1
shl eax, 5
add edi, eax
neg eax
loop1:
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps [edi+eax+1*16], xmm0
orps xmm0, xmm4
movaps [edi+eax+0*16], xmm0
add esi, DRAWVERT_SIZE
add eax, 8*4
jl loop1
done1:
}
return numVerts * 2;
#else
for ( int i = 0; i < numVerts; i++ ) {
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[i*2+0][0] = v[0];
vertexCache[i*2+0][1] = v[1];
vertexCache[i*2+0][2] = v[2];
vertexCache[i*2+0][3] = 1.0f;
vertexCache[i*2+1][0] = v[0];
vertexCache[i*2+1][1] = v[1];
vertexCache[i*2+1][2] = v[2];
vertexCache[i*2+1][3] = 0.0f;
}
return numVerts * 2;
#endif
}
/*
============
SSE_UpSample11kHzMonoPCMTo44kHz
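
  Converts each 16 bit sample to a float and writes it four times.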
============
*/
static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 2*4*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-2*4*4+0], xmm0
movhps [edi-2*4*4+8], xmm0
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-1*4*4+0], xmm1
movhps [edi-1*4*4+8], xmm1
add eax, 2*2
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movsx ecx, word ptr [esi]
cvtsi2ss xmm0, ecx
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi+0], xmm0
movhps [edi+8], xmm0
done:
}
}
/*
============
SSE_UpSample11kHzStereoPCMTo44kHz
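
  Converts each 16 bit ( left, right ) sample pair to floats and writes the pair four times.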
============
*/
static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
test eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 8*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
unpcklps xmm0, xmm1
movlps [edi-8*4+0], xmm0
movlps [edi-8*4+8], xmm0
movlps [edi-4*4+0], xmm0
movlps [edi-4*4+8], xmm0
add eax, 2*2
jl loop2
done2:
}
}
/*
============
SSE_UpSample22kHzMonoPCMTo44kHz
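
  Converts each 16 bit sample to a float and writes it twice.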
============
*/
static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 4*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-4*4+0], xmm0
movhps [edi-4*4+8], xmm0
add eax, 2*2
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movsx ecx, word ptr [esi]
cvtsi2ss xmm0, ecx
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi], xmm0
done:
}
}
/*
============
SSE_UpSample22kHzStereoPCMTo44kHz
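
  Converts each 16 bit ( left, right ) sample pair to floats and writes the pair twice.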
============
*/
static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
test eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 4*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movss [edi-4*4], xmm0
movss [edi-2*4], xmm0
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
movss [edi-3*4], xmm1
movss [edi-1*4], xmm1
add eax, 2*2
jl loop2
done2:
}
}
/*
============
SSE_UpSample44kHzMonoPCMTo44kHz
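
  Straight 1:1 conversion from 16 bit samples to floats.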
============
*/
static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 2*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movss [edi-2*4], xmm0
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
movss [edi-1*4], xmm1
add eax, 2*2
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movsx ecx, word ptr [esi]
cvtsi2ss xmm0, ecx
movss [edi], xmm0
done:
}
}
/*
============
idSIMD_SSE::UpSamplePCMTo44kHz
Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
if ( kHz == 11025 ) {
if ( numChannels == 1 ) {
SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
} else {
SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
}
} else if ( kHz == 22050 ) {
if ( numChannels == 1 ) {
SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
} else {
SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
}
} else if ( kHz == 44100 ) {
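		// the 44kHz conversion is 1:1, so the same routine works for any channel count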
SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
} else {
assert( 0 );
}
}
/*
============
SSE_UpSample11kHzMonoOGGTo44kHz
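
  Scales each decoded sample by 32768 and writes it four times.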
============
*/
static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 2
add esi, eax
neg eax
align 16
loop2:
add edi, 2*16
movss xmm0, [esi+eax+0]
mulss xmm0, xmm7
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-32], xmm0
movlps [edi-24], xmm0
movss xmm1, [esi+eax+4]
mulss xmm1, xmm7
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-16], xmm1
movlps [edi- 8], xmm1
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [esi]
mulss xmm0, xmm7
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi+0], xmm0
movlps [edi+8], xmm0
done:
}
}
/*
============
SSE_UpSample11kHzStereoOGGTo44kHz
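
  Interleaves the two decoded channels, scales by 32768 and writes each sample pair four times.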
============
*/
static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov ecx, [esi+0]
mov edx, [esi+4]
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add ecx, eax
add edx, eax
neg eax
align 16
loop2:
add edi, 4*16
movlps xmm0, [ecx+eax]
movlps xmm1, [edx+eax]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi-8*8], xmm0
movlps [edi-7*8], xmm0
movlps [edi-6*8], xmm0
movlps [edi-5*8], xmm0
movhps [edi-4*8], xmm0
movhps [edi-3*8], xmm0
movhps [edi-2*8], xmm0
movhps [edi-1*8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [ecx]
movss xmm1, [edx]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi+0*8], xmm0
movlps [edi+1*8], xmm0
movlps [edi+2*8], xmm0
movlps [edi+3*8], xmm0
done:
}
}
/*
============
SSE_UpSample22kHzMonoOGGTo44kHz
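
  Scales each decoded sample by 32768 and writes it twice.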
============
*/
static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 2
add esi, eax
neg eax
align 16
loop2:
add edi, 2*8
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, xmm7
movlps [edi-16], xmm0
movhps [edi- 8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [esi]
mulss xmm0, xmm7
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi+0], xmm0
done:
}
}
/*
============
SSE_UpSample22kHzStereoOGGTo44kHz
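
  Interleaves the two decoded channels, scales by 32768 and writes each sample pair twice.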
============
*/
static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov ecx, [esi+0]
mov edx, [esi+4]
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add ecx, eax
add edx, eax
neg eax
align 16
loop2:
add edi, 2*16
movlps xmm0, [ecx+eax]
movlps xmm1, [edx+eax]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi-4*8], xmm0
movlps [edi-3*8], xmm0
movhps [edi-2*8], xmm0
movhps [edi-1*8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [ecx]
movss xmm1, [edx]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi+0*8], xmm0
movlps [edi+1*8], xmm0
done:
}
}
/*
============
SSE_UpSample44kHzMonoOGGTo44kHz
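
  Scales the decoded samples by 32768 ( 1:1, no duplication ).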
============
*/
static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
float constant = 32768.0f;
KFLOAT_CA( mul, dest, src, constant, numSamples )
}
/*
============
SSE_UpSample44kHzStereoOGGTo44kHz
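
  Interleaves the two decoded channels and scales by 32768 ( 1:1, no duplication ).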
============
*/
static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov ecx, [esi+0]
mov edx, [esi+4]
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add ecx, eax
add edx, eax
neg eax
align 16
loop2:
add edi, 16
movlps xmm0, [ecx+eax]
movlps xmm1, [edx+eax]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi-2*8], xmm0
movhps [edi-1*8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [ecx]
movss xmm1, [edx]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi+0*8], xmm0
done:
}
}
/*
============
idSIMD_SSE::UpSampleOGGTo44kHz
Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
if ( kHz == 11025 ) {
if ( numChannels == 1 ) {
SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
} else {
SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
}
} else if ( kHz == 22050 ) {
if ( numChannels == 1 ) {
SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
} else {
SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
}
} else if ( kHz == 44100 ) {
if ( numChannels == 1 ) {
SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
} else {
SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
}
} else {
assert( 0 );
}
}
/*
============
idSIMD_SSE::MixSoundTwoSpeakerMono
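
  Mixes a mono sample buffer into an interleaved stereo mix buffer. The left
  and right volumes are linearly ramped from lastV to currentV over the buffer.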
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
ALIGN16( float incs[2] );
assert( numSamples == MIXBUFFER_SAMPLES );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add esi, eax
neg eax
mov ecx, lastV
movlps xmm6, [ecx]
xorps xmm7, xmm7
movhps xmm7, incs
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
addps xmm6, xmm7
shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
addps xmm7, xmm7
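		// xmm6 = ramped ( L, R, L, R ) volumes for two output frames, xmm7 = per-iteration volume increment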
loop16:
add edi, 4*4*4
movaps xmm0, [esi+eax+0*4*4]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
mulps xmm0, xmm6
addps xmm0, [edi-4*4*4]
addps xmm6, xmm7
movaps [edi-4*4*4], xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
mulps xmm1, xmm6
addps xmm1, [edi-3*4*4]
addps xmm6, xmm7
movaps [edi-3*4*4], xmm1
movaps xmm2, [esi+eax+1*4*4]
movaps xmm3, xmm2
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
mulps xmm2, xmm6
addps xmm2, [edi-2*4*4]
addps xmm6, xmm7
movaps [edi-2*4*4], xmm2
shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
mulps xmm3, xmm6
addps xmm3, [edi-1*4*4]
addps xmm6, xmm7
movaps [edi-1*4*4], xmm3
add eax, 2*4*4
jl loop16
}
#else
int i;
float incL;
float incR;
float sL0, sL1;
float sR0, sR1;
assert( numSamples == MIXBUFFER_SAMPLES );
incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sR0 = lastV[1];
sL1 = lastV[0] + incL;
sR1 = lastV[1] + incR;
incL *= 2;
incR *= 2;
for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
mixBuffer[i*2+0] += samples[i+0] * sL0;
mixBuffer[i*2+1] += samples[i+0] * sR0;
mixBuffer[i*2+2] += samples[i+1] * sL1;
mixBuffer[i*2+3] += samples[i+1] * sR1;
sL0 += incL;
sR0 += incR;
sL1 += incL;
sR1 += incR;
}
#endif
}
/*
============
idSIMD_SSE::MixSoundTwoSpeakerStereo
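
  Mixes an interleaved stereo sample buffer into an interleaved stereo mix
  buffer with the volumes linearly ramped from lastV to currentV over the buffer.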
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
ALIGN16( float incs[2] );
assert( numSamples == MIXBUFFER_SAMPLES );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 3
add esi, eax
neg eax
mov ecx, lastV
movlps xmm6, [ecx]
xorps xmm7, xmm7
movhps xmm7, incs
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
addps xmm6, xmm7
shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
addps xmm7, xmm7
loop16:
add edi, 4*4*4
movaps xmm0, [esi+eax+0*4*4]
mulps xmm0, xmm6
addps xmm0, [edi-4*4*4]
addps xmm6, xmm7
movaps [edi-4*4*4], xmm0
movaps xmm2, [esi+eax+1*4*4]
mulps xmm2, xmm6
addps xmm2, [edi-3*4*4]
addps xmm6, xmm7
movaps [edi-3*4*4], xmm2
movaps xmm3, [esi+eax+2*4*4]
mulps xmm3, xmm6
addps xmm3, [edi-2*4*4]
addps xmm6, xmm7
movaps [edi-2*4*4], xmm3
movaps xmm4, [esi+eax+3*4*4]
mulps xmm4, xmm6
addps xmm4, [edi-1*4*4]
addps xmm6, xmm7
movaps [edi-1*4*4], xmm4
add eax, 4*4*4
jl loop16
}
#else
int i;
float incL;
float incR;
float sL0, sL1;
float sR0, sR1;
assert( numSamples == MIXBUFFER_SAMPLES );
incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sR0 = lastV[1];
sL1 = lastV[0] + incL;
sR1 = lastV[1] + incR;
incL *= 2;
incR *= 2;
for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
mixBuffer[i*2+0] += samples[i*2+0] * sL0;
mixBuffer[i*2+1] += samples[i*2+1] * sR0;
mixBuffer[i*2+2] += samples[i*2+2] * sL1;
mixBuffer[i*2+3] += samples[i*2+3] * sR1;
sL0 += incL;
sR0 += incR;
sL1 += incL;
sR1 += incR;
}
#endif
}
/*
============
idSIMD_SSE::MixSoundSixSpeakerMono
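
  Mixes a mono sample buffer into a 6 channel interleaved mix buffer with each
  channel volume linearly ramped from lastV to currentV over the buffer.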
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
ALIGN16( float incs[6] );
assert( numSamples == MIXBUFFER_SAMPLES );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add esi, eax
neg eax
mov ecx, lastV
movlps xmm2, [ecx+ 0]
movhps xmm2, [ecx+ 8]
movlps xmm3, [ecx+16]
movaps xmm4, xmm2
shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
xorps xmm5, xmm5
movhps xmm5, incs
movlps xmm7, incs+8
movhps xmm7, incs+16
addps xmm3, xmm5
addps xmm4, xmm7
shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps xmm6, xmm7
shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm5, xmm5
addps xmm6, xmm6
addps xmm7, xmm7
loop24:
add edi, 6*16
movaps xmm0, [esi+eax]
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm2
addps xmm1, [edi-6*16]
addps xmm2, xmm5
movaps [edi-6*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
mulps xmm1, xmm3
addps xmm1, [edi-5*16]
addps xmm3, xmm6
movaps [edi-5*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm1, xmm4
addps xmm1, [edi-4*16]
addps xmm4, xmm7
movaps [edi-4*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
mulps xmm1, xmm2
addps xmm1, [edi-3*16]
addps xmm2, xmm5
movaps [edi-3*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
mulps xmm1, xmm3
addps xmm1, [edi-2*16]
addps xmm3, xmm6
movaps [edi-2*16], xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
mulps xmm0, xmm4
addps xmm0, [edi-1*16]
addps xmm4, xmm7
movaps [edi-1*16], xmm0
add eax, 4*4
jl loop24
}
#else
int i;
float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
float incL0, incL1, incL2, incL3, incL4, incL5;
assert( numSamples == MIXBUFFER_SAMPLES );
incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sL1 = lastV[1];
sL2 = lastV[2];
sL3 = lastV[3];
sL4 = lastV[4];
sL5 = lastV[5];
sL6 = lastV[0] + incL0;
sL7 = lastV[1] + incL1;
sL8 = lastV[2] + incL2;
sL9 = lastV[3] + incL3;
sL10 = lastV[4] + incL4;
sL11 = lastV[5] + incL5;
incL0 *= 2;
incL1 *= 2;
incL2 *= 2;
incL3 *= 2;
incL4 *= 2;
incL5 *= 2;
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
mixBuffer[i*6+ 0] += samples[i+0] * sL0;
mixBuffer[i*6+ 1] += samples[i+0] * sL1;
mixBuffer[i*6+ 2] += samples[i+0] * sL2;
mixBuffer[i*6+ 3] += samples[i+0] * sL3;
mixBuffer[i*6+ 4] += samples[i+0] * sL4;
mixBuffer[i*6+ 5] += samples[i+0] * sL5;
mixBuffer[i*6+ 6] += samples[i+1] * sL6;
mixBuffer[i*6+ 7] += samples[i+1] * sL7;
mixBuffer[i*6+ 8] += samples[i+1] * sL8;
mixBuffer[i*6+ 9] += samples[i+1] * sL9;
mixBuffer[i*6+10] += samples[i+1] * sL10;
mixBuffer[i*6+11] += samples[i+1] * sL11;
sL0 += incL0;
sL1 += incL1;
sL2 += incL2;
sL3 += incL3;
sL4 += incL4;
sL5 += incL5;
sL6 += incL0;
sL7 += incL1;
sL8 += incL2;
sL9 += incL3;
sL10 += incL4;
sL11 += incL5;
}
#endif
}
/*
============
idSIMD_SSE::MixSoundSixSpeakerStereo
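
  Mixes an interleaved stereo sample buffer into a 6 channel interleaved mix
  buffer; the left sample feeds channels 0, 2, 3 and 4 and the right sample
  feeds channels 1 and 5 (see the SPEAKER_RIGHT and SPEAKER_BACKRIGHT asserts).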
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
ALIGN16( float incs[6] );
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 3
add esi, eax
neg eax
mov ecx, lastV
movlps xmm2, [ecx+ 0]
movhps xmm2, [ecx+ 8]
movlps xmm3, [ecx+16]
movaps xmm4, xmm2
shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
xorps xmm5, xmm5
movhps xmm5, incs
movlps xmm7, incs+ 8
movhps xmm7, incs+16
addps xmm3, xmm5
addps xmm4, xmm7
shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps xmm6, xmm7
shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm5, xmm5
addps xmm6, xmm6
addps xmm7, xmm7
loop12:
add edi, 3*16
movaps xmm0, [esi+eax+0]
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
mulps xmm1, xmm2
addps xmm1, [edi-3*16]
addps xmm2, xmm5
movaps [edi-3*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
mulps xmm1, xmm3
addps xmm1, [edi-2*16]
addps xmm3, xmm6
movaps [edi-2*16], xmm1
add eax, 4*4
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
mulps xmm0, xmm4
addps xmm0, [edi-1*16]
addps xmm4, xmm7
movaps [edi-1*16], xmm0
jl loop12
emms
}
#else
int i;
float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
float incL0, incL1, incL2, incL3, incL4, incL5;
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sL1 = lastV[1];
sL2 = lastV[2];
sL3 = lastV[3];
sL4 = lastV[4];
sL5 = lastV[5];
sL6 = lastV[0] + incL0;
sL7 = lastV[1] + incL1;
sL8 = lastV[2] + incL2;
sL9 = lastV[3] + incL3;
sL10 = lastV[4] + incL4;
sL11 = lastV[5] + incL5;
incL0 *= 2;
incL1 *= 2;
incL2 *= 2;
incL3 *= 2;
incL4 *= 2;
incL5 *= 2;
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
sL0 += incL0;
sL1 += incL1;
sL2 += incL2;
sL3 += incL3;
sL4 += incL4;
sL5 += incL5;
sL6 += incL0;
sL7 += incL1;
sL8 += incL2;
sL9 += incL3;
sL10 += incL4;
sL11 += incL5;
}
#endif
}
/*
============
idSIMD_SSE::MixedSoundToSamples
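
  Converts the floating point mix buffer to 16 bit samples, clamping to the
  [-32768, 32767] range.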
============
*/
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
#if 1
assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
__asm {
mov eax, numSamples
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add edi, eax
neg eax
loop16:
movaps xmm0, [edi+eax+0*16]
movaps xmm2, [edi+eax+1*16]
movaps xmm4, [edi+eax+2*16]
movaps xmm6, [edi+eax+3*16]
add esi, 4*4*2
movhlps xmm1, xmm0
movhlps xmm3, xmm2
movhlps xmm5, xmm4
movhlps xmm7, xmm6
prefetchnta [edi+eax+64]
cvtps2pi mm0, xmm0
cvtps2pi mm2, xmm2
cvtps2pi mm4, xmm4
cvtps2pi mm6, xmm6
prefetchnta [edi+eax+128]
cvtps2pi mm1, xmm1
cvtps2pi mm3, xmm3
cvtps2pi mm5, xmm5
cvtps2pi mm7, xmm7
add eax, 4*16
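		// packssdw saturates the 32 bit results to the signed 16 bit range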
packssdw mm0, mm1
packssdw mm2, mm3
packssdw mm4, mm5
packssdw mm6, mm7
movq [esi-4*4*2], mm0
movq [esi-3*4*2], mm2
movq [esi-2*4*2], mm4
movq [esi-1*4*2], mm6
jl loop16
emms
}
#else
for ( int i = 0; i < numSamples; i++ ) {
if ( mixBuffer[i] <= -32768.0f ) {
samples[i] = -32768;
} else if ( mixBuffer[i] >= 32767.0f ) {
samples[i] = 32767;
} else {
samples[i] = (short) mixBuffer[i];
}
}
#endif
}
#endif /* _WIN32 */