dhewm3/neo/idlib/math/Simd_SSE3.cpp

/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
#include "Simd_SSE2.h"
#include "Simd_SSE3.h"


//===============================================================
//
//	SSE3 implementation of idSIMDProcessor
//
//===============================================================

#if defined(__GNUC__) && defined(__SSE3__)

/*
============
idSIMD_SSE3::GetName

  Returns the constant, NUL-terminated description string of this
  SIMD processor implementation.
============
*/
const char * idSIMD_SSE3::GetName( void ) const {
	static const char processorName[] = "MMX & SSE & SSE2 & SSE3";
	return processorName;
}

#elif defined(_MSC_VER)

#include <xmmintrin.h>

// Pack lane selectors into a single immediate byte (as the names suggest, for
// use as shufps / shufpd immediates).
//   SHUFFLEPS( x, y, z, w )   : four 2-bit selectors, x in bits 7:6 ... w in bits 1:0
//   R_SHUFFLEPS( x, y, z, w ) : same selectors in reverse order, x in bits 1:0 ... w in bits 7:6
//   SHUFFLEPD( x, y )         : two 1-bit selectors, x in bit 1, y in bit 0
//   R_SHUFFLEPD( x, y )       : same selectors in reverse order, x in bit 0, y in bit 1
// Every selector is masked to its field width before it is shifted into place.
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
#define R_SHUFFLEPS( x, y, z, w )	SHUFFLEPS( w, z, y, x )
#define SHUFFLEPD( x, y )			( ( ( (x) & 1 ) << 1 ) | ( (y) & 1 ) )
#define R_SHUFFLEPD( x, y )			SHUFFLEPD( y, x )

/*

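	The first argument of an instruction macro is the destination
	and the second argument is the source operand. The destination
	operand can be _xmm0 to _xmm7 only. The source operand can be
	any one of the registers _xmm0 to _xmm7, or one of _eax, _ecx, _edx,
	_ebx, _esi, or _edi that contains the effective address.
	Note that _esp and _ebp cannot be used as a bare address register:
	with the encodings used here their codes (0x04 and 0x05) tell the CPU
	that a SIB byte, respectively a 32-bit displacement, follows.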

	For instance:  haddps   xmm0, xmm1
	becomes:       haddps( _xmm0, _xmm1 )
	and:           haddps   xmm0, [esi]
	becomes:       haddps( _xmm0, _esi )

	The ADDRESS_ADDC macro can be used when the effective source address
	is formed by adding a constant to a general purpose register.
	For instance:  haddps   xmm0, [esi+48]
	becomes:       haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )

	The ADDRESS_ADDR macro can be used when the effective source address
	is formed by adding two general purpose registers.
	For instance:  haddps   xmm0, [esi+eax]
	becomes:       haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )

	The ADDRESS_ADDRC macro can be used when the effective source address
	is formed by adding two general purpose registers and a constant.
	The constant must be in the range [-128, 127].
	For instance:  haddps   xmm0, [esi+eax+48]
	becomes:       haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )

	The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
	by adding a scaled general purpose register to another general purpose register.
	The scale must be either 1, 2, 4 or 8.
	For instance:  haddps   xmm0, [esi+eax*4]
	becomes:       haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )

	The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
	by adding a scaled general purpose register to another general purpose register and
	also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
	be in the range [-128, 127].
	For instance:  haddps   xmm0, [esi+eax*4+64]
	becomes:       haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )

*/

// Source/destination operand codes for the hand-encoded instruction macros
// in this file. Each code is OR'ed into the ModRM byte of the instruction.
//
// General purpose registers: the plain 3-bit register number. The two top
// ("mod") bits stay clear, so used as a source operand the register is
// interpreted as holding the address of a memory operand.
#define _eax	0x00
#define _ecx	0x01
#define _edx	0x02
#define _ebx	0x03
#define _esp	0x04
#define _ebp	0x05
#define _esi	0x06
#define _edi	0x07

// XMM registers: 0xC0 + register number. The two set top bits (mod = 11b)
// select a register-direct source operand. When used as a destination the
// instruction macros keep only the low three bits ( dst & 7 ).
#define _xmm0	0xC0
#define _xmm1	0xC1
#define _xmm2	0xC2
#define _xmm3	0xC3
#define _xmm4	0xC4
#define _xmm5	0xC5
#define _xmm6	0xC6
#define _xmm7	0xC7

// Converts an index scale factor (1, 2, 4 or 8) into the two scale bits of a
// SIB byte (bits 7:6): 1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0.
// The argument and the complete expansion are parenthesized so the macro
// yields the same value for expression arguments (e.g. RSCALE( 1 + 1 )) and
// when it is combined with other operators.
#define RSCALE( s )		( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )

// Memory source operand builders for the instruction macros in this file.
// Each macro is passed as the "src" argument of an instruction macro: its
// first token is OR'ed into the ModRM byte emitted by that macro and the
// remaining "_asm _emit" statements append the bytes that follow the ModRM
// byte. The expansions are intentionally NOT wrapped in parentheses, because
// they have to continue the enclosing "_asm _emit" statement.

// [reg0 + constant]: ModRM mod = 01b (0x40) with reg0 as base, followed by
// the constant emitted as a single displacement byte.
#define ADDRESS_ADDC( reg0, constant )						0x40 | ( reg0 & 7 )	\
	_asm _emit constant

// [reg0 + reg1]: ModRM r/m = 100b (0x04) announces a SIB byte; the SIB byte
// carries reg1 in the index field (bits 5:3) and reg0 in the base field.
#define ADDRESS_ADDR( reg0, reg1 )							0x04				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )

// [reg0 + reg1 + constant]: as ADDRESS_ADDR but with mod = 01b (0x44) and a
// trailing single displacement byte.
#define ADDRESS_ADDRC( reg0, reg1, constant )				0x44				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )								\
	_asm _emit constant

// [reg0 + reg1*scale]: as ADDRESS_ADDR with the scale bits from RSCALE OR'ed
// into the SIB byte.
#define ADDRESS_SCALEADDR( reg0, reg1, scale )				0x04				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )

// [reg0 + reg1*scale + constant]: as ADDRESS_SCALEADDR but with mod = 01b
// (0x44) and a trailing single displacement byte.
#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )	0x44				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )			\
	_asm _emit constant


// Packed Single-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
// Emits the bytes F2 0F D0 followed by a ModRM byte built from dst and src.
#define addsubps( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Add/Subtract ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
// Emits the bytes 66 0F D0 followed by a ModRM byte built from dst and src.
#define addsubpd( dst, src )						\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
// Emits the bytes F2 0F 7C followed by a ModRM byte built from dst and src.
#define haddps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
// Emits the bytes 66 0F 7C followed by a ModRM byte built from dst and src.
#define haddpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
// Emits the bytes F2 0F 7D followed by a ModRM byte built from dst and src.
#define hsubps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
// Emits the bytes 66 0F 7D followed by a ModRM byte built from dst and src.
#define hsubpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
// Emits the bytes F3 0F 12 followed by a ModRM byte built from dst and src.
#define movsldup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
// Emits the bytes F2 0F 12 followed by a ModRM byte built from dst and src.
// This is the encoding the Intel manuals list under the mnemonic MOVDDUP.
#define movdldup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
// Emits the bytes F3 0F 16 followed by a ModRM byte built from dst and src.
#define movshdup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
// Emits the bytes F2 0F 16 followed by a ModRM byte built from dst and src.
// NOTE(review): SSE3 documents MOVDDUP (F2 0F 12) but no "high" double-precision
// counterpart; F2 0F 16 does not appear to be a defined SSE3 encoding, so this
// macro may raise an invalid opcode exception - verify before using it.
#define movdhdup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Load Unaligned Integer 128 bits
// Emits the bytes F2 0F F0 followed by a ModRM byte built from dst and src.
// The instruction only exists with a memory source, so src has to be a
// general purpose register code or one of the ADDRESS_* macros, never _xmmN.
#define lddqu( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xF0									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src


// Structure sizes and member offsets, in bytes, hard-coded for use as
// operands in the inline assembly of this file.
// DRAWVERT_SIZE and DRAWVERT_XYZ_OFFSET are checked against idDrawVert by the
// asserts in idSIMD_SSE3::TransformVerts; the remaining DRAWVERT_* offsets are
// not referenced by the code in this file (presumably they mirror the other
// idDrawVert members - confirm against the idDrawVert declaration).
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)

// JOINTMAT_SIZE and JOINTWEIGHT_SIZE are checked against sizeof( idJointMat )
// and sizeof( idVec4 ) by the asserts in idSIMD_SSE3::TransformVerts.
// JOINTQUAT_SIZE is not referenced by the code in this file.
#define JOINTQUAT_SIZE				(7*4)
#define JOINTMAT_SIZE				(4*3*4)
#define JOINTWEIGHT_SIZE			(4*4)


/*
============
SSE3_Dot

  Returns the sum of the four component-wise products of v1 and v2,
  i.e. the 4D dot product. The products are summed pairwise:
  ( p0 + p1 ) + ( p2 + p3 ).

  Both operands are read with unaligned loads, so the vectors do not
  have to be 16-byte aligned.
============
*/
float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
	float d;
	__asm {
		mov		esi, v1						// esi = address of v1
		mov		edi, v2						// edi = address of v2
		movups	xmm0, [esi]					// movups instead of movaps: no alignment fault on unaligned vectors
		movups	xmm1, [edi]
		mulps	xmm0, xmm1					// xmm0 = p0, p1, p2, p3
		// first horizontal add:  xmm0 = p0+p1, p2+p3, p0+p1, p2+p3
		haddps(	_xmm0, _xmm0 )
		// second horizontal add: xmm0[0] = p0+p1+p2+p3
		haddps(	_xmm0, _xmm0 )
		movss	d, xmm0
	}
	return d;
}

/*
============
idSIMD_SSE3::GetName

  Returns the constant, NUL-terminated description string of this
  SIMD processor implementation.
============
*/
const char * idSIMD_SSE3::GetName( void ) const {
	static const char processorName[] = "MMX & SSE & SSE2 & SSE3";
	return processorName;
}

/*
============
idSIMD_SSE3::TransformVerts

  Computes the skinned position of every vertex as the sum of
  ( joint matrix * weight vector ) over the weights that belong to it
  and stores the result in verts[i].xyz. No other vertex member is written.

  verts      - output vertices; 12 bytes are written at the start of each one.
  numVerts   - number of vertices; nothing is done when it is <= 0.
  joints     - joint matrices; read with aligned 16-byte memory operands, so
               the array and every byte offset in index must be 16-byte aligned.
  weights    - one idVec4 per weight, consumed in order; read with aligned
               16-byte loads.
  index      - two ints per weight: [0] is the BYTE offset of the joint matrix
               relative to joints, [1] is non-zero for the last weight of a vertex.
  numWeights - not used by this implementation.
============
*/
void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1

	// the assembly below hard-codes these sizes and offsets
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

	__asm
	{
		mov			eax, numVerts
		test		eax, eax
		// skip zero AND negative counts, like the reference loop in the #else branch
		// (a plain jz would run the loop once for a negative count)
		jle			done
		imul		eax, DRAWVERT_SIZE

		mov			ecx, verts
		mov			edx, index
		mov			esi, weights
		mov			edi, joints

		// ecx = one past the last vertex, eax = negative byte offset counting up to zero
		add			ecx, eax
		neg			eax

	loopVert:
		// first weight of the vertex
		mov			ebx, [edx]								// ebx = byte offset of the joint matrix
		movaps		xmm2, [esi]								// xmm2 = weight vector
		add			edx, 8									// next index pair
		movaps		xmm0, xmm2
		add			esi, JOINTWEIGHT_SIZE					// next weight
		movaps		xmm1, xmm2

		mulps		xmm0, [edi+ebx+ 0]						// xmm0 = m0, m1, m2, t0
		mulps		xmm1, [edi+ebx+16]						// xmm1 = m3, m4, m5, t1
		mulps		xmm2, [edi+ebx+32]						// xmm2 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0					// second int of the pair just consumed

		jne			doneWeight								// non-zero: this was the only weight

	loopWeight:
		// accumulate the remaining weights of the vertex
		mov			ebx, [edx]
		movaps		xmm5, [esi]
		add			edx, 8
		movaps		xmm3, xmm5
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm4, xmm5

		mulps		xmm3, [edi+ebx+ 0]						// xmm3 = m0, m1, m2, t0
		mulps		xmm4, [edi+ebx+16]						// xmm4 = m3, m4, m5, t1
		mulps		xmm5, [edi+ebx+32]						// xmm5 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0					// flags are consumed by the je below

		addps		xmm0, xmm3								// SSE arithmetic leaves EFLAGS untouched
		addps		xmm1, xmm4
		addps		xmm2, xmm5

		je			loopWeight								// zero: more weights follow

	doneWeight:
		// advance to the next vertex; the flags set here are consumed by the jl
		// at the bottom, none of the instructions in between modifies EFLAGS
		add			eax, DRAWVERT_SIZE

		// xmm0 = x01, x23, y01, y23 (pairwise sums of the row products)
		haddps(		_xmm0, _xmm1 )
		// xmm2 = z01, z23, x, y
		haddps(		_xmm2, _xmm0 )

		// store x and y; eax already points to the next vertex, hence -DRAWVERT_SIZE
		movhps		[ecx+eax-DRAWVERT_SIZE+0], xmm2

		// xmm2[0] = z
		haddps(		_xmm2, _xmm2 )

		movss		[ecx+eax-DRAWVERT_SIZE+8], xmm2

		jl			loopVert								// loop while the offset is still negative
	done:
	}

#else

	// reference implementation of the assembly above
	int i, j;
	const byte *jointsPtr = (byte *)joints;

	for( j = i = 0; i < numVerts; i++ ) {
		idVec3 v;

		v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		while( index[j*2+1] == 0 ) {
			j++;
			v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		}
		j++;

		verts[i].xyz = v;
	}

#endif
}

#endif /* __GNUC__ && __SSE3__, or _MSC_VER */
hello world 2011-11-22 21:28:15 +00:00			`/*`
			`===========================================================================`

			`Doom 3 GPL Source Code`
Fix all whitespace errors Excluding 3rd party files. 2011-12-06 18:20:15 +00:00			`Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.`
hello world 2011-11-22 21:28:15 +00:00
Fix quoting in GPL headers 2011-12-06 16:14:59 +00:00			`This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").`
hello world 2011-11-22 21:28:15 +00:00
			`Doom 3 Source Code is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`Doom 3 Source Code is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.`

			`In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.`

			`If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.`

			`===========================================================================`
			`*/`

			`#include "../precompiled.h"`
			`#pragma hdrstop`

			`#include "Simd_Generic.h"`
			`#include "Simd_MMX.h"`
			`#include "Simd_SSE.h"`
			`#include "Simd_SSE2.h"`
			`#include "Simd_SSE3.h"`


			`//===============================================================`
			`//`
			`// SSE3 implementation of idSIMDProcessor`
			`//`
			`//===============================================================`

Always compile all SIMD code Protect all SIMD implementations with the according defines and let the compiler decide if it supports the intructions. Linux will still use Simd_Generic because CPU feature runtime detection is missing. 2011-12-13 00:06:39 +00:00			`#if defined(__GNUC__) && defined(__SSE3__)`
hello world 2011-11-22 21:28:15 +00:00
			`/*`
			`============`
			`idSIMD_SSE3::GetName`
			`============`
			`*/`
			`const char * idSIMD_SSE3::GetName( void ) const {`
			`return "MMX & SSE & SSE2 & SSE3";`
			`}`

Only use MSVC style asm with MSVC 2011-12-13 22:43:27 +00:00			`#elif defined(_MSC_VER)`
hello world 2011-11-22 21:28:15 +00:00
			`#include <xmmintrin.h>`

			`#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 \| ( (y) & 3 ) << 4 \| ( (z) & 3 ) << 2 \| ( (w) & 3 ))`
			`#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 \| ( (z) & 3 ) << 4 \| ( (y) & 3 ) << 2 \| ( (x) & 3 ))`
			`#define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 \| ( (y) & 1 ))`
			`#define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 \| ( (x) & 1 ))`

			`/*`

			`The first argument of an instruction macro is the destination`
			`and the second argument is the source operand. The destination`
			`operand can be _xmm0 to _xmm7 only. The source operand can be`
			`any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,`
			`_ebp, _ebx, _esi, or _edi that contains the effective address.`

			`For instance: haddps xmm0, xmm1`
			`becomes: haddps( _xmm0, _xmm1 )`
			`and: haddps xmm0, [esi]`
			`becomes: haddps( _xmm0, _esi )`

			`The ADDRESS_ADDC macro can be used when the effective source address`
			`is formed by adding a constant to a general purpose register.`
			`For instance: haddps xmm0, [esi+48]`
			`becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )`

			`The ADDRESS_ADDR macro can be used when the effective source address`
			`is formed by adding two general purpose registers.`
			`For instance: haddps xmm0, [esi+eax]`
			`becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )`

			`The ADDRESS_ADDRC macro can be used when the effective source address`
			`is formed by adding two general purpose registers and a constant.`
			`The constant must be in the range [-128, 127].`
			`For instance: haddps xmm0, [esi+eax+48]`
			`becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )`

			`The ADDRESS_SCALEADDR macro can be used when the effective source address is formed`
			`by adding a scaled general purpose register to another general purpose register.`
			`The scale must be either 1, 2, 4 or 8.`
			`For instance: haddps xmm0, [esi+eax*4]`
			`becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )`

			`The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed`
			`by adding a scaled general purpose register to another general purpose register and`
			`also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must`
			`be in the range [-128, 127].`
			`For instance: haddps xmm0, [esi+eax*4+64]`
			`becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )`

			`*/`

			`#define _eax 0x00`
			`#define _ecx 0x01`
			`#define _edx 0x02`
			`#define _ebx 0x03`
			`#define _esp 0x04`
			`#define _ebp 0x05`
			`#define _esi 0x06`
			`#define _edi 0x07`

			`#define _xmm0 0xC0`
			`#define _xmm1 0xC1`
			`#define _xmm2 0xC2`
			`#define _xmm3 0xC3`
			`#define _xmm4 0xC4`
			`#define _xmm5 0xC5`
			`#define _xmm6 0xC6`
			`#define _xmm7 0xC7`

			`#define RSCALE( s ) ( (s&2)<<5 ) \| ( (s&4)<<5 ) \| ( (s&8)<<3 ) \| ( (s&8)<<4 )`

			`#define ADDRESS_ADDC( reg0, constant ) 0x40 \| ( reg0 & 7 ) \`
			`_asm _emit constant`

			`#define ADDRESS_ADDR( reg0, reg1 ) 0x04 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 )`

			`#define ADDRESS_ADDRC( reg0, reg1, constant ) 0x44 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 ) \`
			`_asm _emit constant`

			`#define ADDRESS_SCALEADDR( reg0, reg1, scale ) 0x04 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 ) \| RSCALE( scale )`

			`#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant ) 0x44 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 ) \| RSCALE( scale ) \`
			`_asm _emit constant`


			`// Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )`
			`#define addsubps( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0xD0 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )`
			`#define addsubpd( dst, src ) \`
			`_asm _emit 0x66 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0xD0 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )`
			`#define haddps( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7C \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )`
			`#define haddpd( dst, src ) \`
			`_asm _emit 0x66 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7C \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )`
			`#define hsubps( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7D \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )`
			`#define hsubpd( dst, src ) \`
			`_asm _emit 0x66 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7D \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )`
			`#define movsldup( dst, src ) \`
			`_asm _emit 0xF3 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x12 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )`
			`#define movdldup( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x12 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )`
			`#define movshdup( dst, src ) \`
			`_asm _emit 0xF3 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x16 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )`
			`#define movdhdup( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x16 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Load Unaligned Integer 128 bits`
			`#define lddqu( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0xF0 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`


			`#define DRAWVERT_SIZE 60`
			`#define DRAWVERT_XYZ_OFFSET (0*4)`
			`#define DRAWVERT_ST_OFFSET (3*4)`
			`#define DRAWVERT_NORMAL_OFFSET (5*4)`
			`#define DRAWVERT_TANGENT0_OFFSET (8*4)`
			`#define DRAWVERT_TANGENT1_OFFSET (11*4)`
			`#define DRAWVERT_COLOR_OFFSET (14*4)`

			`#define JOINTQUAT_SIZE (7*4)`
			`#define JOINTMAT_SIZE (4*3*4)`
			`#define JOINTWEIGHT_SIZE (4*4)`


			`/*`
			`============`
			`SSE3_Dot`
			`============`
			`*/`
			`float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {`
			`float d;`
			`__asm {`
			`mov esi, v1`
			`mov edi, v2`
			`movaps xmm0, [esi]`
			`mulps xmm0, [edi]`
			`haddps( _xmm0, _xmm0 )`
			`haddps( _xmm0, _xmm0 )`
			`movss d, xmm0`
			`}`
			`return d;`
			`}`

			`/*`
			`============`
			`idSIMD_SSE3::GetName`
			`============`
			`*/`
			`const char * idSIMD_SSE3::GetName( void ) const {`
			`return "MMX & SSE & SSE2 & SSE3";`
			`}`

			`/*`
			`============`
			`idSIMD_SSE3::TransformVerts`
			`============`
			`*/`
			`void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {`
			`#if 1`

			`assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );`
			`assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );`
			`assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );`
			`assert( sizeof( idJointMat ) == JOINTMAT_SIZE );`

			`__asm`
			`{`
			`mov eax, numVerts`
			`test eax, eax`
			`jz done`
			`imul eax, DRAWVERT_SIZE`

			`mov ecx, verts`
			`mov edx, index`
			`mov esi, weights`
			`mov edi, joints`

			`add ecx, eax`
			`neg eax`

			`loopVert:`
			`mov ebx, [edx]`
			`movaps xmm2, [esi]`
			`add edx, 8`
			`movaps xmm0, xmm2`
			`add esi, JOINTWEIGHT_SIZE`
			`movaps xmm1, xmm2`

			`mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0`
			`mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1`
			`mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2`

			`cmp dword ptr [edx-4], 0`

			`jne doneWeight`

			`loopWeight:`
			`mov ebx, [edx]`
			`movaps xmm5, [esi]`
			`add edx, 8`
			`movaps xmm3, xmm5`
			`add esi, JOINTWEIGHT_SIZE`
			`movaps xmm4, xmm5`

			`mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0`
			`mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1`
			`mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2`

			`cmp dword ptr [edx-4], 0`

			`addps xmm0, xmm3`
			`addps xmm1, xmm4`
			`addps xmm2, xmm5`

			`je loopWeight`

			`doneWeight:`
			`add eax, DRAWVERT_SIZE`

			`haddps( _xmm0, _xmm1 )`
			`haddps( _xmm2, _xmm0 )`

			`movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2`

			`haddps( _xmm2, _xmm2 )`

			`movss [ecx+eax-DRAWVERT_SIZE+8], xmm2`

			`jl loopVert`
			`done:`
			`}`

			`#else`

			`int i, j;`
			`const byte *jointsPtr = (byte *)joints;`

			`for( j = i = 0; i < numVerts; i++ ) {`
			`idVec3 v;`

			`v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];`
			`while( index[j*2+1] == 0 ) {`
			`j++;`
			`v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];`
			`}`
			`j++;`

			`verts[i].xyz = v;`
			`}`

			`#endif`
			`}`

			`#endif /* _WIN32 */`