dhewm3/neo/idlib/math/Simd_SSE3.cpp

/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "sys/platform.h"

#include "idlib/math/Simd_SSE3.h"

//===============================================================
//
//	SSE3 implementation of idSIMDProcessor
//
//===============================================================

#if defined(__GNUC__) && defined(__SSE3__)

/*
============
idSIMD_SSE3::GetName

Returns the fixed, human-readable label naming the instruction sets
this processor implementation is tagged with.
============
*/
const char * idSIMD_SSE3::GetName( void ) const {
	static const char processorName[] = "MMX & SSE & SSE2 & SSE3";
	return processorName;
}

#elif defined(_MSC_VER)

#include <xmmintrin.h>

#include "idlib/geometry/JointTransform.h"
#include "idlib/geometry/DrawVert.h"
#include "idlib/math/Vector.h"

// Selector-byte builders (presumably meant as the immediate of shufps / shufpd; none of them is
// referenced in the visible part of this file).
// SHUFFLEPS packs four 2-bit selectors with x in bits 7:6 down to w in bits 1:0;
// R_SHUFFLEPS packs the same selectors in the opposite order (x in bits 1:0).
// SHUFFLEPD / R_SHUFFLEPD do the same with two 1-bit selectors.
// Each argument is masked to its field width before being shifted into place.
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
#define R_SHUFFLEPS( x, y, z, w )	( ( ( (w) & 3 ) << 6 ) | ( ( (z) & 3 ) << 4 ) | ( ( (y) & 3 ) << 2 ) | ( (x) & 3 ) )
#define SHUFFLEPD( x, y )			( ( ( (x) & 1 ) << 1 ) | ( (y) & 1 ) )
#define R_SHUFFLEPD( x, y )			( ( ( (y) & 1 ) << 1 ) | ( (x) & 1 ) )

/*

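	The first argument of an instruction macro is the destination
	and the second argument is the source operand. The destination
	operand can be _xmm0 to _xmm7 only. The source operand can be
	any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx,
	_ebx, _esi, or _edi that contains the effective address.
	Caution: a bare _esp or _ebp does not address [esp] or [ebp]. In the
	x86 ModR/M encoding an r/m field of 4 with mod=00 announces a SIB byte
	and an r/m field of 5 selects a 32-bit absolute displacement, neither
	of which these macros emit for a plain register source.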

	For instance:  haddps   xmm0, xmm1
	becomes:       haddps( _xmm0, _xmm1 )
	and:           haddps   xmm0, [esi]
	becomes:       haddps( _xmm0, _esi )

	The ADDRESS_ADDC macro can be used when the effective source address
	is formed by adding a constant to a general purpose register.
	For instance:  haddps   xmm0, [esi+48]
	becomes:       haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )

	The ADDRESS_ADDR macro can be used when the effective source address
	is formed by adding two general purpose registers.
	For instance:  haddps   xmm0, [esi+eax]
	becomes:       haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )

	The ADDRESS_ADDRC macro can be used when the effective source address
	is formed by adding two general purpose registers and a constant.
	The constant must be in the range [-128, 127].
	For instance:  haddps   xmm0, [esi+eax+48]
	becomes:       haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )

	The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
	by adding a scaled general purpose register to another general purpose register.
	The scale must be either 1, 2, 4 or 8.
	For instance:  haddps   xmm0, [esi+eax*4]
	becomes:       haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )

	The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
	by adding a scaled general purpose register to another general purpose register and
	also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
	be in the range [-128, 127].
	For instance:  haddps   xmm0, [esi+eax*4+64]
	becomes:       haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )

*/

// Source-operand codes that the instruction macros below OR into the ModR/M byte.
// General purpose registers: the value is the 3-bit register number in the r/m field
// with mod=00, i.e. a memory operand addressed through that register.
#define _eax	0x00
#define _ecx	0x01
#define _edx	0x02
#define _ebx	0x03
#define _esp	0x04
#define _ebp	0x05
#define _esi	0x06
#define _edi	0x07

// XMM registers: 0xC0 sets mod=11 (register-direct operand), the low three bits are the
// register number. When used as a destination the macros keep only those low three bits.
#define _xmm0	0xC0
#define _xmm1	0xC1
#define _xmm2	0xC2
#define _xmm3	0xC3
#define _xmm4	0xC4
#define _xmm5	0xC5
#define _xmm6	0xC6
#define _xmm7	0xC7

// Scale field (bits 7:6) of an x86 SIB byte for an index scale factor:
// 1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0.
// The argument and the whole expansion are parenthesized so that the macro stays correct
// when it is handed an expression or is combined with operators that bind tighter than '|'.
#define RSCALE( s )		( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )

// The ADDRESS_* macros are meant to be passed as the 'src' argument of an instruction macro.
// Their first token is OR-ed into the ModR/M byte by that macro's final "_asm _emit";
// the "_asm _emit" lines that follow append the SIB byte and/or the 8-bit displacement.

// [reg0 + constant]: mod=01 (0x40) with reg0 in r/m, followed by one displacement byte,
// so the constant has to fit in a signed byte just like for the *C variants below.
#define ADDRESS_ADDC( reg0, constant )						0x40 | ( reg0 & 7 )	\
	_asm _emit constant

// [reg0 + reg1]: mod=00 with r/m=100b (SIB follows); SIB = base reg0, index reg1, scale 1.
#define ADDRESS_ADDR( reg0, reg1 )							0x04				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )

// [reg0 + reg1 + constant]: mod=01 with r/m=100b (SIB follows), then SIB, then the displacement byte.
#define ADDRESS_ADDRC( reg0, reg1, constant )				0x44				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )								\
	_asm _emit constant

// [reg0 + reg1*scale]: as ADDRESS_ADDR with the scale bits from RSCALE added to the SIB byte.
#define ADDRESS_SCALEADDR( reg0, reg1, scale )				0x04				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )

// [reg0 + reg1*scale + constant]: as ADDRESS_ADDRC with the scale bits from RSCALE added to the SIB byte.
#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )	0x44				\
	_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )			\
	_asm _emit constant


// Packed Single-FP Add/Subtract, ADDSUBPS = F2 0F D0 /r
// ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1], dst[2]=dst[2]-src[2], dst[3]=dst[3]+src[3] )
// The even lanes are subtracted and the odd lanes are added.
// The last byte is the ModR/M byte: destination register number in bits 5:3, source code in the rest.
#define addsubps( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Add/Subtract, ADDSUBPD = 66 0F D0 /r
// ( dst[0]=dst[0]-src[0], dst[1]=dst[1]+src[1] )
// The low lane is subtracted and the high lane is added.
#define addsubpd( dst, src )						\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Add, HADDPS = F2 0F 7C /r
// ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
// The last byte is the ModR/M byte: destination register number in bits 5:3, source code in the rest.
#define haddps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Add, HADDPD = 66 0F 7C /r
// ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
#define haddpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Subtract, HSUBPS = F2 0F 7D /r
// ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
#define hsubps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Subtract, HSUBPD = 66 0F 7D /r
// ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
#define hsubpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move Packed Single-FP Low and Duplicate, MOVSLDUP = F3 0F 12 /r
// ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
// The last byte is the ModR/M byte: destination register number in bits 5:3, source code in the rest.
#define movsldup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
// The bytes F2 0F 12 /r are the instruction Intel documents under the mnemonic MOVDDUP.
#define movdldup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move Packed Single-FP High and Duplicate, MOVSHDUP = F3 0F 16 /r
// ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
#define movshdup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Move One Double-FP High and Duplicate ( intended: dst[0]=src[1], dst[1]=src[1] )
// NOTE(review): SSE3 documents MOVDDUP (F2 0F 12) but no high-half counterpart, and
// F2 0F 16 does not appear to be a defined opcode - verify against the Intel opcode map
// before using this macro. It is not referenced in the visible code.
#define movdhdup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src

// Load Unaligned Integer 128 bits, LDDQU = F2 0F F0 /r
// The source has to be a memory operand (a general purpose register code or an ADDRESS_* form).
// The last byte is the ModR/M byte: destination register number in bits 5:3, source code in the rest.
#define lddqu( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xF0									\
	_asm _emit ( ( dst & 7 ) << 3 ) | src


// Byte size of idDrawVert and byte offsets of its members, spelled as preprocessor
// literals so they can be used as immediates and displacements inside __asm blocks.
// Only DRAWVERT_SIZE and DRAWVERT_XYZ_OFFSET are checked by asserts in the visible code;
// the remaining offsets are not referenced here - presumably they mirror the idDrawVert
// declaration, confirm against DrawVert.h.
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)

// Byte sizes of the joint related types. JOINTMAT_SIZE (3 rows of 4 floats) and
// JOINTWEIGHT_SIZE (sizeof( idVec4 )) are asserted in TransformVerts;
// JOINTQUAT_SIZE is not referenced in the visible code.
#define JOINTQUAT_SIZE				(7*4)
#define JOINTMAT_SIZE				(4*3*4)
#define JOINTWEIGHT_SIZE			(4*4)


/*
============
SSE3_Dot

Returns the sum of the four component-wise products of v1 and v2,
computed with one packed multiply and two horizontal adds.

NOTE(review): movaps and mulps with a memory operand fault on addresses that
are not 16-byte aligned, so both vectors must be 16-byte aligned - the
declaration of idVec4 is not visible here, confirm callers guarantee this.
============
*/
float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
	float d;
	__asm {
		// the references are held as addresses; fetch them into esi / edi
		mov		esi, v1
		mov		edi, v2
		movaps	xmm0, [esi]
		// xmm0 = four component-wise products
		mulps	xmm0, [edi]
		// first horizontal add leaves the two pair sums, the second leaves the total in every lane
		haddps(	_xmm0, _xmm0 )
		haddps(	_xmm0, _xmm0 )
		// store lane 0, the complete sum
		movss	d, xmm0
	}
	return d;
}

/*
============
idSIMD_SSE3::GetName

Returns the fixed, human-readable label naming the instruction sets
this processor implementation is tagged with.
============
*/
const char * idSIMD_SSE3::GetName( void ) const {
	static const char processorName[] = "MMX & SSE & SSE2 & SSE3";
	return processorName;
}

/*
============
idSIMD_SSE3::TransformVerts

Computes the xyz position of every vertex as the sum of one or more
joint matrix * weight products and stores it in verts[i].xyz.
No other idDrawVert member is written.

  verts       destination vertices, numVerts entries
  numVerts    number of vertices to produce; nothing is done when it is zero
  joints      base of the joint matrices, addressed by byte offset
  weights     one idVec4 per ( vertex, joint ) influence, consumed sequentially
  index       two ints per weight: [0] is the byte offset of the joint matrix
              relative to 'joints', [1] is non-zero on the last weight of a vertex
  numWeights  not referenced by either implementation below

The assembly path reads the weights and joint matrices with movaps / mulps
memory operands, which fault on addresses that are not 16-byte aligned.
The #else branch is the plain C++ equivalent of the assembly.
============
*/
void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1

	// the assembly below hard-codes these sizes and offsets
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

	__asm
	{
		mov			eax, numVerts
		test		eax, eax
		jz			done
		imul		eax, DRAWVERT_SIZE

		mov			ecx, verts
		mov			edx, index
		mov			esi, weights
		mov			edi, joints

		// ecx = one past the last vertex, eax = negative byte offset counting up towards zero
		add			ecx, eax
		neg			eax

	loopVert:
		// first weight of the vertex: ebx = byte offset of its joint matrix
		mov			ebx, [edx]
		movaps		xmm2, [esi]
		add			edx, 8
		movaps		xmm0, xmm2
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm1, xmm2

		mulps		xmm0, [edi+ebx+ 0]						// xmm0 = m0, m1, m2, t0
		mulps		xmm1, [edi+ebx+16]						// xmm1 = m3, m4, m5, t1
		mulps		xmm2, [edi+ebx+32]						// xmm2 = m6, m7, m8, t2

		// edx was already advanced, so [edx-4] is the last-weight flag of the entry just used
		cmp			dword ptr [edx-4], 0

		jne			doneWeight

	loopWeight:
		// further weights of the same vertex are accumulated into xmm0 - xmm2
		mov			ebx, [edx]
		movaps		xmm5, [esi]
		add			edx, 8
		movaps		xmm3, xmm5
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm4, xmm5

		mulps		xmm3, [edi+ebx+ 0]						// xmm3 = m0, m1, m2, t0
		mulps		xmm4, [edi+ebx+16]						// xmm4 = m3, m4, m5, t1
		mulps		xmm5, [edi+ebx+32]						// xmm5 = m6, m7, m8, t2

		// the flags set here are consumed by the je below; addps does not modify them
		cmp			dword ptr [edx-4], 0

		addps		xmm0, xmm3
		addps		xmm1, xmm4
		addps		xmm2, xmm5

		je			loopWeight

	doneWeight:
		// advance to the next vertex; the flags of this add drive the jl at the bottom,
		// none of the SSE instructions in between modifies them
		add			eax, DRAWVERT_SIZE

		// after both horizontal adds the high half of xmm2 holds the summed rows 0 and 1 ( x, y )
		haddps(		_xmm0, _xmm1 )
		haddps(		_xmm2, _xmm0 )

		movhps		[ecx+eax-DRAWVERT_SIZE+0], xmm2

		// lane 0 becomes the summed row 2 ( z )
		haddps(		_xmm2, _xmm2 )

		movss		[ecx+eax-DRAWVERT_SIZE+8], xmm2

		jl			loopVert
	done:
	}

#else

	int i, j;
	const byte *jointsPtr = (byte *)joints;

	// j walks the weights, i walks the vertices
	for( j = i = 0; i < numVerts; i++ ) {
		idVec3 v;

		v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		// a zero flag means more weights follow for this vertex
		while( index[j*2+1] == 0 ) {
			j++;
			v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
		}
		j++;

		verts[i].xyz = v;
	}

#endif
}

#endif /* _MSC_VER */
hello world 2011-11-22 21:28:15 +00:00			`/*`
			`===========================================================================`

			`Doom 3 GPL Source Code`
Fix all whitespace errors Excluding 3rd party files. 2011-12-06 18:20:15 +00:00			`Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.`
hello world 2011-11-22 21:28:15 +00:00
Fix quoting in GPL headers 2011-12-06 16:14:59 +00:00			`This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").`
hello world 2011-11-22 21:28:15 +00:00
			`Doom 3 Source Code is free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`Doom 3 Source Code is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.`

			`In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.`

			`If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.`

			`===========================================================================`
			`*/`

Untangle the epic precompiled.h mess Don't include the lazy precompiled.h everywhere, only what's required for the compilation unit. platform.h needs to be included instead to provide all essential defines and types. All includes use the relative path to the neo or the game specific root. Move all idlib related includes from idlib/Lib.h to precompiled.h. precompiled.h still exists for the MFC stuff in tools/. Add some missing header guards. 2011-12-16 22:28:29 +00:00			`#include "sys/platform.h"`
hello world 2011-11-22 21:28:15 +00:00
Untangle the epic precompiled.h mess Don't include the lazy precompiled.h everywhere, only what's required for the compilation unit. platform.h needs to be included instead to provide all essential defines and types. All includes use the relative path to the neo or the game specific root. Move all idlib related includes from idlib/Lib.h to precompiled.h. precompiled.h still exists for the MFC stuff in tools/. Add some missing header guards. 2011-12-16 22:28:29 +00:00			`#include "idlib/math/Simd_SSE3.h"`
hello world 2011-11-22 21:28:15 +00:00
			`//===============================================================`
			`//`
			`// SSE3 implementation of idSIMDProcessor`
			`//`
			`//===============================================================`

Always compile all SIMD code Protect all SIMD implementations with the according defines and let the compiler decide if it supports the intructions. Linux will still use Simd_Generic because CPU feature runtime detection is missing. 2011-12-13 00:06:39 +00:00			`#if defined(__GNUC__) && defined(__SSE3__)`
hello world 2011-11-22 21:28:15 +00:00
			`/*`
			`============`
			`idSIMD_SSE3::GetName`
			`============`
			`*/`
			`const char * idSIMD_SSE3::GetName( void ) const {`
			`return "MMX & SSE & SSE2 & SSE3";`
			`}`

Only use MSVC style asm with MSVC 2011-12-13 22:43:27 +00:00			`#elif defined(_MSC_VER)`
hello world 2011-11-22 21:28:15 +00:00
			`#include <xmmintrin.h>`

Add missing includes for MSVC - MSVC doesn't provide C99 headers - Default to min. req. 64Mb video mem if no COM present - Move misplaced __attribute__((packed)) from MSVC to MinGW 2011-12-21 23:08:10 +00:00			`#include "idlib/geometry/JointTransform.h"`
			`#include "idlib/geometry/DrawVert.h"`
			`#include "idlib/math/Vector.h"`

hello world 2011-11-22 21:28:15 +00:00			`#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 \| ( (y) & 3 ) << 4 \| ( (z) & 3 ) << 2 \| ( (w) & 3 ))`
			`#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 \| ( (z) & 3 ) << 4 \| ( (y) & 3 ) << 2 \| ( (x) & 3 ))`
			`#define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 \| ( (y) & 1 ))`
			`#define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 \| ( (x) & 1 ))`

			`/*`

			`The first argument of an instruction macro is the destination`
			`and the second argument is the source operand. The destination`
			`operand can be _xmm0 to _xmm7 only. The source operand can be`
			`any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,`
			`_ebp, _ebx, _esi, or _edi that contains the effective address.`

			`For instance: haddps xmm0, xmm1`
			`becomes: haddps( _xmm0, _xmm1 )`
			`and: haddps xmm0, [esi]`
			`becomes: haddps( _xmm0, _esi )`

			`The ADDRESS_ADDC macro can be used when the effective source address`
			`is formed by adding a constant to a general purpose register.`
			`For instance: haddps xmm0, [esi+48]`
			`becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )`

			`The ADDRESS_ADDR macro can be used when the effective source address`
			`is formed by adding two general purpose registers.`
			`For instance: haddps xmm0, [esi+eax]`
			`becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )`

			`The ADDRESS_ADDRC macro can be used when the effective source address`
			`is formed by adding two general purpose registers and a constant.`
			`The constant must be in the range [-128, 127].`
			`For instance: haddps xmm0, [esi+eax+48]`
			`becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )`

			`The ADDRESS_SCALEADDR macro can be used when the effective source address is formed`
			`by adding a scaled general purpose register to another general purpose register.`
			`The scale must be either 1, 2, 4 or 8.`
			`For instance: haddps xmm0, [esi+eax*4]`
			`becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )`

			`The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed`
			`by adding a scaled general purpose register to another general purpose register and`
			`also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must`
			`be in the range [-128, 127].`
			`For instance: haddps xmm0, [esi+eax*4+64]`
			`becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )`

			`*/`

			`#define _eax 0x00`
			`#define _ecx 0x01`
			`#define _edx 0x02`
			`#define _ebx 0x03`
			`#define _esp 0x04`
			`#define _ebp 0x05`
			`#define _esi 0x06`
			`#define _edi 0x07`

			`#define _xmm0 0xC0`
			`#define _xmm1 0xC1`
			`#define _xmm2 0xC2`
			`#define _xmm3 0xC3`
			`#define _xmm4 0xC4`
			`#define _xmm5 0xC5`
			`#define _xmm6 0xC6`
			`#define _xmm7 0xC7`

			`#define RSCALE( s ) ( (s&2)<<5 ) \| ( (s&4)<<5 ) \| ( (s&8)<<3 ) \| ( (s&8)<<4 )`

			`#define ADDRESS_ADDC( reg0, constant ) 0x40 \| ( reg0 & 7 ) \`
			`_asm _emit constant`

			`#define ADDRESS_ADDR( reg0, reg1 ) 0x04 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 )`

			`#define ADDRESS_ADDRC( reg0, reg1, constant ) 0x44 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 ) \`
			`_asm _emit constant`

			`#define ADDRESS_SCALEADDR( reg0, reg1, scale ) 0x04 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 ) \| RSCALE( scale )`

			`#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant ) 0x44 \`
			`_asm _emit ( ( reg1 & 7 ) << 3 ) \| ( reg0 & 7 ) \| RSCALE( scale ) \`
			`_asm _emit constant`


			`// Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )`
			`#define addsubps( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0xD0 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )`
			`#define addsubpd( dst, src ) \`
			`_asm _emit 0x66 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0xD0 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )`
			`#define haddps( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7C \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )`
			`#define haddpd( dst, src ) \`
			`_asm _emit 0x66 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7C \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )`
			`#define hsubps( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7D \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )`
			`#define hsubpd( dst, src ) \`
			`_asm _emit 0x66 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x7D \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )`
			`#define movsldup( dst, src ) \`
			`_asm _emit 0xF3 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x12 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )`
			`#define movdldup( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x12 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )`
			`#define movshdup( dst, src ) \`
			`_asm _emit 0xF3 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x16 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )`
			`#define movdhdup( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0x16 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`

			`// Load Unaligned Integer 128 bits`
			`#define lddqu( dst, src ) \`
			`_asm _emit 0xF2 \`
			`_asm _emit 0x0F \`
			`_asm _emit 0xF0 \`
			`_asm _emit ( ( dst & 7 ) << 3 ) \| src`


			`#define DRAWVERT_SIZE 60`
			`#define DRAWVERT_XYZ_OFFSET (0*4)`
			`#define DRAWVERT_ST_OFFSET (3*4)`
			`#define DRAWVERT_NORMAL_OFFSET (5*4)`
			`#define DRAWVERT_TANGENT0_OFFSET (8*4)`
			`#define DRAWVERT_TANGENT1_OFFSET (11*4)`
			`#define DRAWVERT_COLOR_OFFSET (14*4)`

			`#define JOINTQUAT_SIZE (7*4)`
			`#define JOINTMAT_SIZE (4*3*4)`
			`#define JOINTWEIGHT_SIZE (4*4)`


			`/*`
			`============`
			`SSE3_Dot`
			`============`
			`*/`
			`float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {`
			`float d;`
			`__asm {`
			`mov esi, v1`
			`mov edi, v2`
			`movaps xmm0, [esi]`
			`mulps xmm0, [edi]`
			`haddps( _xmm0, _xmm0 )`
			`haddps( _xmm0, _xmm0 )`
			`movss d, xmm0`
			`}`
			`return d;`
			`}`

			`/*`
			`============`
			`idSIMD_SSE3::GetName`
			`============`
			`*/`
			`const char * idSIMD_SSE3::GetName( void ) const {`
			`return "MMX & SSE & SSE2 & SSE3";`
			`}`

			`/*`
			`============`
			`idSIMD_SSE3::TransformVerts`
			`============`
			`*/`
			`void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {`
			`#if 1`

			`assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );`
			`assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );`
			`assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );`
			`assert( sizeof( idJointMat ) == JOINTMAT_SIZE );`

			`__asm`
			`{`
			`mov eax, numVerts`
			`test eax, eax`
			`jz done`
			`imul eax, DRAWVERT_SIZE`

			`mov ecx, verts`
			`mov edx, index`
			`mov esi, weights`
			`mov edi, joints`

			`add ecx, eax`
			`neg eax`

			`loopVert:`
			`mov ebx, [edx]`
			`movaps xmm2, [esi]`
			`add edx, 8`
			`movaps xmm0, xmm2`
			`add esi, JOINTWEIGHT_SIZE`
			`movaps xmm1, xmm2`

			`mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0`
			`mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1`
			`mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2`

			`cmp dword ptr [edx-4], 0`

			`jne doneWeight`

			`loopWeight:`
			`mov ebx, [edx]`
			`movaps xmm5, [esi]`
			`add edx, 8`
			`movaps xmm3, xmm5`
			`add esi, JOINTWEIGHT_SIZE`
			`movaps xmm4, xmm5`

			`mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0`
			`mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1`
			`mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2`

			`cmp dword ptr [edx-4], 0`

			`addps xmm0, xmm3`
			`addps xmm1, xmm4`
			`addps xmm2, xmm5`

			`je loopWeight`

			`doneWeight:`
			`add eax, DRAWVERT_SIZE`

			`haddps( _xmm0, _xmm1 )`
			`haddps( _xmm2, _xmm0 )`

			`movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2`

			`haddps( _xmm2, _xmm2 )`

			`movss [ecx+eax-DRAWVERT_SIZE+8], xmm2`

			`jl loopVert`
			`done:`
			`}`

			`#else`

			`int i, j;`
			`const byte *jointsPtr = (byte *)joints;`

			`for( j = i = 0; i < numVerts; i++ ) {`
			`idVec3 v;`

			`v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];`
			`while( index[j*2+1] == 0 ) {`
			`j++;`
			`v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];`
			`}`
			`j++;`

			`verts[i].xyz = v;`
			`}`

			`#endif`
			`}`

Change another _WIN32 to _MSC_VER for MinGW Older MinGW versions do not know about __assume(). Change _WIN32 in comments too to match their opening #if. Reported by serpentine. 2011-12-22 10:46:24 +00:00			`#endif /* _MSC_VER */`