mirror of
https://github.com/dhewm/dhewm3.git
synced 2024-12-11 21:40:49 +00:00
736ec20d4d
Don't include the lazy precompiled.h everywhere, only what's required for the compilation unit. platform.h needs to be included instead to provide all essential defines and types. All includes use the relative path to the neo or the game specific root. Move all idlib related includes from idlib/Lib.h to precompiled.h. precompiled.h still exists for the MFC stuff in tools/. Add some missing header guards.
361 lines
10 KiB
C++
361 lines
10 KiB
C++
/*
|
|
===========================================================================
|
|
|
|
Doom 3 GPL Source Code
|
|
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
|
|
|
|
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
|
|
|
|
Doom 3 Source Code is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Doom 3 Source Code is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
|
|
|
|
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
|
|
|
|
===========================================================================
|
|
*/
|
|
|
|
#include "sys/platform.h"
|
|
|
|
#include "idlib/math/Simd_SSE3.h"
|
|
|
|
//===============================================================
|
|
//
|
|
// SSE3 implementation of idSIMDProcessor
|
|
//
|
|
//===============================================================
|
|
|
|
#if defined(__GNUC__) && defined(__SSE3__)
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE3::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE3::GetName( void ) const {
|
|
return "MMX & SSE & SSE2 & SSE3";
|
|
}
|
|
|
|
#elif defined(_MSC_VER)
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
// Build the 8 bit immediate for shufps from four 2 bit element selectors.
// SHUFFLEPS places x in the top two bits and w in the bottom two bits;
// R_SHUFFLEPS takes the selectors in the reverse order.
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
#define R_SHUFFLEPS( x, y, z, w )	SHUFFLEPS( w, z, y, x )

// Build the 2 bit immediate for shufpd from two 1 bit element selectors.
// SHUFFLEPD places x in bit 1 and y in bit 0; R_SHUFFLEPD is the reverse.
#define SHUFFLEPD( x, y )			( ( ( (x) & 1 ) << 1 ) | ( (y) & 1 ) )
#define R_SHUFFLEPD( x, y )			SHUFFLEPD( y, x )
|
|
|
|
/*
|
|
|
|
The first argument of an instruction macro is the destination
|
|
and the second argument is the source operand. The destination
|
|
operand can be _xmm0 to _xmm7 only. The source operand can be
|
|
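any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _ebx,
_esi, or _edi that contains the effective address. _esp and _ebp
cannot be used as a bare address operand: their ModR/M encodings
select a SIB byte and a 32-bit displacement respectively.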
|
|
|
|
For instance: haddps xmm0, xmm1
|
|
becomes: haddps( _xmm0, _xmm1 )
|
|
and: haddps xmm0, [esi]
|
|
becomes: haddps( _xmm0, _esi )
|
|
|
|
The ADDRESS_ADDC macro can be used when the effective source address
|
|
is formed by adding a constant to a general purpose register.
|
|
For instance: haddps xmm0, [esi+48]
|
|
becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
|
|
|
|
The ADDRESS_ADDR macro can be used when the effective source address
|
|
is formed by adding two general purpose registers.
|
|
For instance: haddps xmm0, [esi+eax]
|
|
becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
|
|
|
|
The ADDRESS_ADDRC macro can be used when the effective source address
|
|
is formed by adding two general purpose registers and a constant.
|
|
The constant must be in the range [-128, 127].
|
|
For instance: haddps xmm0, [esi+eax+48]
|
|
becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
|
|
|
|
The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
|
|
by adding a scaled general purpose register to another general purpose register.
|
|
The scale must be either 1, 2, 4 or 8.
|
|
For instance: haddps xmm0, [esi+eax*4]
|
|
becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
|
|
|
|
The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
|
|
by adding a scaled general purpose register to another general purpose register and
|
|
also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
|
|
be in the range [-128, 127].
|
|
For instance: haddps xmm0, [esi+eax*4+64]
|
|
becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
|
|
|
|
*/
|
|
|
|
// Operand codes for the general purpose registers (values 0 to 7).
// The instruction macros below OR a source code into the low bits of the
// byte emitted after the opcode, and the ADDRESS_* macros mask them with 7.
#define _eax 0x00
|
|
#define _ecx 0x01
|
|
#define _edx 0x02
|
|
#define _ebx 0x03
|
|
#define _esp 0x04
|
|
#define _ebp 0x05
|
|
#define _esi 0x06
|
|
#define _edi 0x07
|
|
|
|
// Operand codes for the XMM registers: the register number in the low
// three bits with the top two bits (0xC0) set. When used as a destination
// the instruction macros keep only the low three bits ( dst & 7 ); when
// used as a source the 0xC0 bits are emitted unchanged, which in the x86
// ModR/M byte selects a register operand instead of a memory operand.
#define _xmm0 0xC0
|
|
#define _xmm1 0xC1
|
|
#define _xmm2 0xC2
|
|
#define _xmm3 0xC3
|
|
#define _xmm4 0xC4
|
|
#define _xmm5 0xC5
|
|
#define _xmm6 0xC6
|
|
#define _xmm7 0xC7
|
|
|
|
// Converts an index scale of 1, 2, 4 or 8 into the two scale bits (bits 7:6)
// of the byte emitted by ADDRESS_SCALEADDR / ADDRESS_SCALEADDRC:
// 1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0.
// The argument and the whole expression are parenthesized so the macro
// evaluates correctly next to any operator and with compound arguments.
#define RSCALE( s )		( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )
|
|
|
|
// Source operand builders for the instruction macros below. Each one expands
// to the low bits that are OR-ed into the byte following the opcode, followed
// by additional _asm _emit statements for the extra addressing bytes. Because
// they expand to several statements they can only be used as the 'src'
// argument of an instruction macro.
// Register arguments are parenthesized before masking so that compound
// expressions are masked as a whole.

// [reg0 + constant], constant is emitted as a single byte
#define ADDRESS_ADDC( reg0, constant )						0x40 | ( ( reg0 ) & 7 )	\
	_asm _emit constant

// [reg0 + reg1]
#define ADDRESS_ADDR( reg0, reg1 )							0x04					\
	_asm _emit ( ( ( reg1 ) & 7 ) << 3 ) | ( ( reg0 ) & 7 )

// [reg0 + reg1 + constant], constant is emitted as a single byte
#define ADDRESS_ADDRC( reg0, reg1, constant )				0x44					\
	_asm _emit ( ( ( reg1 ) & 7 ) << 3 ) | ( ( reg0 ) & 7 )							\
	_asm _emit constant

// [reg0 + reg1 * scale], scale must be 1, 2, 4 or 8
#define ADDRESS_SCALEADDR( reg0, reg1, scale )				0x04					\
	_asm _emit ( ( ( reg1 ) & 7 ) << 3 ) | ( ( reg0 ) & 7 ) | RSCALE( scale )

// [reg0 + reg1 * scale + constant], scale must be 1, 2, 4 or 8,
// constant is emitted as a single byte
#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )	0x44					\
	_asm _emit ( ( ( reg1 ) & 7 ) << 3 ) | ( ( reg0 ) & 7 ) | RSCALE( scale )		\
	_asm _emit constant
|
|
|
|
|
|
// The macros below hand-assemble SSE3 arithmetic instructions with _asm _emit.
// Each emits a prefix byte, the two opcode bytes and one operand byte holding
// the destination register number in bits 5:3 OR-ed with the source operand
// code. 'dst' is parenthesized before masking; 'src' is left as is because it
// may be an ADDRESS_* macro that expands to further _asm _emit statements.

// Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )
#define addsubps( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )
#define addsubpd( dst, src )						\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
#define haddps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
#define haddpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
#define hsubps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
#define hsubpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src
|
|
|
|
// Hand-assembled SSE3 duplicate-move instructions. Each macro emits a prefix
// byte, the two opcode bytes and one operand byte holding the destination
// register number in bits 5:3 OR-ed with the source operand code. 'dst' is
// parenthesized before masking; 'src' may be an ADDRESS_* macro.

// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
#define movsldup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
// NOTE(review): the byte sequence F2 0F 12 is the instruction Intel documents as MOVDDUP.
#define movdldup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
#define movshdup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src

// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
// NOTE(review): F2 0F 16 does not appear to be a documented SSE3 instruction (SSE3 only
// provides MOVDDUP for doubles) -- verify against the Intel SDM before using this macro.
#define movdhdup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src
|
|
|
|
// Load Unaligned Integer 128 bits
// Emits F2 0F F0 followed by one operand byte holding the destination register
// number in bits 5:3 OR-ed with the source operand code. 'dst' is parenthesized
// before masking; 'src' may be an ADDRESS_* macro.
#define lddqu( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xF0									\
	_asm _emit ( ( ( dst ) & 7 ) << 3 ) | src
|
|
|
|
|
|
// Byte size of idDrawVert and byte offsets of its members as assumed by the
// hand-written assembly in this file. idSIMD_SSE3::TransformVerts asserts
// DRAWVERT_SIZE and DRAWVERT_XYZ_OFFSET against the real type; the other
// offsets are not referenced in the visible part of this file.
// Offsets are written as ( n*4 ), presumably a count of 4 byte floats.
#define DRAWVERT_SIZE 60
|
|
#define DRAWVERT_XYZ_OFFSET (0*4)
|
|
#define DRAWVERT_ST_OFFSET (3*4)
|
|
#define DRAWVERT_NORMAL_OFFSET (5*4)
|
|
#define DRAWVERT_TANGENT0_OFFSET (8*4)
|
|
#define DRAWVERT_TANGENT1_OFFSET (11*4)
|
|
#define DRAWVERT_COLOR_OFFSET (14*4)
|
|
|
|
// Byte sizes of the joint related types. idSIMD_SSE3::TransformVerts asserts
// JOINTMAT_SIZE against sizeof( idJointMat ) and JOINTWEIGHT_SIZE against
// sizeof( idVec4 ). JOINTQUAT_SIZE is presumably the size of the joint
// quaternion type, which is not used in the visible part of this file.
#define JOINTQUAT_SIZE (7*4)
|
|
#define JOINTMAT_SIZE (4*3*4)
|
|
#define JOINTWEIGHT_SIZE (4*4)
|
|
|
|
|
|
/*
|
|
============
|
|
SSE3_Dot
|
|
============
|
|
*/
|
|
float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
|
|
float d;
|
|
__asm {
|
|
mov esi, v1
|
|
mov edi, v2
|
|
movaps xmm0, [esi]
|
|
mulps xmm0, [edi]
|
|
haddps( _xmm0, _xmm0 )
|
|
haddps( _xmm0, _xmm0 )
|
|
movss d, xmm0
|
|
}
|
|
return d;
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE3::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE3::GetName( void ) const {
|
|
return "MMX & SSE & SSE2 & SSE3";
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE3::TransformVerts
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
|
|
#if 1
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
|
|
assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
|
|
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
|
|
|
|
__asm
|
|
{
|
|
mov eax, numVerts
|
|
test eax, eax
|
|
jz done
|
|
imul eax, DRAWVERT_SIZE
|
|
|
|
mov ecx, verts
|
|
mov edx, index
|
|
mov esi, weights
|
|
mov edi, joints
|
|
|
|
add ecx, eax
|
|
neg eax
|
|
|
|
loopVert:
|
|
mov ebx, [edx]
|
|
movaps xmm2, [esi]
|
|
add edx, 8
|
|
movaps xmm0, xmm2
|
|
add esi, JOINTWEIGHT_SIZE
|
|
movaps xmm1, xmm2
|
|
|
|
mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
|
|
mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
|
|
mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
|
|
|
|
cmp dword ptr [edx-4], 0
|
|
|
|
jne doneWeight
|
|
|
|
loopWeight:
|
|
mov ebx, [edx]
|
|
movaps xmm5, [esi]
|
|
add edx, 8
|
|
movaps xmm3, xmm5
|
|
add esi, JOINTWEIGHT_SIZE
|
|
movaps xmm4, xmm5
|
|
|
|
mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
|
|
mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
|
|
mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
|
|
|
|
cmp dword ptr [edx-4], 0
|
|
|
|
addps xmm0, xmm3
|
|
addps xmm1, xmm4
|
|
addps xmm2, xmm5
|
|
|
|
je loopWeight
|
|
|
|
doneWeight:
|
|
add eax, DRAWVERT_SIZE
|
|
|
|
haddps( _xmm0, _xmm1 )
|
|
haddps( _xmm2, _xmm0 )
|
|
|
|
movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2
|
|
|
|
haddps( _xmm2, _xmm2 )
|
|
|
|
movss [ecx+eax-DRAWVERT_SIZE+8], xmm2
|
|
|
|
jl loopVert
|
|
done:
|
|
}
|
|
|
|
#else
|
|
|
|
int i, j;
|
|
const byte *jointsPtr = (byte *)joints;
|
|
|
|
for( j = i = 0; i < numVerts; i++ ) {
|
|
idVec3 v;
|
|
|
|
v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
|
|
while( index[j*2+1] == 0 ) {
|
|
j++;
|
|
v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
|
|
}
|
|
j++;
|
|
|
|
verts[i].xyz = v;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
#endif /* ( __GNUC__ && __SSE3__ ) || _MSC_VER */
|