mirror of
https://github.com/dhewm/dhewm3.git
synced 2024-11-29 23:51:49 +00:00
30c66d5ef8
The 64bit compiler doesn't support __asm.
365 lines
11 KiB
C++
365 lines
11 KiB
C++
/*
|
|
===========================================================================
|
|
|
|
Doom 3 GPL Source Code
|
|
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
|
|
|
|
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
|
|
|
|
Doom 3 Source Code is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Doom 3 Source Code is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
|
|
|
|
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
|
|
|
|
===========================================================================
|
|
*/
|
|
|
|
#include "sys/platform.h"
|
|
|
|
#include "idlib/math/Simd_SSE3.h"
|
|
|
|
//===============================================================
|
|
//
|
|
// SSE3 implementation of idSIMDProcessor
|
|
//
|
|
//===============================================================
|
|
|
|
#if defined(__GNUC__) && defined(__SSE3__)
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE3::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE3::GetName( void ) const {
	// Human readable list of the instruction sets this processor class stands for.
	const char *simdName = "MMX & SSE & SSE2 & SSE3";
	return simdName;
}
|
|
|
|
#elif defined(_MSC_VER) && defined(_M_IX86)
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
#include "idlib/geometry/JointTransform.h"
|
|
#include "idlib/geometry/DrawVert.h"
|
|
#include "idlib/math/Vector.h"
|
|
|
|
// Builders for 8-bit shuffle immediates (presumably for shufps / shufpd).
// Every selector is masked to its field width before being shifted into place.
//
// SHUFFLEPS   : x -> bits 7-6, y -> bits 5-4, z -> bits 3-2, w -> bits 1-0
// R_SHUFFLEPS : same fields, arguments given in the reverse order
// SHUFFLEPD   : x -> bit 1, y -> bit 0
// R_SHUFFLEPD : same fields, arguments given in the reverse order
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
#define R_SHUFFLEPS( x, y, z, w )	( ( ( (w) & 3 ) << 6 ) | ( ( (z) & 3 ) << 4 ) | ( ( (y) & 3 ) << 2 ) | ( (x) & 3 ) )
#define SHUFFLEPD( x, y )			( ( ( (x) & 1 ) << 1 ) | ( (y) & 1 ) )
#define R_SHUFFLEPD( x, y )			( ( ( (y) & 1 ) << 1 ) | ( (x) & 1 ) )
|
|
|
|
/*
|
|
|
|
The first argument of an instruction macro is the destination
|
|
and the second argument is the source operand. The destination
|
|
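operand can be _xmm0 to _xmm7 only. The source operand can be
any one of the registers _xmm0 to _xmm7, or one of _eax, _ecx, _edx,
_ebx, _esi or _edi that contains the effective address.
Note that _esp and _ebp do not work as a bare address register:
the ModRM byte the macros build for them means "SIB byte follows"
(_esp) or "32-bit displacement, no base register" (_ebp). For the
same reason _esp cannot be the register in ADDRESS_ADDC, _ebp cannot
be the first (base) register in ADDRESS_ADDR or ADDRESS_SCALEADDR,
and _esp cannot be the second (index) register of any ADDRESS_ macro.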
|
|
|
|
For instance: haddps xmm0, xmm1
|
|
becomes: haddps( _xmm0, _xmm1 )
|
|
and: haddps xmm0, [esi]
|
|
becomes: haddps( _xmm0, _esi )
|
|
|
|
The ADDRESS_ADDC macro can be used when the effective source address
|
|
is formed by adding a constant to a general purpose register.
The constant must be in the range [-128, 127] because it is
emitted as a single displacement byte.
|
|
For instance: haddps xmm0, [esi+48]
|
|
becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
|
|
|
|
The ADDRESS_ADDR macro can be used when the effective source address
|
|
is formed by adding two general purpose registers.
|
|
For instance: haddps xmm0, [esi+eax]
|
|
becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
|
|
|
|
The ADDRESS_ADDRC macro can be used when the effective source address
|
|
is formed by adding two general purpose registers and a constant.
|
|
The constant must be in the range [-128, 127].
|
|
For instance: haddps xmm0, [esi+eax+48]
|
|
becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
|
|
|
|
The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
|
|
by adding a scaled general purpose register to another general purpose register.
|
|
The scale must be either 1, 2, 4 or 8.
|
|
For instance: haddps xmm0, [esi+eax*4]
|
|
becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
|
|
|
|
The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
|
|
by adding a scaled general purpose register to another general purpose register and
|
|
also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
|
|
be in the range [-128, 127].
|
|
For instance: haddps xmm0, [esi+eax*4+64]
|
|
becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
|
|
|
|
*/
|
|
|
|
// General purpose registers: the value is the 3-bit register number that goes
// into the r/m field of a ModRM byte or the base/index fields of a SIB byte.
#define _eax	0x00
#define _ecx	0x01
#define _edx	0x02
#define _ebx	0x03
#define _esp	0x04
#define _ebp	0x05
#define _esi	0x06
#define _edi	0x07

// XMM registers: 0xC0 sets both "mod" bits of the ModRM byte (register-direct
// operand), the low three bits hold the register number. The instruction macros
// mask with 7 when the register is used as a destination.
#define _xmm0	( 0xC0 | 0 )
#define _xmm1	( 0xC0 | 1 )
#define _xmm2	( 0xC0 | 2 )
#define _xmm3	( 0xC0 | 3 )
#define _xmm4	( 0xC0 | 4 )
#define _xmm5	( 0xC0 | 5 )
#define _xmm6	( 0xC0 | 6 )
#define _xmm7	( 0xC0 | 7 )
|
|
|
|
// Converts an index scale factor of 1, 2, 4 or 8 into the two "scale" bits
// (bits 7-6) of a SIB byte: 1 -> 0x00, 2 -> 0x40, 4 -> 0x80, 8 -> 0xC0.
// The argument and the whole result are parenthesized so that the macro can be
// given an expression and can be combined with any surrounding operator.
#define RSCALE( s )		( ( ( (s) & 2 ) << 5 ) | ( ( (s) & 4 ) << 5 ) | ( ( (s) & 8 ) << 3 ) | ( ( (s) & 8 ) << 4 ) )
|
|
|
|
// Memory operand builders for the "src" argument of the instruction macros below.
//
// Each macro expands to the low bits of the ModRM byte (which the instruction
// macro ORs with the destination register) followed by additional "_asm _emit"
// statements for the SIB and/or displacement byte. Because the expansion has to
// continue the _emit expression of the instruction macro and then start new
// statements, it is intentionally NOT wrapped in parentheses as a whole; only
// the individual parameters are.
//
// Encoding limits of the x86 ModRM/SIB scheme that these macros do not check:
//   - _esp cannot be reg0 of ADDRESS_ADDC (r/m = 100 announces a SIB byte)
//   - _ebp cannot be reg0 of ADDRESS_ADDR / ADDRESS_SCALEADDR (base = 101 with
//     mod = 00 means "32-bit displacement, no base")
//   - _esp cannot be reg1 (index = 100 means "no index")
//   - constants are emitted as one byte and must be in the range [-128, 127]

// [reg0 + constant]
#define ADDRESS_ADDC( reg0, constant )						0x40 | ( (reg0) & 7 )	\
	_asm _emit ( constant )

// [reg0 + reg1]
#define ADDRESS_ADDR( reg0, reg1 )							0x04					\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 )

// [reg0 + reg1 + constant]
#define ADDRESS_ADDRC( reg0, reg1, constant )				0x44					\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 )								\
	_asm _emit ( constant )

// [reg0 + reg1 * scale], scale must be 1, 2, 4 or 8
#define ADDRESS_SCALEADDR( reg0, reg1, scale )				0x04					\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 ) | RSCALE( scale )

// [reg0 + reg1 * scale + constant], scale must be 1, 2, 4 or 8
#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant )	0x44					\
	_asm _emit ( ( (reg1) & 7 ) << 3 ) | ( (reg0) & 7 ) | RSCALE( scale )			\
	_asm _emit ( constant )
|
|
|
|
|
|
// Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )
// Emits F2 0F D0 /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define addsubps( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )
// Emits 66 0F D0 /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define addsubpd( dst, src )						\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0xD0									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
// Emits F2 0F 7C /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define haddps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
// Emits 66 0F 7C /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define haddpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7C									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
// Emits F2 0F 7D /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define hsubps( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
// Emits 66 0F 7D /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define hsubpd( dst, src )							\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x7D									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
// Emits F3 0F 12 /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define movsldup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
// Emits F2 0F 12 /r, which the Intel manuals list under the mnemonic MOVDDUP.
// "src" is left unparenthesized on purpose: an ADDRESS_ macro passed as src
// expands into further "_asm _emit" statements.
#define movdldup( dst, src )						\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0x12									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
// Emits F3 0F 16 /r. "src" is left unparenthesized on purpose: an ADDRESS_ macro
// passed as src expands into further "_asm _emit" statements.
#define movshdup( dst, src )						\
	_asm _emit 0xF3									\
	_asm _emit 0x0F									\
	_asm _emit 0x16									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
//
// SSE3 has no instruction for this: the only double precision duplicate is
// MOVDDUP (F2 0F 12, the low element), and the byte sequence F2 0F 16 that this
// macro used to emit is not a defined opcode. The operation is therefore encoded
// as the SSE2 instruction "pshufd dst, src, 0xEE" (66 0F 70 /r ib), which copies
// the upper 64 bits of src into both halves of dst.
//
// The immediate is emitted last so that it follows any SIB / displacement bytes
// produced by an ADDRESS_ macro passed as src. A memory source is read as a full
// 128-bit operand and must be 16-byte aligned.
#define movdhdup( dst, src )						\
	_asm _emit 0x66									\
	_asm _emit 0x0F									\
	_asm _emit 0x70									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src			\
	_asm _emit 0xEE
|
|
|
|
// Load Unaligned Integer 128 bits
// Emits F2 0F F0 /r. LDDQU only exists with a memory source, so src has to be a
// general purpose register or one of the ADDRESS_ macros, never _xmm0 to _xmm7.
// "src" is left unparenthesized on purpose: an ADDRESS_ macro passed as src
// expands into further "_asm _emit" statements.
#define lddqu( dst, src )							\
	_asm _emit 0xF2									\
	_asm _emit 0x0F									\
	_asm _emit 0xF0									\
	_asm _emit ( ( (dst) & 7 ) << 3 ) | src
|
|
|
|
|
|
// Structure layout constants, in bytes, for use inside the inline assembly.
// TransformVerts asserts that DRAWVERT_SIZE, DRAWVERT_XYZ_OFFSET, JOINTWEIGHT_SIZE
// and JOINTMAT_SIZE match idDrawVert, idDrawVert::xyz, idVec4 and idJointMat.
// The remaining offsets presumably describe the other idDrawVert members in
// units of 4-byte floats -- verify against DrawVert.h before relying on them.
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)

// JOINTQUAT_SIZE is presumably sizeof( idJointQuat ); it is not checked in this file.
#define JOINTQUAT_SIZE				(7*4)
#define JOINTMAT_SIZE				(4*3*4)
#define JOINTWEIGHT_SIZE			(4*4)
|
|
|
|
|
|
/*
|
|
============
|
|
SSE3_Dot
|
|
============
|
|
*/
|
|
float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
|
|
float d;
|
|
__asm {
|
|
mov esi, v1
|
|
mov edi, v2
|
|
movaps xmm0, [esi]
|
|
mulps xmm0, [edi]
|
|
haddps( _xmm0, _xmm0 )
|
|
haddps( _xmm0, _xmm0 )
|
|
movss d, xmm0
|
|
}
|
|
return d;
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE3::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE3::GetName( void ) const {
	// Human readable list of the instruction sets this processor class stands for.
	const char *simdName = "MMX & SSE & SSE2 & SSE3";
	return simdName;
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE3::TransformVerts
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
	// Writes verts[i].xyz for numVerts vertices. Each position is the sum of
	// ( joint matrix * weight vector ) over the weights that belong to the vertex.
	//
	// index holds two ints per weight: the first is a byte offset into joints, the
	// second is zero while more weights follow for the same vertex and non-zero on
	// the last weight of a vertex. numWeights is not referenced by either code path.
#if 1

	// the assembly hard-codes these sizes and offsets
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
	assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

	// Register use:
	//   eax = negative byte offset from the end of verts, counts up to zero
	//   ecx = address one past the last vertex
	//   edx = cursor into index, esi = cursor into weights
	//   edi = base address of joints, ebx = byte offset of the current joint
	// movaps / mulps read weights and joints as aligned 16-byte operands.
	__asm
	{
		mov			eax, numVerts
		test		eax, eax
		jz			allDone						// nothing to transform
		imul		eax, DRAWVERT_SIZE			// eax = size of the vertex array in bytes

		mov			ecx, verts
		mov			edx, index
		mov			esi, weights
		mov			edi, joints

		add			ecx, eax
		neg			eax

	nextVert:
		// first weight of the vertex initializes the three row accumulators
		mov			ebx, [edx]
		movaps		xmm2, [esi]
		add			edx, 8						// step over one pair of ints
		movaps		xmm0, xmm2
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm1, xmm2

		mulps		xmm0, [edi+ebx+ 0]			// xmm0 = m0, m1, m2, t0
		mulps		xmm1, [edi+ebx+16]			// xmm1 = m3, m4, m5, t1
		mulps		xmm2, [edi+ebx+32]			// xmm2 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0		// second int of the pair just consumed

		jne			storeVert					// non-zero: the vertex has a single weight

	nextWeight:
		// every further weight is accumulated on top
		mov			ebx, [edx]
		movaps		xmm5, [esi]
		add			edx, 8
		movaps		xmm3, xmm5
		add			esi, JOINTWEIGHT_SIZE
		movaps		xmm4, xmm5

		mulps		xmm3, [edi+ebx+ 0]			// xmm3 = m0, m1, m2, t0
		mulps		xmm4, [edi+ebx+16]			// xmm4 = m3, m4, m5, t1
		mulps		xmm5, [edi+ebx+32]			// xmm5 = m6, m7, m8, t2

		cmp			dword ptr [edx-4], 0

		addps		xmm0, xmm3					// the SSE adds leave the flags of the cmp intact
		addps		xmm1, xmm4
		addps		xmm2, xmm5

		je			nextWeight

	storeVert:
		add			eax, DRAWVERT_SIZE			// sets the flags tested by the jl at the bottom

		haddps( _xmm0, _xmm1 )					// xmm0 = partial sums of row 0 and row 1
		haddps( _xmm2, _xmm0 )					// xmm2 = partial sums of row 2, then x, y

		movhps		[ecx+eax-DRAWVERT_SIZE+0], xmm2		// store x and y

		haddps( _xmm2, _xmm2 )					// xmm2[0] = z

		movss		[ecx+eax-DRAWVERT_SIZE+8], xmm2		// store z

		jl			nextVert
	allDone:
	}

#else

	// reference implementation of the assembly above
	int vertNum, weightNum;
	const byte *jointBytes = (byte *)joints;

	weightNum = 0;
	for( vertNum = 0; vertNum < numVerts; vertNum++ ) {
		idVec3 sum;

		sum = ( *(idJointMat *) ( jointBytes + index[weightNum*2+0] ) ) * weights[weightNum];
		while( index[weightNum*2+1] == 0 ) {
			weightNum++;
			sum += ( *(idJointMat *) ( jointBytes + index[weightNum*2+0] ) ) * weights[weightNum];
		}
		weightNum++;

		verts[vertNum].xyz = sum;
	}

#endif
}
|
|
|
|
#endif /* _MSC_VER */
|