dhewm3/neo/idlib/math/Simd_SSE3.cpp

368 lines
11 KiB
C++
Raw Normal View History

2011-11-22 21:28:15 +00:00
/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
2011-11-22 21:28:15 +00:00
2011-12-06 16:14:59 +00:00
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
2011-11-22 21:28:15 +00:00
Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
#include "Simd_SSE2.h"
#include "Simd_SSE3.h"
//===============================================================
//
// SSE3 implementation of idSIMDProcessor
//
//===============================================================
#if defined(__GNUC__) && defined(__SSE3__)
2011-11-22 21:28:15 +00:00
/*
============
idSIMD_SSE3::GetName
============
*/
const char * idSIMD_SSE3::GetName( void ) const {
return "MMX & SSE & SSE2 & SSE3";
}
2011-12-13 22:43:27 +00:00
#elif defined(_MSC_VER)
2011-11-22 21:28:15 +00:00
#include <xmmintrin.h>
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
#define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
/*
The first argument of an instruction macro is the destination
and the second argument is the source operand. The destination
operand can be _xmm0 to _xmm7 only. The source operand can be
any one of the registers _xmm0 to _xmm7 or _eax, _ecx, _edx, _esp,
_ebp, _ebx, _esi, or _edi that contains the effective address.
For instance: haddps xmm0, xmm1
becomes: haddps( _xmm0, _xmm1 )
and: haddps xmm0, [esi]
becomes: haddps( _xmm0, _esi )
The ADDRESS_ADDC macro can be used when the effective source address
is formed by adding a constant to a general purpose register.
For instance: haddps xmm0, [esi+48]
becomes: haddps( _xmm0, ADDRESS_ADDC( _esi, 48 ) )
The ADDRESS_ADDR macro can be used when the effective source address
is formed by adding two general purpose registers.
For instance: haddps xmm0, [esi+eax]
becomes: haddps( _xmm0, ADDRESS_ADDR( _esi, _eax ) )
The ADDRESS_ADDRC macro can be used when the effective source address
is formed by adding two general purpose registers and a constant.
The constant must be in the range [-128, 127].
For instance: haddps xmm0, [esi+eax+48]
becomes: haddps( _xmm0, ADDRESS_ADDRC( _esi, _eax, 48 ) )
The ADDRESS_SCALEADDR macro can be used when the effective source address is formed
by adding a scaled general purpose register to another general purpose register.
The scale must be either 1, 2, 4 or 8.
For instance: haddps xmm0, [esi+eax*4]
becomes: haddps( _xmm0, ADDRESS_SCALEADDR( _esi, _eax, 4 ) )
The ADDRESS_SCALEADDRC macro can be used when the effective source address is formed
by adding a scaled general purpose register to another general purpose register and
also adding a constant. The scale must be either 1, 2, 4 or 8. The constant must
be in the range [-128, 127].
For instance: haddps xmm0, [esi+eax*4+64]
becomes: haddps( _xmm0, ADDRESS_SCALEADDRC( _esi, _eax, 4, 64 ) )
*/
#define _eax 0x00
#define _ecx 0x01
#define _edx 0x02
#define _ebx 0x03
#define _esp 0x04
#define _ebp 0x05
#define _esi 0x06
#define _edi 0x07
#define _xmm0 0xC0
#define _xmm1 0xC1
#define _xmm2 0xC2
#define _xmm3 0xC3
#define _xmm4 0xC4
#define _xmm5 0xC5
#define _xmm6 0xC6
#define _xmm7 0xC7
#define RSCALE( s ) ( (s&2)<<5 ) | ( (s&4)<<5 ) | ( (s&8)<<3 ) | ( (s&8)<<4 )
#define ADDRESS_ADDC( reg0, constant ) 0x40 | ( reg0 & 7 ) \
_asm _emit constant
#define ADDRESS_ADDR( reg0, reg1 ) 0x04 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 )
#define ADDRESS_ADDRC( reg0, reg1, constant ) 0x44 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) \
_asm _emit constant
#define ADDRESS_SCALEADDR( reg0, reg1, scale ) 0x04 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale )
#define ADDRESS_SCALEADDRC( reg0, reg1, scale, constant ) 0x44 \
_asm _emit ( ( reg1 & 7 ) << 3 ) | ( reg0 & 7 ) | RSCALE( scale ) \
_asm _emit constant
// Packed Single-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1], dst[2]=dst[2]+src[2], dst[3]=dst[3]-src[3] )
#define addsubps( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0xD0 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Double-FP Add/Subtract ( dst[0]=dst[0]+src[0], dst[1]=dst[1]-src[1] )
#define addsubpd( dst, src ) \
_asm _emit 0x66 \
_asm _emit 0x0F \
_asm _emit 0xD0 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Single-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=dst[2]+dst[3], dst[2]=src[0]+src[1], dst[3]=src[2]+src[3] )
#define haddps( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x7C \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Double-FP Horizontal Add ( dst[0]=dst[0]+dst[1], dst[1]=src[0]+src[1] )
#define haddpd( dst, src ) \
_asm _emit 0x66 \
_asm _emit 0x0F \
_asm _emit 0x7C \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Single-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=dst[2]-dst[3], dst[2]=src[0]-src[1], dst[3]=src[2]-src[3] )
#define hsubps( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x7D \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Packed Double-FP Horizontal Subtract ( dst[0]=dst[0]-dst[1], dst[1]=src[0]-src[1] )
#define hsubpd( dst, src ) \
_asm _emit 0x66 \
_asm _emit 0x0F \
_asm _emit 0x7D \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move Packed Single-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0], dst[2]=src[2], dst[3]=src[2] )
#define movsldup( dst, src ) \
_asm _emit 0xF3 \
_asm _emit 0x0F \
_asm _emit 0x12 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move One Double-FP Low and Duplicate ( dst[0]=src[0], dst[1]=src[0] )
#define movdldup( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x12 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move Packed Single-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1], dst[2]=src[3], dst[3]=src[3] )
#define movshdup( dst, src ) \
_asm _emit 0xF3 \
_asm _emit 0x0F \
_asm _emit 0x16 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Move One Double-FP High and Duplicate ( dst[0]=src[1], dst[1]=src[1] )
#define movdhdup( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0x16 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
// Load Unaligned Integer 128 bits
#define lddqu( dst, src ) \
_asm _emit 0xF2 \
_asm _emit 0x0F \
_asm _emit 0xF0 \
_asm _emit ( ( dst & 7 ) << 3 ) | src
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)
#define JOINTQUAT_SIZE (7*4)
#define JOINTMAT_SIZE (4*3*4)
#define JOINTWEIGHT_SIZE (4*4)
/*
============
SSE3_Dot
============
*/
float SSE3_Dot( const idVec4 &v1, const idVec4 &v2 ) {
float d;
__asm {
mov esi, v1
mov edi, v2
movaps xmm0, [esi]
mulps xmm0, [edi]
haddps( _xmm0, _xmm0 )
haddps( _xmm0, _xmm0 )
movss d, xmm0
}
return d;
}
/*
============
idSIMD_SSE3::GetName
============
*/
const char * idSIMD_SSE3::GetName( void ) const {
return "MMX & SSE & SSE2 & SSE3";
}
/*
============
idSIMD_SSE3::TransformVerts
============
*/
void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
__asm
{
mov eax, numVerts
test eax, eax
jz done
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, index
mov esi, weights
mov edi, joints
add ecx, eax
neg eax
loopVert:
mov ebx, [edx]
movaps xmm2, [esi]
add edx, 8
movaps xmm0, xmm2
add esi, JOINTWEIGHT_SIZE
movaps xmm1, xmm2
mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
cmp dword ptr [edx-4], 0
jne doneWeight
loopWeight:
mov ebx, [edx]
movaps xmm5, [esi]
add edx, 8
movaps xmm3, xmm5
add esi, JOINTWEIGHT_SIZE
movaps xmm4, xmm5
mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
cmp dword ptr [edx-4], 0
addps xmm0, xmm3
addps xmm1, xmm4
addps xmm2, xmm5
je loopWeight
doneWeight:
add eax, DRAWVERT_SIZE
haddps( _xmm0, _xmm1 )
haddps( _xmm2, _xmm0 )
movhps [ecx+eax-DRAWVERT_SIZE+0], xmm2
haddps( _xmm2, _xmm2 )
movss [ecx+eax-DRAWVERT_SIZE+8], xmm2
jl loopVert
done:
}
#else
int i, j;
const byte *jointsPtr = (byte *)joints;
for( j = i = 0; i < numVerts; i++ ) {
idVec3 v;
v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
while( index[j*2+1] == 0 ) {
j++;
v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
}
j++;
verts[i].xyz = v;
}
#endif
}
#endif /* _WIN32 */