// quake4-sdk/source/idlib/math/Simd_SSE2.cpp

#include "../precompiled.h"
#pragma hdrstop
#include "Simd_generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
#include "Simd_SSE2.h"
#include "Simd_InstructionMacros.h"
//===============================================================
//
// SSE2 implementation of idSIMDProcessor
//
//===============================================================
#ifdef _WINDOWS
#include "Simd_InstructionMacros.h"
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
ALIGN4_INIT4( unsigned long SIMD_SP_firstSignBit, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_ZERO, IEEE_SP_ZERO );
ALIGN4_INIT1( unsigned long SIMD_SP_signBit, IEEE_SP_SIGN );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, ~IEEE_SP_SIGN );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, ~IEEE_SP_INF );
ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );
ALIGN4_INIT4( unsigned long SIMD_SP_clearLast, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 );
ALIGN4_INIT4( unsigned long SIMD_SP_clearFirstThree, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF );
ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x0, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN );
ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x1, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN );
ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x2, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN );
ALIGN4_INIT4( unsigned long SIMD_DW_zero, 0, 0, 0, 0 );
ALIGN16_INIT1( unsigned char SIMD_B_one, 1 );
ALIGN4_INIT4( unsigned long SIMD_DW_capTris_c0, 0, 0, 0, 1 );
ALIGN4_INIT4( unsigned long SIMD_DW_capTris_c1, 1, 1, 0, 0 );
ALIGN4_INIT4( unsigned long SIMD_DW_capTris_c2, 0, 1, 0, 0 );
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
ALIGN4_INIT1( float SIMD_SP_negInfinity, -idMath::INFINITY );
/*
============
idSIMD_SSE2::GetName
============
*/
const char * idSIMD_SSE2::GetName( void ) const {
return "MMX & SSE & SSE2";
}
#if 0 // the SSE2 code is ungodly slow
/*
============
idSIMD_SSE2::MatX_LowerTriangularSolve
solves x in Lx = b for the n * n sub-matrix of L
if skip > 0 the first skip elements of x are assumed to be valid already
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
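// The general path below is plain forward substitution; a scalar sketch of
// what it computes (for reference only, not a shipped fallback):
//
// for ( int i = skip; i < n; i++ ) {
//     double s = b[i];
//     const float *row = L[i];
//     for ( int j = 0; j < i; j++ ) {
//         s -= row[j] * x[j];
//     }
//     x[i] = (float) s;   // implicit unit diagonal, so no divide
// }
//
// The asm below widens the row/x products to double precision with
// cvtps2pd/mulpd/addpd, which is what makes this an SSE2 routine.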
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
int nc;
const float *lptr;
if ( skip >= n ) {
return;
}
lptr = L.ToFloatPtr();
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
#define NSKIP( n, s ) ((n<<3)|(s&7))
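// NSKIP packs ( n, skip ) into a single switch key: key = n * 8 + skip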
switch( NSKIP( n, skip ) ) {
case NSKIP( 1, 0 ): x[0] = b[0];
return;
case NSKIP( 2, 0 ): x[0] = b[0];
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
return;
case NSKIP( 3, 0 ): x[0] = b[0];
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
return;
case NSKIP( 4, 0 ): x[0] = b[0];
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
return;
case NSKIP( 5, 0 ): x[0] = b[0];
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
return;
case NSKIP( 6, 0 ): x[0] = b[0];
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
return;
case NSKIP( 7, 0 ): x[0] = b[0];
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
return;
}
return;
}
// process first 4 rows
switch( skip ) {
case 0: x[0] = b[0];
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
skip = 4;
}
lptr = L[skip];
__asm {
push ebx
mov eax, skip // eax = i
shl eax, 2 // eax = i*4
mov edx, n // edx = n
shl edx, 2 // edx = n*4
mov esi, x // esi = x
mov edi, lptr // edi = lptr
add esi, eax
add edi, eax
mov ebx, b // ebx = b
// aligned
looprow:
mov ecx, eax
neg ecx
cvtps2pd xmm0, [esi+ecx]
cvtps2pd xmm2, [edi+ecx]
mulpd xmm0, xmm2
cvtps2pd xmm1, [esi+ecx+8]
cvtps2pd xmm3, [edi+ecx+8]
mulpd xmm1, xmm3
add ecx, 20*4
jg donedot16
dot16:
cvtps2pd xmm2, [esi+ecx-(16*4)]
cvtps2pd xmm3, [edi+ecx-(16*4)]
cvtps2pd xmm4, [esi+ecx-(14*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(14*4)]
addpd xmm0, xmm2
cvtps2pd xmm2, [esi+ecx-(12*4)]
mulpd xmm4, xmm5
cvtps2pd xmm3, [edi+ecx-(12*4)]
addpd xmm1, xmm4
cvtps2pd xmm4, [esi+ecx-(10*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(10*4)]
addpd xmm0, xmm2
cvtps2pd xmm2, [esi+ecx-(8*4)]
mulpd xmm4, xmm5
cvtps2pd xmm3, [edi+ecx-(8*4)]
addpd xmm1, xmm4
cvtps2pd xmm4, [esi+ecx-(6*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(6*4)]
addpd xmm0, xmm2
cvtps2pd xmm2, [esi+ecx-(4*4)]
mulpd xmm4, xmm5
cvtps2pd xmm3, [edi+ecx-(4*4)]
addpd xmm1, xmm4
cvtps2pd xmm4, [esi+ecx-(2*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(2*4)]
addpd xmm0, xmm2
add ecx, 16*4
mulpd xmm4, xmm5
addpd xmm1, xmm4
jle dot16
donedot16:
sub ecx, 8*4
jg donedot8
dot8:
cvtps2pd xmm2, [esi+ecx-(8*4)]
cvtps2pd xmm3, [edi+ecx-(8*4)]
cvtps2pd xmm7, [esi+ecx-(6*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(6*4)]
addpd xmm0, xmm2
cvtps2pd xmm6, [esi+ecx-(4*4)]
mulpd xmm7, xmm5
cvtps2pd xmm3, [edi+ecx-(4*4)]
addpd xmm1, xmm7
cvtps2pd xmm4, [esi+ecx-(2*4)]
mulpd xmm6, xmm3
cvtps2pd xmm7, [edi+ecx-(2*4)]
addpd xmm0, xmm6
add ecx, 8*4
mulpd xmm4, xmm7
addpd xmm1, xmm4
donedot8:
sub ecx, 4*4
jg donedot4
dot4:
cvtps2pd xmm2, [esi+ecx-(4*4)]
cvtps2pd xmm3, [edi+ecx-(4*4)]
cvtps2pd xmm4, [esi+ecx-(2*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(2*4)]
addpd xmm0, xmm2
add ecx, 4*4
mulpd xmm4, xmm5
addpd xmm1, xmm4
donedot4:
addpd xmm0, xmm1
movaps xmm1, xmm0
shufpd xmm1, xmm1, R_SHUFFLE_PD( 1, 0 )
addsd xmm0, xmm1
sub ecx, 4*4
jz dot0
add ecx, 4
jz dot1
add ecx, 4
jz dot2
//dot3:
cvtss2sd xmm1, [esi-(3*4)]
cvtss2sd xmm2, [edi-(3*4)]
mulsd xmm1, xmm2
addsd xmm0, xmm1
dot2:
cvtss2sd xmm3, [esi-(2*4)]
cvtss2sd xmm4, [edi-(2*4)]
mulsd xmm3, xmm4
addsd xmm0, xmm3
dot1:
cvtss2sd xmm5, [esi-(1*4)]
cvtss2sd xmm6, [edi-(1*4)]
mulsd xmm5, xmm6
addsd xmm0, xmm5
dot0:
cvtss2sd xmm1, [ebx+eax]
subsd xmm1, xmm0
cvtsd2ss xmm0, xmm1
movss [esi], xmm0
add eax, 4
cmp eax, edx
jge done
add esi, 4
mov ecx, nc
shl ecx, 2
add edi, ecx
add edi, 4
jmp looprow
// done
done:
pop ebx
}
}
/*
============
idSIMD_SSE2::MatX_LowerTriangularSolveTranspose
solves x in L'x = b for the n * n sub-matrix of L
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
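// The general path is backward substitution against the transpose, so the
// i-th equation uses column i of L; a scalar sketch of what it computes
// (for reference only, not a shipped fallback):
//
// for ( int i = n - 1; i >= 0; i-- ) {
//     double s = b[i];
//     for ( int j = i + 1; j < n; j++ ) {
//         s -= L[j][i] * x[j];
//     }
//     x[i] = (float) s;
// }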
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
int nc;
const float *lptr;
lptr = L.ToFloatPtr();
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
switch( n ) {
case 0:
return;
case 1:
x[0] = b[0];
return;
case 2:
x[1] = b[1];
x[0] = b[0] - lptr[1*nc+0] * x[1];
return;
case 3:
x[2] = b[2];
x[1] = b[1] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 4:
x[3] = b[3];
x[2] = b[2] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 5:
x[4] = b[4];
x[3] = b[3] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 6:
x[5] = b[5];
x[4] = b[4] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 7:
x[6] = b[6];
x[5] = b[5] - lptr[6*nc+5] * x[6];
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
}
return;
}
int i, j, m;
float *xptr;
double s0;
// if the number of columns is not a multiple of 2 we're screwed for alignment.
// however, if the number of columns is a multiple of 2 but the number of rows
// to be processed is not a multiple of 2, we can still run 8 byte aligned
m = n;
if ( m & 1 ) {
m--;
x[m] = b[m];
lptr = L[m] + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows_1:
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
xor ecx, ecx
sub eax, m
neg eax
jz done4x4_1
process4x4_1: // process 4x4 blocks
cvtps2pd xmm3, [edi]
cvtps2pd xmm4, [edi+8]
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+0]
shufpd xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+4]
shufpd xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
mulpd xmm1, xmm7
cvtps2pd xmm3, [edi]
mulpd xmm6, xmm7
cvtps2pd xmm4, [edi+8]
subpd xmm0, xmm1
subpd xmm2, xmm6
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+8]
shufpd xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+12]
shufpd xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
mulpd xmm1, xmm7
add ecx, 4
mulpd xmm6, xmm7
cmp ecx, eax
subpd xmm0, xmm1
subpd xmm2, xmm6
jl process4x4_1
done4x4_1: // process left over of the 4 rows
cvtps2pd xmm3, [edi]
cvtps2pd xmm4, [edi+8]
cvtss2sd xmm5, [esi+4*ecx]
shufpd xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
mulpd xmm3, xmm5
mulpd xmm4, xmm5
subpd xmm0, xmm3
subpd xmm2, xmm4
imul ecx, edx
sub edi, ecx
neg eax
add eax, m
sub eax, 4
movapd xmm1, xmm0
shufpd xmm1, xmm1, R_SHUFFLE_PD( 1, 1 )
movapd xmm3, xmm2
shufpd xmm3, xmm3, R_SHUFFLE_PD( 1, 1 )
sub edi, edx
cvtsd2ss xmm7, xmm3
movss [esi-4], xmm7 // xptr[-1] = s3
movsd xmm4, xmm3
movsd xmm5, xmm3
cvtss2sd xmm7, [edi+8]
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
cvtss2sd xmm7, [edi+4]
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
cvtss2sd xmm7, [edi]
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
subsd xmm2, xmm3
cvtsd2ss xmm7, xmm2
movss [esi-8], xmm7 // xptr[-2] = s2
movsd xmm6, xmm2
sub edi, edx
subsd xmm0, xmm5
subsd xmm1, xmm4
cvtss2sd xmm7, [edi+4]
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
cvtss2sd xmm7, [edi]
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
subsd xmm1, xmm2
cvtsd2ss xmm7, xmm1
movss [esi-12], xmm7 // xptr[-3] = s1
subsd xmm0, xmm6
sub edi, edx
cmp eax, 4
cvtss2sd xmm7, [edi]
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
subsd xmm0, xmm1
cvtsd2ss xmm7, xmm0
movss [esi-16], xmm7 // xptr[-4] = s0
jl done4rows_1
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows_1
done4rows_1:
pop ebx
}
}
else {
lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows:
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
sub eax, m
jz done4x4
neg eax
xor ecx, ecx
process4x4: // process 4x4 blocks
cvtps2pd xmm3, [edi]
cvtps2pd xmm4, [edi+8]
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+0]
shufpd xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+4]
shufpd xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
mulpd xmm1, xmm7
cvtps2pd xmm3, [edi]
mulpd xmm6, xmm7
cvtps2pd xmm4, [edi+8]
subpd xmm0, xmm1
subpd xmm2, xmm6
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+8]
shufpd xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+12]
shufpd xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
mulpd xmm1, xmm7
add ecx, 4
mulpd xmm6, xmm7
cmp ecx, eax
subpd xmm0, xmm1
subpd xmm2, xmm6
jl process4x4
imul ecx, edx
sub edi, ecx
neg eax
done4x4: // process left over of the 4 rows
add eax, m
sub eax, 4
movapd xmm1, xmm0
shufpd xmm1, xmm1, R_SHUFFLE_PD( 1, 1 )
movapd xmm3, xmm2
shufpd xmm3, xmm3, R_SHUFFLE_PD( 1, 1 )
sub edi, edx
cvtsd2ss xmm7, xmm3
movss [esi-4], xmm7 // xptr[-1] = s3
movsd xmm4, xmm3
movsd xmm5, xmm3
cvtss2sd xmm7, [edi+8]
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
cvtss2sd xmm7, [edi+4]
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
cvtss2sd xmm7, [edi]
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
subsd xmm2, xmm3
cvtsd2ss xmm7, xmm2
movss [esi-8], xmm7 // xptr[-2] = s2
movsd xmm6, xmm2
sub edi, edx
subsd xmm0, xmm5
subsd xmm1, xmm4
cvtss2sd xmm7, [edi+4]
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
cvtss2sd xmm7, [edi]
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
subsd xmm1, xmm2
cvtsd2ss xmm7, xmm1
movss [esi-12], xmm7 // xptr[-3] = s1
subsd xmm0, xmm6
sub edi, edx
cmp eax, 4
cvtss2sd xmm7, [edi]
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
subsd xmm0, xmm1
cvtsd2ss xmm7, xmm0
movss [esi-16], xmm7 // xptr[-4] = s0
jl done4rows
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows
done4rows:
pop ebx
}
}
// process left over rows
for ( i = (m&3)-1; i >= 0; i-- ) {
s0 = b[i];
lptr = L[i+1] + i;
for ( j = i + 1; j < m; j++ ) {
s0 -= lptr[0] * x[j];
lptr += nc;
}
x[i] = s0;
}
}
#endif
/*
============
idSIMD_SSE2::ConvertJointQuatsToJointMats
============
*/
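// The reference implementation below expands the standard unit quaternion to
// rotation matrix conversion, with the translation packed into column 3:
//
//     | 1-2(yy+zz)   2(xy+wz)    2(xz-wy)   t.x |
// M = |  2(xy-wz)   1-2(xx+zz)   2(yz+wx)   t.y |
//     |  2(xz+wy)    2(yz-wx)   1-2(xx+yy)  t.z |
//
// The asm version builds all nine 2ab products with two mulps and one mulss,
// then uses the SIMD_SP_quat2mat_x* sign masks to flip signs in place of
// separate subtractions.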
void VPCALL idSIMD_SSE2::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
#if 1
assert_16_byte_aligned( jointMats );
assert_16_byte_aligned( jointQuats );
__asm {
mov eax, numJoints
shl eax, JOINTQUAT_SIZE_SHIFT
mov esi, jointQuats
mov edi, jointMats
add esi, eax
neg eax
jz done
loopQuat:
movaps xmm0, [esi+eax+JOINTQUAT_Q_OFFSET] // xmm0 = q.x, q.y, q.z, q.w
movaps xmm6, [esi+eax+JOINTQUAT_T_OFFSET] // xmm6 = t.x, t.y, t.z, w
add edi, JOINTMAT_SIZE
movaps xmm1, xmm0 // xmm1 = x, y, z, w
addps xmm1, xmm1 // xmm1 = x2, y2, z2, w2
add eax, JOINTQUAT_SIZE
// calculate the 9 products
pshufd xmm2, xmm0, R_SHUFFLE_D( 1, 0, 0, 1 ) // xmm2 = y, x, x, y
pshufd xmm3, xmm1, R_SHUFFLE_D( 1, 1, 2, 2 ) // xmm3 = y2, y2, z2, z2
mulps xmm2, xmm3 // xmm2 = yy2, xy2, xz2, yz2
pshufd xmm4, xmm0, R_SHUFFLE_D( 2, 3, 3, 3 ) // xmm4 = z, w, w, w
pshufd xmm5, xmm1, R_SHUFFLE_D( 2, 2, 1, 0 ) // xmm5 = z2, z2, y2, x2
mulps xmm4, xmm5 // xmm4 = zz2, wz2, wy2, wx2
mulss xmm0, xmm1 // xmm0 = xx2, y, z, w (only the low element is used below)
// calculate the last two elements of the third row
movss xmm7, SIMD_SP_one // xmm7 = 1, 0, 0, 0
subss xmm7, xmm0 // xmm7 = -xx2+1, 0, 0, 0
subss xmm7, xmm2 // xmm7 = -xx2-yy2+1, 0, 0, 0
shufps xmm7, xmm6, R_SHUFFLE_PS( 0, 1, 2, 3 ) // xmm7 = -xx2-yy2+1, 0, t.z, w
// calculate first row
xorps xmm2, SIMD_SP_quat2mat_x0 // xmm2 = yy2, -xy2, -xz2, -yz2
xorps xmm4, SIMD_SP_quat2mat_x1 // xmm4 = -zz2, wz2, -wy2, -wx2
addss xmm4, SIMD_SP_one // xmm4 = -zz2+1, wz2, -wy2, -wx2
movaps xmm3, xmm4 // xmm3 = -zz2+1, wz2, -wy2, -wx2
subps xmm3, xmm2 // xmm3 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2
movaps [edi-JOINTMAT_SIZE+0*16+0*4], xmm3 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2
movss [edi-JOINTMAT_SIZE+0*16+3*4], xmm6 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, t.x
// calculate second row
movss xmm2, xmm0 // xmm2 = xx2, -xy2, -xz2, -yz2
xorps xmm4, SIMD_SP_quat2mat_x2 // xmm4 = -zz2+1, -wz2, wy2, wx2
subps xmm4, xmm2 // xmm4 = -xx2-zz2+1, xy2-wz2, xz2+wy2, yz2+wx2
shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) // xmm6 = t.y, t.z, w, t.x
shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 0, 3, 2 ) // xmm4 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2
movaps [edi-JOINTMAT_SIZE+1*16+0*4], xmm4 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2
movss [edi-JOINTMAT_SIZE+1*16+3*4], xmm6 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, t.y
// calculate third row
movhlps xmm3, xmm4 // xmm3 = yz2+wx2, xz2+wy2, xz2-wy2, yz2-wx2
shufps xmm3, xmm7, R_SHUFFLE_PS( 1, 3, 0, 2 ) // xmm3 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z
movaps [edi-JOINTMAT_SIZE+2*16+0*4], xmm3 // row2 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z
jl loopQuat
done:
}
#else
for ( int i = 0; i < numJoints; i++ ) {
const float *q = &jointQuats[i].q;
float *m = jointMats[i].mat;
float x2 = q[0] + q[0];
float y2 = q[1] + q[1];
float z2 = q[2] + q[2];
float w2 = q[3] + q[3];
float yy2 = q[1] * y2;
float xy2 = q[0] * y2;
float xz2 = q[0] * z2;
float yz2 = q[1] * z2;
float zz2 = q[2] * z2;
float wz2 = q[3] * z2;
float wy2 = q[3] * y2;
float wx2 = q[3] * x2;
float xx2 = q[0] * x2;
m[0*4+0] = - yy2 - zz2 + 1.0f;
m[0*4+1] = xy2 + wz2;
m[0*4+2] = xz2 - wy2;
m[0*4+3] = q[4];
m[1*4+0] = xy2 - wz2;
m[1*4+1] = - xx2 - zz2 + 1.0f;
m[1*4+2] = yz2 + wx2;
m[1*4+3] = q[5];
m[2*4+0] = xz2 + wy2;
m[2*4+1] = yz2 - wx2;
m[2*4+2] = - xx2 - yy2 + 1.0f;
m[2*4+3] = q[6];
}
#endif
}
/*
============
idSIMD_SSE2::MultiplyJoints
============
*/
/*
ALIGN4_INIT4( unsigned long SIMD_SP_quatmul_xor0, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_SIGN );
ALIGN4_INIT4( unsigned long SIMD_SP_quatmul_xor1, IEEE_SP_ZERO, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN );
ALIGN4_INIT4( unsigned long SIMD_SP_quatmul_xor2, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_ZERO, IEEE_SP_SIGN );
void VPCALL idSIMD_SSE2::MultiplyJoints( idJointQuat *result, const idJointQuat *joints1, const idJointQuat *joints2, const int numJoints ) {
#if 1
assert_16_byte_aligned( result );
assert_16_byte_aligned( joints1 );
assert_16_byte_aligned( joints2 );
__asm {
mov eax, numJoints
test eax, eax
jz done
mov ecx, joints1
mov edx, joints2
mov edi, result
imul eax, JOINTQUAT_SIZE
add ecx, eax
add edx, eax
add edi, eax
neg eax
loopQuat:
movaps xmm0, [ecx+eax] // Ax, Ay, Az, Aw 6/1
movaps xmm1, [edx+eax] // Bx, By, Bz, Bw 6/1
pshufd xmm2, xmm0, 0x00 // Ax, Ax, Ax, Ax 4/2
pshufd xmm3, xmm1, 0x1B // BW, BZ, BY, BX 4/2
pshufd xmm4, xmm0, 0x55 // Ay, Ay, Ay, Ay 4/2
pshufd xmm5, xmm1, 0x4E // Bz, Bw, Bx, By 4/2
mulps xmm2, xmm3 // Ax*Bw, Ax*Bz, Ax*By, Ax*Bx 6/2
pshufd xmm6, xmm0, 0xAA // Az, Az, Az, Az 4/2
mulps xmm4, xmm5 // Ay*Bz, Ay*Bw, Ay*Bx, Ay*By 6/2
pshufd xmm7, xmm1, 0xB1 // By, Bx, Bw, Bz 4/2
xorps xmm2, SIMD_SP_quatmul_xor0 // Ax*Bw, -Ax*Bz, Ax*By, -Ax*Bx 4/2
pshufd xmm0, xmm0, 0xFF // Aw, Aw, Aw, Aw 4/2
mulps xmm6, xmm7 // Az*By, Az*Bx, Az*Bw, Az*Bz 6/2
xorps xmm4, SIMD_SP_quatmul_xor1 // Ay*Bz, Ay*Bw, -Ay*Bx, -Ay*By 4/2
mulps xmm0, xmm1 // Aw*Bx, Aw*By, Aw*Bz, Aw*Bw 6/2
addps xmm2, xmm4 // Ax*Bw+Ay*Bz, -Ax*Bz+Ay*Bw, Ax*By-Ay*Bx, -Ax*Bx-Ay*By 4/2
xorps xmm6, SIMD_SP_quatmul_xor2 // -Az*By, Az*Bx, Az*Bw, -Az*Bz 4/2
addps xmm0, xmm2 // add to the result 4/2
addps xmm0, xmm6 // add to the result 4/2
movaps [edi+eax], xmm0 // save result 6/1
add eax, JOINTQUAT_SIZE
jl loopQuat
}
#else
int i;
for ( i = 0; i < numJoints; i++ ) {
result[i].q = joints1[i].q * joints2[i].q;
}
#endif
}
*/
/*
============
idSIMD_SSE2::TransformJoints
============
*/
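// Each joint matrix is concatenated in place with its parent's matrix, so a
// parent must already be final when its children are reached; the generic
// fallback below asserts parents[i] < i for exactly this reason.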
void VPCALL idSIMD_SSE2::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1
assert_16_byte_aligned( jointMats );
__asm {
mov ecx, firstJoint
mov eax, lastJoint
sub eax, ecx
jl done
shl ecx, 2 // ecx = firstJoint * 4
mov edi, parents
add edi, ecx // edi = &parents[firstJoint]
lea ecx, [ecx+ecx*2]
shl ecx, 2 // ecx = firstJoint * JOINTMAT_SIZE
mov esi, jointMats // esi = jointMats
shl eax, 2 // eax = ( lastJoint - firstJoint ) * 4
add edi, eax
neg eax
loopJoint:
mov edx, [edi+eax]
movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
lea edx, [edx+edx*2]
movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1
shl edx, 4 // edx = parents[i] * JOINTMAT_SIZE
movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2
movaps xmm7, [esi+edx+ 0]
pshufd xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm4, xmm0
pshufd xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm5, xmm1
addps xmm4, xmm5
add ecx, JOINTMAT_SIZE
add eax, 4
pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm6, xmm2
addps xmm4, xmm6
andps xmm7, SIMD_SP_clearFirstThree
addps xmm4, xmm7
movaps [esi+ecx-JOINTMAT_SIZE+ 0], xmm4
movaps xmm3, [esi+edx+16]
pshufd xmm5, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm5, xmm0
pshufd xmm6, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm6, xmm1
addps xmm5, xmm6
pshufd xmm4, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm4, xmm2
addps xmm5, xmm4
andps xmm3, SIMD_SP_clearFirstThree
addps xmm5, xmm3
movaps [esi+ecx-JOINTMAT_SIZE+16], xmm5
movaps xmm7, [esi+edx+32]
pshufd xmm6, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm6, xmm0
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm4, xmm1
addps xmm6, xmm4
pshufd xmm3, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm3, xmm2
addps xmm6, xmm3
andps xmm7, SIMD_SP_clearFirstThree
addps xmm6, xmm7
movaps [esi+ecx-JOINTMAT_SIZE+32], xmm6
jle loopJoint
done:
}
#else
int i;
for( i = firstJoint; i <= lastJoint; i++ ) {
assert( parents[i] < i );
jointMats[i] *= jointMats[parents[i]];
}
#endif
}
/*
============
idSIMD_SSE2::UntransformJoints
============
*/
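// Inverse of TransformJoints: joints are processed from lastJoint down to
// firstJoint so every child is untransformed before its parent changes,
// matching the descending loop in the generic fallback below.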
void VPCALL idSIMD_SSE2::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1
assert_16_byte_aligned( jointMats );
__asm {
mov edx, firstJoint
mov eax, lastJoint
mov ecx, eax
sub eax, edx
jl done
mov esi, jointMats // esi = jointMats
lea ecx, [ecx+ecx*2]
shl ecx, 4 // ecx = lastJoint * JOINTMAT_SIZE
shl edx, 2
mov edi, parents
add edi, edx // edi = &parents[firstJoint]
shl eax, 2 // eax = ( lastJoint - firstJoint ) * 4
loopJoint:
mov edx, [edi+eax]
movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
lea edx, [edx+edx*2]
movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1
shl edx, 4 // edx = parents[i] * JOINTMAT_SIZE
movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2
movss xmm7, [esi+edx+12]
shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 2, 3, 0 )
subps xmm0, xmm7
movss xmm6, [esi+edx+28]
shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 )
subps xmm1, xmm6
movss xmm5, [esi+edx+44]
shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 2, 3, 0 )
subps xmm2, xmm5
sub ecx, JOINTMAT_SIZE
sub eax, 4
movaps xmm7, [esi+edx+ 0]
pshufd xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm3, xmm0
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm4, xmm0
pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm5, xmm0
movaps xmm7, [esi+edx+16]
pshufd xmm0, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm0, xmm1
addps xmm3, xmm0
pshufd xmm6, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm6, xmm1
addps xmm4, xmm6
pshufd xmm0, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm0, xmm1
addps xmm5, xmm0
movaps xmm7, [esi+edx+32]
pshufd xmm6, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm6, xmm2
addps xmm3, xmm6
movaps [esi+ecx+JOINTMAT_SIZE+ 0], xmm3
pshufd xmm1, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm1, xmm2
addps xmm4, xmm1
movaps [esi+ecx+JOINTMAT_SIZE+16], xmm4
pshufd xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm6, xmm2
addps xmm5, xmm6
movaps [esi+ecx+JOINTMAT_SIZE+32], xmm5
jge loopJoint
done:
}
#else
int i;
for( i = lastJoint; i >= firstJoint; i-- ) {
assert( parents[i] < i );
jointMats[i] /= jointMats[parents[i]];
}
#endif
}
/*
============
idSIMD_SSE2::MultiplyJoints
============
*/
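// An idJointMat is stored as three rows of four floats ( a 3x4 rotation plus
// translation column ), so JOINTMAT_SIZE is 48 bytes and the rows sit at
// byte offsets 0, 16 and 32, which is how the movaps loads below address them.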
void VPCALL idSIMD_SSE2::MultiplyJoints( idJointMat *result, const idJointMat *joints1, const idJointMat *joints2, const int numJoints ) {
#if 1
assert_16_byte_aligned( result );
assert_16_byte_aligned( joints1 );
assert_16_byte_aligned( joints2 );
__asm {
mov eax, numJoints
test eax, eax
jz done
mov ecx, joints1
mov edx, joints2
mov edi, result
imul eax, JOINTMAT_SIZE
add ecx, eax
add edx, eax
add edi, eax
neg eax
loopJoint:
movaps xmm0, [edx+eax+ 0]
movaps xmm1, [edx+eax+16]
movaps xmm2, [edx+eax+32]
movaps xmm7, [ecx+eax+ 0]
pshufd xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm3, xmm0
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm4, xmm1
addps xmm3, xmm4
add eax, JOINTMAT_SIZE
pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm5, xmm2
addps xmm3, xmm5
andps xmm7, SIMD_SP_clearFirstThree
addps xmm3, xmm7
movaps [edi+eax-JOINTMAT_SIZE+0], xmm3
movaps xmm7, [ecx+eax-JOINTMAT_SIZE+16]
pshufd xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm3, xmm0
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm4, xmm1
addps xmm3, xmm4
pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm5, xmm2
addps xmm3, xmm5
andps xmm7, SIMD_SP_clearFirstThree
addps xmm3, xmm7
movaps [edi+eax-JOINTMAT_SIZE+16], xmm3
movaps xmm7, [ecx+eax-JOINTMAT_SIZE+32]
pshufd xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
mulps xmm3, xmm0
pshufd xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
mulps xmm4, xmm1
addps xmm3, xmm4
pshufd xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
mulps xmm5, xmm2
addps xmm3, xmm5
andps xmm7, SIMD_SP_clearFirstThree
addps xmm3, xmm7
movaps [edi+eax-JOINTMAT_SIZE+32], xmm3
jl loopJoint
done:
}
#else
int i;
for ( i = 0; i < numJoints; i++ ) {
idJointMat::Multiply( result[i], joints1[i], joints2[i] );
}
#endif
}
#pragma warning( disable : 4731 ) // frame pointer register 'ebx' modified by inline assembly code
/*
============
idSIMD_SSE2::TransformVertsNew
============
*/
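// Rough scalar equivalent (a sketch; the field names follow the JOINTWEIGHT_*
// offset macros, and the per-weight scale appears to be pre-multiplied into
// the base vectors, since this path never loads an explicit weight):
//
// for each vertex:
//     float row[3][4] = componentwise sums, over the vertex's weights k, of
//         base[k] * joints[ weight[k].jointMatOffset ].row[j]
//     vert.xyz[j] = row[j][0] + row[j][1] + row[j][2] + row[j][3];
//     expand tmpMin/tmpMax by vert.xyz for the bounds
//
// A weight record whose nextVertexOffset equals JOINTWEIGHT_SIZE is the last
// one for its vertex, which is what the cmp/je and cmp/jne below test.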
void VPCALL idSIMD_SSE2::TransformVertsNew( idDrawVert *verts, const int numVerts, idBounds &bounds, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
ALIGN16( float tmpMin[4] );
ALIGN16( float tmpMax[4] );
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm {
push ebx
mov eax, numVerts
test eax, eax
jz done
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
add ecx, eax
neg eax
movaps xmm6, SIMD_SP_infinity
movaps xmm7, SIMD_SP_negInfinity
movaps tmpMin, xmm6
movaps tmpMax, xmm7
loopVert:
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
movaps xmm2, [esi]
add edx, JOINTWEIGHT_SIZE
movaps xmm0, xmm2
add esi, BASEVECTOR_SIZE
movaps xmm1, xmm2
mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
je doneWeight
loopWeight:
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
movaps xmm5, [esi]
add edx, JOINTWEIGHT_SIZE
movaps xmm3, xmm5
add esi, BASEVECTOR_SIZE
movaps xmm4, xmm5
mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
addps xmm0, xmm3
addps xmm1, xmm4
addps xmm2, xmm5
jne loopWeight
doneWeight:
add eax, DRAWVERT_SIZE
movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
unpckhps xmm0, xmm1 // xmm0 = m2, m5, t0, t1
addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6
pshufd xmm5, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8
addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2
movss xmm6, xmm5
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm5
movaps xmm7, xmm6
minps xmm7, tmpMin
maxps xmm6, tmpMax
movaps tmpMin, xmm7
movaps tmpMax, xmm6
jl loopVert
done:
pop ebx
mov esi, bounds
movaps xmm6, tmpMin
movaps xmm7, tmpMax
movhps [esi+ 0], xmm6
movss [esi+ 8], xmm6
movhps [esi+12], xmm7
movss [esi+20], xmm7
}
}
/*
============
idSIMD_SSE2::TransformVertsAndTangents
============
*/
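// Same weighting scheme as TransformVertsNew, except the per-weight scale is
// loaded explicitly ( JOINTWEIGHT_WEIGHT_OFFSET ) and four consecutive base
// vectors per vertex are transformed: position, normal and both tangents.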
void VPCALL idSIMD_SSE2::TransformVertsAndTangents( idDrawVert *verts, const int numVerts, idBounds &bounds, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
ALIGN16( float tmpMin[4] );
ALIGN16( float tmpMax[4] );
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm {
push ebx
mov eax, numVerts
test eax, eax
jz done
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
add ecx, eax
neg eax
movaps xmm6, SIMD_SP_infinity
movaps xmm7, SIMD_SP_negInfinity
movaps tmpMin, xmm6
movaps tmpMax, xmm7
loopVert:
movss xmm0, [edx+JOINTWEIGHT_WEIGHT_OFFSET]
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
add edx, JOINTWEIGHT_SIZE
movaps xmm1, xmm0
movaps xmm2, xmm0
mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
je doneWeight
loopWeight:
movss xmm3, [edx+JOINTWEIGHT_WEIGHT_OFFSET]
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 )
add edx, JOINTWEIGHT_SIZE
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
addps xmm0, xmm3
addps xmm1, xmm4
addps xmm2, xmm5
jne loopWeight
doneWeight:
add esi, 4*BASEVECTOR_SIZE
add eax, DRAWVERT_SIZE
// transform vertex
movaps xmm3, [esi-4*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4
unpckhps xmm3, xmm4 // xmm3 = m2, m5, t0, t1
addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2
movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss xmm6, xmm7
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7
movaps xmm5, xmm6
minps xmm5, tmpMin
maxps xmm6, tmpMax
movaps tmpMin, xmm5
movaps tmpMax, xmm6
// transform normal
movaps xmm3, [esi-3*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4
unpckhps xmm3, xmm4 // xmm3 = m2, m5, t0, t1
addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2
movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7
// transform first tangent
movaps xmm3, [esi-2*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4
unpckhps xmm3, xmm4 // xmm3 = m2, m5, t0, t1
addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2
movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+8], xmm7
// transform second tangent
movaps xmm3, [esi-1*BASEVECTOR_SIZE]
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
unpckhps xmm0, xmm1 // xmm0 = m2, m5, t0, t1
addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+8], xmm7
jl loopVert
done:
pop ebx
mov esi, bounds
movaps xmm6, tmpMin
movaps xmm7, tmpMax
movhps [esi+ 0], xmm6
movss [esi+ 8], xmm6
movhps [esi+12], xmm7
movss [esi+20], xmm7
}
}
/*
============
idSIMD_SSE2::TransformVertsAndTangentsFast
============
*/
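// This "fast" variant applies only the first ( presumably dominant ) joint of
// each vertex: it loads that joint's matrix directly, without a weight
// multiply, and skips the rest of the weight list via the record's stored
// next-vertex offset.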
void VPCALL idSIMD_SSE2::TransformVertsAndTangentsFast( idDrawVert *verts, const int numVerts, idBounds &bounds, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
ALIGN16( float tmpMin[4] );
ALIGN16( float tmpMax[4] );
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm {
push ebx
mov eax, numVerts
test eax, eax
jz done
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
add ecx, eax
neg eax
movaps xmm6, SIMD_SP_infinity
movaps xmm7, SIMD_SP_negInfinity
movaps tmpMin, xmm6
movaps tmpMax, xmm7
loopVert:
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
add esi, 4*BASEVECTOR_SIZE
movaps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
movaps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
movaps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
add edx, dword ptr [edx+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET]
add eax, DRAWVERT_SIZE
// transform vertex
movaps xmm3, [esi-4*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4
unpckhps xmm3, xmm4 // xmm3 = m2, m5, t0, t1
addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2
movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss xmm6, xmm7
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7
movaps xmm5, xmm6
minps xmm5, tmpMin
maxps xmm6, tmpMax
movaps tmpMin, xmm5
movaps tmpMax, xmm6
// transform normal
movaps xmm3, [esi-3*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4
unpckhps xmm3, xmm4 // xmm3 = m2, m5, t0, t1
addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2
movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7
// transform first tangent
movaps xmm3, [esi-2*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4
unpckhps xmm3, xmm4 // xmm3 = m2, m5, t0, t1
addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2
movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+8], xmm7
// transform second tangent
movaps xmm3, [esi-1*BASEVECTOR_SIZE]
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
unpckhps xmm0, xmm1 // xmm0 = m2, m5, t0, t1
addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+0], xmm6
pshufd xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8
addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+8], xmm7
jl loopVert
done:
pop ebx
mov esi, bounds
movaps xmm6, tmpMin
movaps xmm7, tmpMax
movhps [esi+ 0], xmm6
movss [esi+ 8], xmm6
movhps [esi+12], xmm7
movss [esi+20], xmm7
}
}
/*
============
idSIMD_SSE2::ShadowVolume_CountFacing
============
*/
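// The facing bytes are all 0 or 1, so within one 256 byte block each byte
// lane is summed at most 16 times and paddusb can never saturate; the block
// totals are then widened to words and dwords before being accumulated.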
int VPCALL idSIMD_SSE2::ShadowVolume_CountFacing( const byte *facing, const int numFaces ) {
#if 1
ALIGN16( int n[4]; )
__asm {
mov eax, numFaces
mov edi, facing
test eax, eax
jz done
pxor xmm6, xmm6
pxor xmm7, xmm7
sub eax, 256
jl run16
loop256:
movdqa xmm0, [edi+ 0*16]
movdqa xmm1, [edi+ 1*16]
movdqa xmm2, [edi+ 2*16]
movdqa xmm3, [edi+ 3*16]
paddusb xmm0, [edi+ 4*16]
paddusb xmm1, [edi+ 5*16]
paddusb xmm2, [edi+ 6*16]
paddusb xmm3, [edi+ 7*16]
paddusb xmm0, [edi+ 8*16]
paddusb xmm1, [edi+ 9*16]
paddusb xmm2, [edi+10*16]
paddusb xmm3, [edi+11*16]
paddusb xmm0, [edi+12*16]
paddusb xmm1, [edi+13*16]
paddusb xmm2, [edi+14*16]
paddusb xmm3, [edi+15*16]
paddusb xmm0, xmm1
paddusb xmm2, xmm3
paddusb xmm0, xmm2
add edi, 256
sub eax, 256
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7
punpckhbw xmm1, xmm7
paddusw xmm0, xmm1
movdqa xmm1, xmm0
punpcklwd xmm0, xmm7
punpckhwd xmm1, xmm7
paddd xmm0, xmm1
paddd xmm6, xmm0
jge loop256
run16:
pxor xmm0, xmm0
add eax, 256 - 16
jl run4
loop16:
paddusb xmm0, [edi]
add edi, 16
sub eax, 16
jge loop16
run4:
add eax, 16 - 4
jl run1
pxor xmm1, xmm1
loop4:
movd xmm1, [edi]
paddusb xmm0, xmm1
add edi, 4
sub eax, 4
jge loop4
run1:
movdqa xmm1, xmm0
punpcklbw xmm0, xmm7
punpckhbw xmm1, xmm7
paddusw xmm0, xmm1
movdqa xmm1, xmm0
punpcklwd xmm0, xmm7
punpckhwd xmm1, xmm7
paddd xmm0, xmm1
paddd xmm6, xmm0
movdqa n, xmm6
add eax, 4-1
jl done
mov edx, dword ptr n
loop1:
movzx ecx, byte ptr [edi]
add edx, ecx
add edi, 1
sub eax, 1
jge loop1
mov dword ptr n, edx
done:
}
return n[0] + n[1] + n[2] + n[3];
#else
int i, n0, n1, n2, n3;
n0 = n1 = n2 = n3 = 0;
for ( i = 0; i < numFaces - 3; i += 4 ) {
n0 += facing[i+0];
n1 += facing[i+1];
n2 += facing[i+2];
n3 += facing[i+3];
}
for ( ; i < numFaces; i++ ) {
n0 += facing[i];
}
return n0 + n1 + n2 + n3;
#endif
}
/*
============
idSIMD_SSE2::ShadowVolume_CountFacingCull
============
*/
#pragma warning( disable : 4731 ) // frame pointer register 'ebx' modified by inline assembly code
int VPCALL idSIMD_SSE2::ShadowVolume_CountFacingCull( byte *facing, const int numFaces, const int *indexes, const byte *cull ) {
#if 1
ALIGN16( int n[4]; )
__asm {
push ebx
mov eax, numFaces
mov esi, indexes
mov edi, cull
mov ebx, facing
test eax, eax
jz done
add ebx, eax
neg eax
pxor xmm5, xmm5
pxor xmm6, xmm6
movdqa xmm7, SIMD_B_one
add eax, 4
jg run1
loop4:
mov ecx, dword ptr [esi+0*4]
movzx edx, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+1*4]
and dl, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+2*4]
and dl, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+3*4]
mov dh, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+4*4]
and dh, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+5*4]
and dh, byte ptr [edi+ecx]
movd xmm0, edx
mov ecx, dword ptr [esi+6*4]
movzx edx, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+7*4]
and dl, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+8*4]
and dl, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+9*4]
mov dh, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+10*4]
and dh, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+11*4]
and dh, byte ptr [edi+ecx]
pinsrw xmm0, edx, 1
add esi, 12*4
movd xmm1, [ebx+eax-4]
pcmpgtb xmm0, xmm6
pand xmm0, xmm7
por xmm1, xmm0
movd [ebx+eax-4], xmm1
add eax, 4
punpcklbw xmm1, xmm6
punpcklwd xmm1, xmm6
paddd xmm5, xmm1
jle loop4
run1:
sub eax, 4
jge done
loop1:
mov ecx, dword ptr [esi+0*4]
movzx edx, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+1*4]
and dl, byte ptr [edi+ecx]
mov ecx, dword ptr [esi+2*4]
and dl, byte ptr [edi+ecx]
neg edx
shr edx, 31
movzx ecx, byte ptr [ebx+eax]
or ecx, edx
mov byte ptr [ebx+eax], cl
movd xmm0, ecx
paddd xmm5, xmm0
add esi, 3*4
add eax, 1
jl loop1
done:
pop ebx
movdqa dword ptr n, xmm5
}
return n[0] + n[1] + n[2] + n[3];
#else
int i, n;
n = 0;
for ( i = 0; i < numFaces - 3; i += 4 ) {
int c0 = cull[indexes[0]] & cull[indexes[ 1]] & cull[indexes[ 2]];
int c1 = cull[indexes[3]] & cull[indexes[ 4]] & cull[indexes[ 5]];
int c2 = cull[indexes[6]] & cull[indexes[ 7]] & cull[indexes[ 8]];
int c3 = cull[indexes[9]] & cull[indexes[10]] & cull[indexes[11]];
facing[i+0] |= ( (-c0) >> 31 ) & 1;
facing[i+1] |= ( (-c1) >> 31 ) & 1;
facing[i+2] |= ( (-c2) >> 31 ) & 1;
facing[i+3] |= ( (-c3) >> 31 ) & 1;
n += facing[i+0];
n += facing[i+1];
n += facing[i+2];
n += facing[i+3];
indexes += 12;
}
for ( ; i < numFaces; i++ ) {
int c = cull[indexes[0]] & cull[indexes[1]] & cull[indexes[2]];
c = ( (-c) >> 31 ) & 1;
facing[i] |= c;
n += facing[i];
indexes += 3;
}
return n;
#endif
}
/*
============
idSIMD_SSE2::ShadowVolume_CreateSilTriangles
============
*/
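// Shadow volume vertices come in near/far pairs at consecutive even/odd
// indexes, which is why the edge vertices are doubled ( v << 1 ) and the
// facing bits ( 0 or 1 ) are XORed in to select the windings of the two
// silhouette triangles without branching.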
int VPCALL idSIMD_SSE2::ShadowVolume_CreateSilTriangles( int *shadowIndexes, const byte *facing, const silEdge_s *silEdges, const int numSilEdges ) {
#if 1
int num;
__asm {
push ebx
mov eax, numSilEdges
mov ebx, shadowIndexes
mov esi, facing
mov edi, silEdges
shl eax, 4
jz done
add edi, eax
neg eax
shr ebx, 3
add eax, 4*16
jg run1
loop4:
mov ecx, dword ptr [edi+eax-4*16+0]
movzx ecx, byte ptr [esi+ecx]
movd xmm2, ecx
mov edx, dword ptr [edi+eax-4*16+4]
movzx edx, byte ptr [esi+edx]
pinsrw xmm2, edx, 2
movq xmm0, qword ptr [edi+eax-4*16+8]
paddd xmm0, xmm0 //
pshufd xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 1, 1 )
xor ecx, edx
pshufd xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
lea edx, [ecx*2+ecx]
pxor xmm0, xmm1
add edx, ebx
movlps qword ptr [ebx*8+0*4], xmm0
pxor xmm2, xmm0
movhps qword ptr [ebx*8+2*4], xmm0
movlps qword ptr [ebx*8+4*4], xmm2
mov ecx, dword ptr [edi+eax-3*16+0]
movzx ecx, byte ptr [esi+ecx]
movd xmm3, ecx
mov ebx, dword ptr [edi+eax-3*16+4]
movzx ebx, byte ptr [esi+ebx]
pinsrw xmm3, ebx, 2
movq xmm0, qword ptr [edi+eax-3*16+8]
paddd xmm0, xmm0 //
pshufd xmm1, xmm3, R_SHUFFLE_PS( 2, 0, 1, 1 )
xor ecx, ebx
pshufd xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
lea ebx, [ecx*2+ecx]
pxor xmm0, xmm1
add ebx, edx
movlps qword ptr [edx*8+0*4], xmm0
pxor xmm3, xmm0
movhps qword ptr [edx*8+2*4], xmm0
movlps qword ptr [edx*8+4*4], xmm3
mov ecx, dword ptr [edi+eax-2*16+0]
movzx ecx, byte ptr [esi+ecx]
movd xmm2, ecx
mov edx, dword ptr [edi+eax-2*16+4]
movzx edx, byte ptr [esi+edx]
pinsrw xmm2, edx, 2
movq xmm0, qword ptr [edi+eax-2*16+8]
paddd xmm0, xmm0 //
pshufd xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 1, 1 )
xor ecx, edx
pshufd xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
lea edx, [ecx*2+ecx]
pxor xmm0, xmm1
add edx, ebx
movlps qword ptr [ebx*8+0*4], xmm0
pxor xmm2, xmm0
movhps qword ptr [ebx*8+2*4], xmm0
movlps qword ptr [ebx*8+4*4], xmm2
mov ecx, dword ptr [edi+eax-1*16+0]
movzx ecx, byte ptr [esi+ecx]
movd xmm3, ecx
mov ebx, dword ptr [edi+eax-1*16+4]
movzx ebx, byte ptr [esi+ebx]
pinsrw xmm3, ebx, 2
movq xmm0, qword ptr [edi+eax-1*16+8]
paddd xmm0, xmm0 //
pshufd xmm1, xmm3, R_SHUFFLE_PS( 2, 0, 1, 1 )
xor ecx, ebx
pshufd xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
lea ebx, [ecx*2+ecx]
pxor xmm0, xmm1
add ebx, edx
movlps qword ptr [edx*8+0*4], xmm0
pxor xmm3, xmm0
movhps qword ptr [edx*8+2*4], xmm0
add eax, 4*16
movlps qword ptr [edx*8+4*4], xmm3
jle loop4
run1:
sub eax, 4*16
jge done
loop1:
mov ecx, dword ptr [edi+eax+0]
movzx ecx, byte ptr [esi+ecx]
movd xmm2, ecx
mov edx, dword ptr [edi+eax+4]
movzx edx, byte ptr [esi+edx]
pinsrw xmm2, edx, 2
movq xmm0, qword ptr [edi+eax+8]
paddd xmm0, xmm0 //
pshufd xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 1, 1 )
pshufd xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
pxor xmm0, xmm1
movlps qword ptr [ebx*8+0*4], xmm0
movhps qword ptr [ebx*8+2*4], xmm0
pxor xmm2, xmm0
movlps qword ptr [ebx*8+4*4], xmm2
xor ecx, edx
lea edx, [ecx*2+ecx]
add ebx, edx
add eax, 16
jl loop1
done:
shl ebx, 3
mov num, ebx
pop ebx
}
return ( num - (int)shadowIndexes ) >> 2;
#else
int i;
const silEdge_t *sil;
int *si;
si = shadowIndexes;
for ( sil = silEdges, i = numSilEdges; i > 0; i--, sil++ ) {
int f1 = facing[sil->p1];
int f2 = facing[sil->p2];
if ( !( f1 ^ f2 ) ) {
continue;
}
int v1 = sil->v1 << 1;
int v2 = sil->v2 << 1;
// set the two triangle winding orders based on facing
// without using a poorly-predictable branch
si[0] = v1;
si[1] = v2 ^ f1;
si[2] = v2 ^ f2;
si[3] = v1 ^ f2;
si[4] = v1 ^ f1;
si[5] = v2 ^ 1;
si += 6;
}
return si - shadowIndexes;
#endif
}
/*
============
idSIMD_SSE2::ShadowVolume_CreateCapTriangles
============
*/
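// The output pointer is tracked in 8 byte units ( shr ebx, 3 below ): a rear
// cap emits six indexes = 24 bytes = 3 units, so ( facing - 1 ) & 3 advances
// the write position by 3 for a non-facing triangle and by 0 for a skipped one.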
int VPCALL idSIMD_SSE2::ShadowVolume_CreateCapTriangles( int *shadowIndexes, const byte *facing, const int *indexes, const int numIndexes ) {
#if 1
int num = numIndexes / 3;
__asm {
push ebx
mov eax, numIndexes
mov ebx, shadowIndexes
mov esi, facing
mov edi, indexes
shl eax, 2
jz done
add edi, eax
mov eax, num
add esi, eax
neg eax
shr ebx, 3
movaps xmm6, SIMD_DW_capTris_c0
movaps xmm7, SIMD_DW_capTris_c1
movaps xmm5, SIMD_DW_capTris_c2
add eax, 4
lea edx, [eax*2+eax]
jg run1
loop4:
movdqa xmm0, [edi+edx*4-4*3*4+0] // xmm0 = 0, 1, 2, 3
paddd xmm0, xmm0
pshufd xmm1, xmm0, R_SHUFFLE_PS( 2, 1, 0, 0 ) // xmm1 = 2, 1, 0, 0
movzx ecx, byte ptr [esi+eax-4]
pshufd xmm2, xmm0, R_SHUFFLE_PS( 1, 2, 1, 2 ) // xmm2 = 1, 2, 1, 2
sub ecx, 1
pxor xmm1, xmm6
and ecx, 3
movlps qword ptr [ebx*8+0*4], xmm1
add ecx, ebx
movhps qword ptr [ebx*8+2*4], xmm1
pxor xmm2, xmm7
movlps qword ptr [ebx*8+4*4], xmm2
movdqa xmm3, [edi+edx*4-3*3*4+4] // xmm3 = 4, 5, 6, 7
paddd xmm3, xmm3
shufps xmm0, xmm3, R_SHUFFLE_PS( 3, 3, 1, 0 ) // xmm0 = 3 3, 5, 4
movzx ebx, byte ptr [esi+eax-3]
movdqa xmm2, xmm3 // xmm2 = 4, 5, 6, 7
sub ebx, 1
pxor xmm0, xmm5
and ebx, 3
movhps qword ptr [ecx*8+0*4], xmm0
add ebx, ecx
movlps qword ptr [ecx*8+2*4], xmm0
pxor xmm2, xmm7
movlps qword ptr [ecx*8+4*4], xmm2
movdqa xmm0, [edi+edx*4-1*3*4-4] // xmm0 = 8, 9, 10, 11
paddd xmm0, xmm0
shufps xmm3, xmm0, R_SHUFFLE_PS( 2, 3, 0, 1 ) // xmm3 = 6, 7, 8, 9
pshufd xmm1, xmm3, R_SHUFFLE_PS( 2, 1, 0, 0 ) // xmm1 = 8, 7, 6, 6
movzx ecx, byte ptr [esi+eax-2]
pshufd xmm2, xmm3, R_SHUFFLE_PS( 1, 2, 1, 2 ) // xmm2 = 7, 8, 7, 8
sub ecx, 1
pxor xmm1, xmm6
and ecx, 3
movlps qword ptr [ebx*8+0*4], xmm1
add ecx, ebx
movhps qword ptr [ebx*8+2*4], xmm1
pxor xmm2, xmm7
movlps qword ptr [ebx*8+4*4], xmm2
pshufd xmm1, xmm0, R_SHUFFLE_PS( 3, 2, 1, 1 )
movzx ebx, byte ptr [esi+eax-1]
pshufd xmm2, xmm0, R_SHUFFLE_PS( 2, 3, 2, 3 )
sub ebx, 1
pxor xmm1, xmm6
and ebx, 3
movlps qword ptr [ecx*8+0*4], xmm1
add ebx, ecx
movhps qword ptr [ecx*8+2*4], xmm1
pxor xmm2, xmm7
movlps qword ptr [ecx*8+4*4], xmm2
add edx, 3*4
add eax, 4
jle loop4
run1:
sub eax, 4
jge done
loop1:
lea edx, [eax*2+eax]
movdqu xmm0, [edi+edx*4+0]
paddd xmm0, xmm0
pshufd xmm1, xmm0, R_SHUFFLE_PS( 2, 1, 0, 0 )
pshufd xmm2, xmm0, R_SHUFFLE_PS( 1, 2, 1, 2 )
pxor xmm1, xmm6
movlps qword ptr [ebx*8+0*4], xmm1
pxor xmm2, xmm7
movhps qword ptr [ebx*8+2*4], xmm1
movzx ecx, byte ptr [esi+eax]
movlps qword ptr [ebx*8+4*4], xmm2
sub ecx, 1
and ecx, 3
add ebx, ecx
add eax, 1
jl loop1
done:
shl ebx, 3
mov num, ebx
pop ebx
}
return ( num - (int)shadowIndexes ) >> 2;
#else
int i, j;
int *si;
si = shadowIndexes;
for ( i = 0, j = 0; i < numIndexes; i += 3, j++ ) {
if ( facing[j] ) {
continue;
}
int i0 = indexes[i+0];
int i1 = indexes[i+1];
int i2 = indexes[i+2];
i0 += i0;
i1 += i1;
i2 += i2;
si[0] = i2;
si[1] = i1;
si[2] = i0;
si[3] = i0 ^ 1;
si[4] = i1 ^ 1;
si[5] = i2 ^ 1;
si += 6;
}
return si - shadowIndexes;
#endif
}
/*
============
idSIMD_SSE2::MixedSoundToSamples
============
*/
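// A scalar sketch of the loop below ( not a shipped fallback ): cvtps2dq
// rounds with the current MXCSR mode ( round to nearest by default ) and
// packssdw clamps to the signed 16 bit range.
//
// for ( int i = 0; i < numSamples; i++ ) {
//     int v = (int) mixBuffer[i];   // note: the C cast truncates where the SIMD code rounds
//     if ( v < -32768 ) v = -32768;
//     else if ( v > 32767 ) v = 32767;
//     samples[i] = (short) v;
// }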
void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
__asm {
mov eax, numSamples
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add edi, eax
neg eax
loop16:
movaps xmm0, [edi+eax+0*16]
movaps xmm1, [edi+eax+1*16]
movaps xmm2, [edi+eax+2*16]
movaps xmm3, [edi+eax+3*16]
add esi, 4*4*2
cvtps2dq xmm4, xmm0
cvtps2dq xmm5, xmm1
cvtps2dq xmm6, xmm2
cvtps2dq xmm7, xmm3
prefetchnta [edi+eax+128]
packssdw xmm4, xmm5
packssdw xmm6, xmm7
add eax, 4*16
movlps [esi-4*4*2], xmm4 // FIXME: should not use movlps/movhps to move integer data
movhps [esi-3*4*2], xmm4
movlps [esi-2*4*2], xmm6
movhps [esi-1*4*2], xmm6
jl loop16
}
}
#endif /* _WINDOWS */