2211 lines
58 KiB
C++
2211 lines
58 KiB
C++
|
|
#include "../precompiled.h"
|
|
#pragma hdrstop
|
|
|
|
#include "Simd_generic.h"
|
|
#include "Simd_MMX.h"
|
|
#include "Simd_SSE.h"
|
|
#include "Simd_SSE2.h"
|
|
#include "Simd_InstructionMacros.h"
|
|
|
|
|
|
//===============================================================
|
|
//
|
|
// SSE2 implementation of idSIMDProcessor
|
|
//
|
|
//===============================================================
|
|
|
|
#ifdef _WINDOWS
|
|
|
|
#include "Simd_InstructionMacros.h"
|
|
|
|
// Aligned SIMD constant tables used by the inline assembly below.
// The ALIGN*_INIT* macros (Simd_InstructionMacros.h) emit suitably aligned,
// lane-replicated static data so it can be used directly as a memory operand.

// 16-bit word constants (MMX)
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );

// single precision bit masks
ALIGN4_INIT4( unsigned long SIMD_SP_firstSignBit, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_ZERO, IEEE_SP_ZERO );	// sign bit in lane 0 only
ALIGN4_INIT1( unsigned long SIMD_SP_signBit, IEEE_SP_SIGN );		// sign bit in every lane
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, ~IEEE_SP_SIGN );		// AND with this to take the absolute value
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, ~IEEE_SP_INF );	// clears the exponent bits of the IEEE infinity pattern
ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );
ALIGN4_INIT4( unsigned long SIMD_SP_clearLast, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 );			// AND with this to zero lane 3
ALIGN4_INIT4( unsigned long SIMD_SP_clearFirstThree, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF );		// AND with this to keep only lane 3

// per-lane sign flips used by ConvertJointQuatsToJointMats
ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x0, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN );
ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x1, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN );
ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x2, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN );

// integer constants
ALIGN4_INIT4( unsigned long SIMD_DW_zero, 0, 0, 0, 0 );
ALIGN16_INIT1( unsigned char SIMD_B_one, 1 );

// index patterns for cap-triangle generation
ALIGN4_INIT4( unsigned long SIMD_DW_capTris_c0, 0, 0, 0, 1 );
ALIGN4_INIT4( unsigned long SIMD_DW_capTris_c1, 1, 1, 0, 0 );
ALIGN4_INIT4( unsigned long SIMD_DW_capTris_c2, 0, 1, 0, 0 );

// single precision floating point constants, replicated across all four lanes
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
ALIGN4_INIT1( float SIMD_SP_negInfinity, -idMath::INFINITY );
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE2::GetName( void ) const {
|
|
return "MMX & SSE & SSE2";
|
|
}
|
|
|
|
#if 0 // the SSE2 code is ungodly slow
|
|
|
|
/*
============
idSIMD_SSE2::MatX_LowerTriangularSolve

solves x in Lx = b for the n * n sub-matrix of L
if skip > 0 the first skip elements of x are assumed to be valid already
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed

NOTE: both switch statements below fall through on purpose: each case
computes one row and execution continues with the remaining rows.
The asm loop does each remaining row as a dot product of the row of L
with the already solved part of x, accumulated in double precision
(cvtps2pd/mulpd/addpd), then stores x[i] = b[i] - dot back as float.
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	int nc;
	const float *lptr;

	if ( skip >= n ) {
		return;
	}

	lptr = L[skip];
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		// pack ( n, skip ) into a single switch value; skip < n <= 7 so 3 bits hold the skip
		#define NSKIP( n, s ) ((n<<3)|(s&7))
		switch( NSKIP( n, skip ) ) {
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows (falls through so at least 4 rows are solved before the asm loop)
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			skip = 4;
	}

	lptr = L[skip];

	__asm {
		push		ebx
		mov			eax, skip				// eax = i
		shl			eax, 2					// eax = i*4
		mov			edx, n					// edx = n
		shl			edx, 2					// edx = n*4
		mov			esi, x					// esi = x
		mov			edi, lptr				// edi = lptr
		add			esi, eax
		add			edi, eax
		mov			ebx, b					// ebx = b
		// aligned
	looprow:							// one iteration per remaining row i
		mov			ecx, eax
		neg			ecx					// ecx = -i*4, walks back to the start of the row
		cvtps2pd	xmm0, [esi+ecx]		// first partial dot product, widened to double
		cvtps2pd	xmm2, [edi+ecx]
		mulpd		xmm0, xmm2
		cvtps2pd	xmm1, [esi+ecx+8]
		cvtps2pd	xmm3, [edi+ecx+8]
		mulpd		xmm1, xmm3
		add			ecx, 20*4
		jg			donedot16
	dot16:								// unrolled: 16 floats per iteration
		cvtps2pd	xmm2, [esi+ecx-(16*4)]
		cvtps2pd	xmm3, [edi+ecx-(16*4)]
		cvtps2pd	xmm4, [esi+ecx-(14*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(14*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(12*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(12*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(10*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(10*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(8*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(8*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(6*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(6*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm2, [esi+ecx-(4*4)]
		mulpd		xmm4, xmm5
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		addpd		xmm1, xmm4
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(2*4)]
		addpd		xmm0, xmm2
		add			ecx, 16*4
		mulpd		xmm4, xmm5
		addpd		xmm1, xmm4
		jle			dot16
	donedot16:
		sub			ecx, 8*4
		jg			donedot8
	dot8:								// at most one block of 8 floats
		cvtps2pd	xmm2, [esi+ecx-(8*4)]
		cvtps2pd	xmm3, [edi+ecx-(8*4)]
		cvtps2pd	xmm7, [esi+ecx-(6*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(6*4)]
		addpd		xmm0, xmm2
		cvtps2pd	xmm6, [esi+ecx-(4*4)]
		mulpd		xmm7, xmm5
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		addpd		xmm1, xmm7
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm6, xmm3
		cvtps2pd	xmm7, [edi+ecx-(2*4)]
		addpd		xmm0, xmm6
		add			ecx, 8*4
		mulpd		xmm4, xmm7
		addpd		xmm1, xmm4
	donedot8:
		sub			ecx, 4*4
		jg			donedot4
	dot4:								// at most one block of 4 floats
		cvtps2pd	xmm2, [esi+ecx-(4*4)]
		cvtps2pd	xmm3, [edi+ecx-(4*4)]
		cvtps2pd	xmm4, [esi+ecx-(2*4)]
		mulpd		xmm2, xmm3
		cvtps2pd	xmm5, [edi+ecx-(2*4)]
		addpd		xmm0, xmm2
		add			ecx, 4*4
		mulpd		xmm4, xmm5
		addpd		xmm1, xmm4
	donedot4:
		// horizontal add of the two double-precision accumulators
		addpd		xmm0, xmm1
		movaps		xmm1, xmm0
		shufpd		xmm1, xmm1, R_SHUFFLE_PD( 1, 0 )
		addsd		xmm0, xmm1
		// handle the 0-3 left over elements of the row
		sub			ecx, 4*4
		jz			dot0
		add			ecx, 4
		jz			dot1
		add			ecx, 4
		jz			dot2
	//dot3:
		cvtss2sd	xmm1, [esi-(3*4)]
		cvtss2sd	xmm2, [edi-(3*4)]
		mulsd		xmm1, xmm2
		addsd		xmm0, xmm1
	dot2:
		cvtss2sd	xmm3, [esi-(2*4)]
		cvtss2sd	xmm4, [edi-(2*4)]
		mulsd		xmm3, xmm4
		addsd		xmm0, xmm3
	dot1:
		cvtss2sd	xmm5, [esi-(1*4)]
		cvtss2sd	xmm6, [edi-(1*4)]
		mulsd		xmm5, xmm6
		addsd		xmm0, xmm5
	dot0:
		// x[i] = b[i] - dot, stored back as single precision
		cvtss2sd	xmm1, [ebx+eax]
		subsd		xmm1, xmm0
		cvtsd2ss	xmm0, xmm1
		movss		[esi], xmm0
		add			eax, 4
		cmp			eax, edx
		jge			done
		add			esi, 4
		mov			ecx, nc
		shl			ecx, 2
		add			edi, ecx				// advance lptr to the next row
		add			edi, 4
		jmp			looprow
		// done
	done:
		pop			ebx
	}
}
|
|
|
|
/*
============
idSIMD_SSE2::MatX_LowerTriangularSolveTranspose

solves x in L'x = b for the n * n sub-matrix of L
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed

Back substitution runs from the last row towards the first. The asm paths
process 4 rows at a time with double-precision accumulation; the two
branches differ only in how they handle an odd row count for alignment.
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
	int nc;
	const float *lptr;

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[1] = b[1];
				x[0] = b[0] - lptr[1*nc+0] * x[1];
				return;
			case 3:
				x[2] = b[2];
				x[1] = b[1] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 4:
				x[3] = b[3];
				x[2] = b[2] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 5:
				x[4] = b[4];
				x[3] = b[3] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 6:
				x[5] = b[5];
				x[4] = b[4] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 7:
				x[6] = b[6];
				x[5] = b[5] - lptr[6*nc+5] * x[6];
				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
		}
		return;
	}

	int i, j, m;
	float *xptr;
	double s0;

	// if the number of columns is not a multiple of 2 we're screwed for alignment.
	// however, if the number of columns is a multiple of 2 but the number of to be
	// processed rows is not a multiple of 2 we can still run 8 byte aligned
	m = n;
	if ( m & 1 ) {
		// odd row count: peel off the last row, then process 4 rows per pass
		m--;
		x[m] = b[m];

		lptr = L[m] + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m				// eax = i
			mov			esi, xptr			// esi = xptr
			mov			edi, lptr			// edi = lptr
			mov			ebx, b				// ebx = b
			mov			edx, nc				// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows_1:
			cvtps2pd	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			cvtps2pd	xmm2, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			xor			ecx, ecx
			sub			eax, m
			neg			eax
			jz			done4x4_1
		process4x4_1:	// process 4x4 blocks
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+0]
			shufpd		xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+4]
			shufpd		xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm1, xmm7
			cvtps2pd	xmm3, [edi]
			mulpd		xmm6, xmm7
			cvtps2pd	xmm4, [edi+8]
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+8]
			shufpd		xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+12]
			shufpd		xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm1, xmm7
			add			ecx, 4
			mulpd		xmm6, xmm7
			cmp			ecx, eax
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			jl			process4x4_1
		done4x4_1:		// process left over of the 4 rows (the peeled odd row)
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			cvtss2sd	xmm5, [esi+4*ecx]
			shufpd		xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm3, xmm5
			mulpd		xmm4, xmm5
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			imul		ecx, edx
			sub			edi, ecx
			neg			eax

			// back substitute within the 4x4 diagonal block
			add			eax, m
			sub			eax, 4
			movapd		xmm1, xmm0
			shufpd		xmm1, xmm1, R_SHUFFLE_PD( 1, 1 )
			movapd		xmm3, xmm2
			shufpd		xmm3, xmm3, R_SHUFFLE_PD( 1, 1 )
			sub			edi, edx
			cvtsd2ss	xmm7, xmm3
			movss		[esi-4], xmm7			// xptr[-1] = s3
			movsd		xmm4, xmm3
			movsd		xmm5, xmm3
			cvtss2sd	xmm7, [edi+8]
			mulsd		xmm3, xmm7				// lptr[-1*nc+2] * s3
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm4, xmm7				// lptr[-1*nc+1] * s3
			cvtss2sd	xmm7, [edi]
			mulsd		xmm5, xmm7				// lptr[-1*nc+0] * s3
			subsd		xmm2, xmm3
			cvtsd2ss	xmm7, xmm2
			movss		[esi-8], xmm7			// xptr[-2] = s2
			movsd		xmm6, xmm2
			sub			edi, edx
			subsd		xmm0, xmm5
			subsd		xmm1, xmm4
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm2, xmm7				// lptr[-2*nc+1] * s2
			cvtss2sd	xmm7, [edi]
			mulsd		xmm6, xmm7				// lptr[-2*nc+0] * s2
			subsd		xmm1, xmm2
			cvtsd2ss	xmm7, xmm1
			movss		[esi-12], xmm7			// xptr[-3] = s1
			subsd		xmm0, xmm6
			sub			edi, edx
			cmp			eax, 4
			cvtss2sd	xmm7, [edi]
			mulsd		xmm1, xmm7				// lptr[-3*nc+0] * s1
			subsd		xmm0, xmm1
			cvtsd2ss	xmm7, xmm0
			movss		[esi-16], xmm7			// xptr[-4] = s0
			jl			done4rows_1
			sub			edi, edx
			sub			edi, 16
			sub			esi, 16
			jmp			process4rows_1
		done4rows_1:
			pop			ebx
		}
	}
	else {
		// even row count: process 4 rows per pass, no peeled row
		lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m				// eax = i
			mov			esi, xptr			// esi = xptr
			mov			edi, lptr			// edi = lptr
			mov			ebx, b				// ebx = b
			mov			edx, nc				// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows:
			cvtps2pd	xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			cvtps2pd	xmm2, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			sub			eax, m
			jz			done4x4
			neg			eax
			xor			ecx, ecx
		process4x4:		// process 4x4 blocks
			cvtps2pd	xmm3, [edi]
			cvtps2pd	xmm4, [edi+8]
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+0]
			shufpd		xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+4]
			shufpd		xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm1, xmm7
			cvtps2pd	xmm3, [edi]
			mulpd		xmm6, xmm7
			cvtps2pd	xmm4, [edi+8]
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			add			edi, edx
			cvtss2sd	xmm5, [esi+4*ecx+8]
			shufpd		xmm5, xmm5, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm3, xmm5
			cvtps2pd	xmm1, [edi]
			mulpd		xmm4, xmm5
			cvtps2pd	xmm6, [edi+8]
			subpd		xmm0, xmm3
			subpd		xmm2, xmm4
			add			edi, edx
			cvtss2sd	xmm7, [esi+4*ecx+12]
			shufpd		xmm7, xmm7, R_SHUFFLE_PD( 0, 0 )
			mulpd		xmm1, xmm7
			add			ecx, 4
			mulpd		xmm6, xmm7
			cmp			ecx, eax
			subpd		xmm0, xmm1
			subpd		xmm2, xmm6
			jl			process4x4
			imul		ecx, edx
			sub			edi, ecx
			neg			eax
		done4x4:		// process left over of the 4 rows
			add			eax, m
			sub			eax, 4
			movapd		xmm1, xmm0
			shufpd		xmm1, xmm1, R_SHUFFLE_PD( 1, 1 )
			movapd		xmm3, xmm2
			shufpd		xmm3, xmm3, R_SHUFFLE_PD( 1, 1 )
			sub			edi, edx
			cvtsd2ss	xmm7, xmm3
			movss		[esi-4], xmm7			// xptr[-1] = s3
			movsd		xmm4, xmm3
			movsd		xmm5, xmm3
			cvtss2sd	xmm7, [edi+8]
			mulsd		xmm3, xmm7				// lptr[-1*nc+2] * s3
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm4, xmm7				// lptr[-1*nc+1] * s3
			cvtss2sd	xmm7, [edi]
			mulsd		xmm5, xmm7				// lptr[-1*nc+0] * s3
			subsd		xmm2, xmm3
			cvtsd2ss	xmm7, xmm2
			movss		[esi-8], xmm7			// xptr[-2] = s2
			movsd		xmm6, xmm2
			sub			edi, edx
			subsd		xmm0, xmm5
			subsd		xmm1, xmm4
			cvtss2sd	xmm7, [edi+4]
			mulsd		xmm2, xmm7				// lptr[-2*nc+1] * s2
			cvtss2sd	xmm7, [edi]
			mulsd		xmm6, xmm7				// lptr[-2*nc+0] * s2
			subsd		xmm1, xmm2
			cvtsd2ss	xmm7, xmm1
			movss		[esi-12], xmm7			// xptr[-3] = s1
			subsd		xmm0, xmm6
			sub			edi, edx
			cmp			eax, 4
			cvtss2sd	xmm7, [edi]
			mulsd		xmm1, xmm7				// lptr[-3*nc+0] * s1
			subsd		xmm0, xmm1
			cvtsd2ss	xmm7, xmm0
			movss		[esi-16], xmm7			// xptr[-4] = s0
			jl			done4rows
			sub			edi, edx
			sub			edi, 16
			sub			esi, 16
			jmp			process4rows
		done4rows:
			pop			ebx
		}
	}

	// process left over rows
	for ( i = (m&3)-1; i >= 0; i-- ) {
		s0 = b[i];
		lptr = L[i+1] + i;
		for ( j = i + 1; j < m; j++ ) {
			s0 -= lptr[0] * x[j];
			lptr += nc;
		}
		x[i] = s0;
	}
}
|
|
|
|
#endif
|
|
|
|
/*
============
idSIMD_SSE2::ConvertJointQuatsToJointMats

Converts an array of joint quaternions (unit rotation quat plus translation)
into an array of 3x4 row-major joint matrices, one matrix per joint.
Both arrays must be 16 byte aligned.
============
*/
void VPCALL idSIMD_SSE2::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
#if 1

	assert_16_byte_aligned( jointMats );
	assert_16_byte_aligned( jointQuats );

	__asm {
		mov			eax, numJoints
		shl			eax, JOINTQUAT_SIZE_SHIFT
		mov			esi, jointQuats
		mov			edi, jointMats

		add			esi, eax
		neg			eax						// eax counts up from -numJoints*JOINTQUAT_SIZE to 0
		jz			done

	loopQuat:
		movaps		xmm0, [esi+eax+JOINTQUAT_Q_OFFSET]	// xmm0 = q.x, q.y, q.z, q.w
		movaps		xmm6, [esi+eax+JOINTQUAT_T_OFFSET]	// xmm6 = t.x, t.y, t.z, w

		add			edi, JOINTMAT_SIZE

		movaps		xmm1, xmm0							// xmm1 = x, y, z, w
		addps		xmm1, xmm1							// xmm1 = x2, y2, z2, w2

		add			eax, JOINTQUAT_SIZE

		// calculate the 9 products
		pshufd		xmm2, xmm0, R_SHUFFLE_D( 1, 0, 0, 1 )	// xmm2 = y, x, x, y
		pshufd		xmm3, xmm1, R_SHUFFLE_D( 1, 1, 2, 2 )	// xmm3 = y2, y2, z2, z2
		mulps		xmm2, xmm3								// xmm2 = yy2, xy2, xz2, yz2

		pshufd		xmm4, xmm0, R_SHUFFLE_D( 2, 3, 3, 3 )	// xmm4 = z, w, w, w
		pshufd		xmm5, xmm1, R_SHUFFLE_D( 2, 2, 1, 0 )	// xmm5 = z2, z2, y2, x2
		mulps		xmm4, xmm5								// xmm4 = zz2, wz2, wy2, wx2

		mulss		xmm0, xmm1								// xmm0 = xx2, y2, z2, w2 (only lane 0 used)

		// calculate the last two elements of the third row
		movss		xmm7, SIMD_SP_one						// xmm7 = 1, 0, 0, 0
		subss		xmm7, xmm0								// xmm7 = -xx2+1, 0, 0, 0
		subss		xmm7, xmm2								// xmm7 = -xx2-yy2+1, 0, 0, 0
		shufps		xmm7, xmm6, R_SHUFFLE_PS( 0, 1, 2, 3 )	// xmm7 = -xx2-yy2+1, 0, t.z, w

		// calcluate first row
		xorps		xmm2, SIMD_SP_quat2mat_x0				// xmm2 = yy2, -xy2, -xz2, -yz2
		xorps		xmm4, SIMD_SP_quat2mat_x1				// xmm4 = -zz2, wz2, -wy2, -wx2
		addss		xmm4, SIMD_SP_one						// xmm4 = -zz2+1, wz2, -wy2, -wx2
		movaps		xmm3, xmm4								// xmm3 = -zz2+1, wz2, -wy2, -wx2
		subps		xmm3, xmm2								// xmm3 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2
		movaps		[edi-JOINTMAT_SIZE+0*16+0*4], xmm3		// row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2
		movss		[edi-JOINTMAT_SIZE+0*16+3*4], xmm6		// row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, t.x

		// calculate second row
		movss		xmm2, xmm0								// xmm2 = xx2, -xy2, -xz2, -yz2
		xorps		xmm4, SIMD_SP_quat2mat_x2				// xmm4 = -zz2+1, -wz2, wy2, wx2
		subps		xmm4, xmm2								// xmm4 = -xx2-zz2+1, xy2-wz2, xz2+wy2, yz2+wx2
		shufps		xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 )	// xmm6 = t.y, t.z, w, t.x
		shufps		xmm4, xmm4, R_SHUFFLE_PS( 1, 0, 3, 2 )	// xmm4 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2
		movaps		[edi-JOINTMAT_SIZE+1*16+0*4], xmm4		// row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2
		movss		[edi-JOINTMAT_SIZE+1*16+3*4], xmm6		// row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, t.y

		// calculate third row
		movhlps		xmm3, xmm4								// xmm3 = yz2+wx2, xz2+wy2, xz2-wy2, yz2-wx2
		shufps		xmm3, xmm7, R_SHUFFLE_PS( 1, 3, 0, 2 )	// xmm3 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z
		movaps		[edi-JOINTMAT_SIZE+2*16+0*4], xmm3		// row2 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z

		jl			loopQuat

	done:
	}

#else

	// C reference implementation of the asm above.
	for ( int i = 0; i < numJoints; i++ ) {
		// NOTE(review): treats the quat as a flat float array; q[4..6] reach
		// past the quat into the translation — assumes idJointQuat lays out
		// q immediately followed by t with no padding. Confirm before enabling.
		const float *q = &jointQuats[i].q;
		float *m = jointMats[i].mat;

		float x2 = q[0] + q[0];
		float y2 = q[1] + q[1];
		float z2 = q[2] + q[2];
		float w2 = q[3] + q[3];

		float yy2 = q[1] * y2;
		float xy2 = q[0] * y2;
		float xz2 = q[0] * z2;
		float yz2 = q[1] * z2;

		float zz2 = q[2] * z2;
		float wz2 = q[3] * z2;
		float wy2 = q[3] * y2;
		float wx2 = q[3] * x2;

		float xx2 = q[0] * x2;

		m[0*4+0] = - yy2 - zz2 + 1.0f;
		m[0*4+1] = xy2 + wz2;
		m[0*4+2] = xz2 - wy2;
		m[0*4+3] = q[4];

		m[1*4+0] = xy2 - wz2;
		m[1*4+1] = - xx2 - zz2 + 1.0f;
		m[1*4+2] = yz2 + wx2;
		m[1*4+3] = q[5];

		m[2*4+0] = xz2 + wy2;
		m[2*4+1] = yz2 - wx2;
		m[2*4+2] = - xx2 - yy2 + 1.0f;
		m[2*4+3] = q[6];
	}

#endif
}
|
|
|
|
/*
============
idSIMD_SSE2::MultiplyJoints (idJointQuat variant)

NOTE(review): the entire implementation below is commented out. If it is
ever re-enabled, note that it scales the loop counter with JOINTMAT_SIZE
(`imul eax, JOINTMAT_SIZE`) while advancing it by JOINTQUAT_SIZE per
iteration — these strides must agree before the code can work.
============
*/
|
|
/*
|
|
ALIGN4_INIT4( unsigned long SIMD_SP_quatmul_xor0, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_SIGN );
|
|
ALIGN4_INIT4( unsigned long SIMD_SP_quatmul_xor1, IEEE_SP_ZERO, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN );
|
|
ALIGN4_INIT4( unsigned long SIMD_SP_quatmul_xor2, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_ZERO, IEEE_SP_SIGN );
|
|
|
|
void VPCALL idSIMD_SSE2::MultiplyJoints( idJointQuat *result, const idJointQuat *joints1, const idJointQuat *joints2, const int numJoints ) {
|
|
#if 1
|
|
|
|
assert_16_byte_aligned( result );
|
|
assert_16_byte_aligned( joints1 );
|
|
assert_16_byte_aligned( joints2 );
|
|
|
|
__asm {
|
|
|
|
mov eax, numJoints
|
|
test eax, eax
|
|
jz done
|
|
mov ecx, joints1
|
|
mov edx, joints2
|
|
mov edi, result
|
|
imul eax, JOINTMAT_SIZE
|
|
add ecx, eax
|
|
add edx, eax
|
|
add edi, eax
|
|
neg eax
|
|
|
|
loopQuat:
|
|
movaps xmm0, [ecx+eax] // Ax, Ay, Az, Aw 6/1
|
|
movaps xmm1, [edx+eax] // Bx, By, Bz, Bw 6/1
|
|
pshufd xmm2, xmm0, 0x00 // Ax, Ax, Ax, Ax 4/2
|
|
pshufd xmm3, xmm1, 0x1B // BW, BZ, BY, BX 4/2
|
|
pshufd xmm4, xmm0, 0x55 // Ay, Ay, Ay, Ay 4/2
|
|
pshufd xmm5, xmm1, 0x4E // Bz, Bw, Bx, By 4/2
|
|
mulps xmm2, xmm3 // Ax*Bw, Ax*Bz, Ax*By, Ax*Bx 6/2
|
|
pshufd xmm6, xmm0, 0xAA // Az, Az, Az, Az 4/2
|
|
mulps xmm4, xmm5 // Ay*Bz, Ay*Bw, Ay*Bx, Ay*By 6/2
|
|
pshufd xmm7, xmm1, 0xB1 // By, Bx, Bw, Bz 4/2
|
|
xorps xmm2, SIMD_SP_quatmul_xor0 // Ax*Bw, -Ax*Bz, Ax*By, -Ax*Bx 4/2
|
|
pshufd xmm0, xmm0, 0xFF // Aw, Aw, Aw, Aw 4/2
|
|
mulps xmm6, xmm7 // Az*By, Az*Bx, Az*Bw, Az*Bz 6/2
|
|
xorps xmm4, SIMD_SP_quatmul_xor1 // Ay*Bz, Ay*Bw, -Ay*Bx, -Ay*By 4/2
|
|
mulps xmm0, xmm1 // Aw*Bx, Aw*By, Aw*Bz, Aw*Bw 6/2
|
|
addps xmm2, xmm4 // Ax*Bw+Ay*Bz, -Ax*Bz+Ay*Bw, Ax*By-Ay*Bx, -Ax*Bx-Ay*By 4/2
|
|
xorps xmm6, SIMD_SP_quatmul_xor2 // -Az*By, Az*Bx, Az*Bw, -Az*Bz 4/2
|
|
addps xmm0, xmm2 // add to the result 4/2
|
|
addps xmm0, xmm6 // add to the result 4/2
|
|
movaps [edi+eax], xmm0 // save result 6/1
|
|
add eax, JOINTQUAT_SIZE
|
|
jl loopQuat
|
|
}
|
|
|
|
#else
|
|
|
|
int i;
|
|
|
|
for ( i = 0; i < numJoints; i++ ) {
|
|
result[i].q = joints1[i].q * joints2[i].q;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
*/
|
|
|
|
/*
============
idSIMD_SSE2::TransformJoints

Concatenates each joint matrix in [firstJoint, lastJoint] with its parent's
matrix: jointMats[i] *= jointMats[parents[i]]. Requires parents[i] < i so
every parent is transformed before its children. jointMats must be
16 byte aligned.
============
*/
void VPCALL idSIMD_SSE2::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1

	assert_16_byte_aligned( jointMats );

	__asm {

		mov			ecx, firstJoint
		mov			eax, lastJoint
		sub			eax, ecx
		jl			done						// nothing to do when lastJoint < firstJoint
		shl			ecx, 2						// ecx = firstJoint * 4
		mov			edi, parents
		add			edi, ecx					// edi = &parents[firstJoint]
		lea			ecx, [ecx+ecx*2]
		shl			ecx, 2						// ecx = firstJoint * JOINTMAT_SIZE
		mov			esi, jointMats				// esi = jointMats
		shl			eax, 2						// eax = ( lastJoint - firstJoint ) * 4
		add			edi, eax
		neg			eax

	loopJoint:

		mov			edx, [edi+eax]
		movaps		xmm0, [esi+ecx+ 0]			// xmm0 = m0, m1, m2, t0
		lea			edx, [edx+edx*2]
		movaps		xmm1, [esi+ecx+16]			// xmm1 = m3, m4, m5, t1
		shl			edx, 4						// edx = parents[i] * JOINTMAT_SIZE
		movaps		xmm2, [esi+ecx+32]			// xmm2 = m6, m7, m8, t2

		// row 0 of parent * joint
		movaps		xmm7, [esi+edx+ 0]
		pshufd		xmm4, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm4, xmm0
		pshufd		xmm5, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm5, xmm1
		addps		xmm4, xmm5

		add			ecx, JOINTMAT_SIZE
		add			eax, 4

		pshufd		xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm6, xmm2
		addps		xmm4, xmm6
		andps		xmm7, SIMD_SP_clearFirstThree	// keep only the parent's translation lane
		addps		xmm4, xmm7

		movaps		[esi+ecx-JOINTMAT_SIZE+ 0], xmm4

		// row 1 of parent * joint
		movaps		xmm3, [esi+edx+16]
		pshufd		xmm5, xmm3, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm5, xmm0
		pshufd		xmm6, xmm3, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm6, xmm1
		addps		xmm5, xmm6
		pshufd		xmm4, xmm3, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm4, xmm2
		addps		xmm5, xmm4
		andps		xmm3, SIMD_SP_clearFirstThree
		addps		xmm5, xmm3

		movaps		[esi+ecx-JOINTMAT_SIZE+16], xmm5

		// row 2 of parent * joint
		movaps		xmm7, [esi+edx+32]
		pshufd		xmm6, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm6, xmm0
		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm4, xmm1
		addps		xmm6, xmm4
		pshufd		xmm3, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm3, xmm2
		addps		xmm6, xmm3
		andps		xmm7, SIMD_SP_clearFirstThree
		addps		xmm6, xmm7

		movaps		[esi+ecx-JOINTMAT_SIZE+32], xmm6

		jle			loopJoint
	done:
	}

#else

	int i;

	for( i = firstJoint; i <= lastJoint; i++ ) {
		assert( parents[i] < i );
		jointMats[i] *= jointMats[parents[i]];
	}

#endif
}
|
|
|
|
/*
============
idSIMD_SSE2::UntransformJoints

Inverse of TransformJoints: removes the parent transform from each joint
matrix, walking from lastJoint back to firstJoint so children are
untransformed before their parents. Requires parents[i] < i.
jointMats must be 16 byte aligned.
============
*/
void VPCALL idSIMD_SSE2::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1

	assert_16_byte_aligned( jointMats );

	__asm {

		mov			edx, firstJoint
		mov			eax, lastJoint
		mov			ecx, eax
		sub			eax, edx
		jl			done						// nothing to do when lastJoint < firstJoint
		mov			esi, jointMats				// esi = jointMats
		lea			ecx, [ecx+ecx*2]
		shl			ecx, 4						// ecx = lastJoint * JOINTMAT_SIZE
		shl			edx, 2
		mov			edi, parents
		add			edi, edx					// edi = &parents[firstJoint]
		shl			eax, 2						// eax = ( lastJoint - firstJoint ) * 4

	loopJoint:

		mov			edx, [edi+eax]
		movaps		xmm0, [esi+ecx+ 0]			// xmm0 = m0, m1, m2, t0
		lea			edx, [edx+edx*2]
		movaps		xmm1, [esi+ecx+16]			// xmm1 = m3, m4, m5, t1
		shl			edx, 4						// edx = parents[i] * JOINTMAT_SIZE
		movaps		xmm2, [esi+ecx+32]			// xmm2 = m6, m7, m8, t2

		// subtract the parent's translation from each row's translation lane
		movss		xmm7, [esi+edx+12]
		shufps		xmm7, xmm7, R_SHUFFLE_PS( 1, 2, 3, 0 )
		subps		xmm0, xmm7
		movss		xmm6, [esi+edx+28]
		shufps		xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 )
		subps		xmm1, xmm6
		movss		xmm5, [esi+edx+44]
		shufps		xmm5, xmm5, R_SHUFFLE_PS( 1, 2, 3, 0 )
		subps		xmm2, xmm5

		sub			ecx, JOINTMAT_SIZE
		sub			eax, 4

		// multiply by the transpose of the parent's rotation
		movaps		xmm7, [esi+edx+ 0]

		pshufd		xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm3, xmm0
		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm4, xmm0
		pshufd		xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm5, xmm0

		movaps		xmm7, [esi+edx+16]

		pshufd		xmm0, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm0, xmm1
		addps		xmm3, xmm0
		pshufd		xmm6, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm6, xmm1
		addps		xmm4, xmm6
		pshufd		xmm0, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm0, xmm1
		addps		xmm5, xmm0

		movaps		xmm7, [esi+edx+32]

		pshufd		xmm6, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm6, xmm2
		addps		xmm3, xmm6

		movaps		[esi+ecx+JOINTMAT_SIZE+ 0], xmm3

		pshufd		xmm1, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm1, xmm2
		addps		xmm4, xmm1

		movaps		[esi+ecx+JOINTMAT_SIZE+16], xmm4

		pshufd		xmm6, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm6, xmm2
		addps		xmm5, xmm6

		movaps		[esi+ecx+JOINTMAT_SIZE+32], xmm5

		jge			loopJoint
	done:
	}

#else

	int i;

	for( i = lastJoint; i >= firstJoint; i-- ) {
		assert( parents[i] < i );
		jointMats[i] /= jointMats[parents[i]];
	}

#endif
}
|
|
|
|
/*
============
idSIMD_SSE2::MultiplyJoints

Computes result[i] = joints1[i] * joints2[i] for numJoints 3x4 joint
matrices. result may alias either input (each result row is written only
after all reads of the corresponding source rows). All three arrays must
be 16 byte aligned.
============
*/
void VPCALL idSIMD_SSE2::MultiplyJoints( idJointMat *result, const idJointMat *joints1, const idJointMat *joints2, const int numJoints ) {
#if 1

	assert_16_byte_aligned( result );
	assert_16_byte_aligned( joints1 );
	assert_16_byte_aligned( joints2 );

	__asm {

		mov			eax, numJoints
		test		eax, eax
		jz			done
		mov			ecx, joints1
		mov			edx, joints2
		mov			edi, result
		imul		eax, JOINTMAT_SIZE
		add			ecx, eax
		add			edx, eax
		add			edi, eax
		neg			eax						// eax counts up from -numJoints*JOINTMAT_SIZE to 0

	loopJoint:

		// the three rows of joints2[i]
		movaps		xmm0, [edx+eax+ 0]
		movaps		xmm1, [edx+eax+16]
		movaps		xmm2, [edx+eax+32]

		// row 0 of joints1[i] * joints2[i]
		movaps		xmm7, [ecx+eax+ 0]
		pshufd		xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm3, xmm0
		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm4, xmm1
		addps		xmm3, xmm4

		add			eax, JOINTMAT_SIZE

		pshufd		xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm5, xmm2
		addps		xmm3, xmm5
		andps		xmm7, SIMD_SP_clearFirstThree	// keep only the translation lane of joints1
		addps		xmm3, xmm7

		movaps		[edi+eax-JOINTMAT_SIZE+0], xmm3

		// row 1
		movaps		xmm7, [ecx+eax-JOINTMAT_SIZE+16]
		pshufd		xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm3, xmm0
		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm4, xmm1
		addps		xmm3, xmm4
		pshufd		xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm5, xmm2
		addps		xmm3, xmm5
		andps		xmm7, SIMD_SP_clearFirstThree
		addps		xmm3, xmm7

		movaps		[edi+eax-JOINTMAT_SIZE+16], xmm3

		// row 2
		movaps		xmm7, [ecx+eax-JOINTMAT_SIZE+32]
		pshufd		xmm3, xmm7, R_SHUFFLE_D( 0, 0, 0, 0 )
		mulps		xmm3, xmm0
		pshufd		xmm4, xmm7, R_SHUFFLE_D( 1, 1, 1, 1 )
		mulps		xmm4, xmm1
		addps		xmm3, xmm4
		pshufd		xmm5, xmm7, R_SHUFFLE_D( 2, 2, 2, 2 )
		mulps		xmm5, xmm2
		addps		xmm3, xmm5
		andps		xmm7, SIMD_SP_clearFirstThree
		addps		xmm3, xmm7

		movaps		[edi+eax-JOINTMAT_SIZE+32], xmm3

		jl			loopJoint
	done:
	}

#else

	int i;

	for ( i = 0; i < numJoints; i++ ) {
		idJointMat::Multiply( result[i], joints1[i], joints2[i] );
	}

#endif
}
|
|
|
|
#pragma warning( disable : 4731 ) // frame pointer register 'ebx' modified by inline assembly code
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::TransformVertsNew
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::TransformVertsNew( idDrawVert *verts, const int numVerts, idBounds &bounds, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
	// Skins 'numVerts' vertex positions: each position is the sum over its
	// weights of ( base vector * joint matrix rows ).  The weight factor is
	// assumed to be pre-multiplied into the 'base' vectors, so the inner loops
	// only multiply and accumulate.  The bounds of the transformed positions
	// are accumulated as a side effect and written to 'bounds' at the end.
	// NOTE(review): assumes every vertex has at least one weight and that the
	// jointWeight_t stream terminates each vertex via nextVertexOffset ==
	// JOINTWEIGHT_SIZE -- confirm against the code that builds 'weights'.
	ALIGN16( float tmpMin[4] );
	ALIGN16( float tmpMax[4] );

	assert_16_byte_aligned( joints );
	assert_16_byte_aligned( base );

	__asm {
		push		ebx
		mov			eax, numVerts

		// initialize the bounds accumulators BEFORE the early out so that
		// numVerts == 0 writes a cleared ( inf, -inf ) bounds instead of
		// reading uninitialized stack memory at 'done'
		movaps		xmm6, SIMD_SP_infinity
		movaps		xmm7, SIMD_SP_negInfinity
		movaps		tmpMin, xmm6
		movaps		tmpMax, xmm7

		test		eax, eax
		jz			done
		imul		eax, DRAWVERT_SIZE

		mov			ecx, verts
		mov			edx, weights
		mov			esi, base
		mov			edi, joints

		add			ecx, eax				// ecx = end of vertex array
		neg			eax						// eax = negative byte offset, counts up to zero

	loopVert:
		// first weight of this vertex
		mov			ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
		movaps		xmm2, [esi]
		add			edx, JOINTWEIGHT_SIZE
		movaps		xmm0, xmm2
		add			esi, BASEVECTOR_SIZE
		movaps		xmm1, xmm2

		mulps		xmm0, [edi+ebx+ 0]		// xmm0 = m0, m1, m2, t0
		mulps		xmm1, [edi+ebx+16]		// xmm1 = m3, m4, m5, t1
		mulps		xmm2, [edi+ebx+32]		// xmm2 = m6, m7, m8, t2

		cmp			dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE

		je			doneWeight

	loopWeight:
		// accumulate any additional weights for the same vertex
		mov			ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
		movaps		xmm5, [esi]
		add			edx, JOINTWEIGHT_SIZE
		movaps		xmm3, xmm5
		add			esi, BASEVECTOR_SIZE
		movaps		xmm4, xmm5

		mulps		xmm3, [edi+ebx+ 0]		// xmm3 = m0, m1, m2, t0
		mulps		xmm4, [edi+ebx+16]		// xmm4 = m3, m4, m5, t1
		mulps		xmm5, [edi+ebx+32]		// xmm5 = m6, m7, m8, t2

		cmp			dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE

		addps		xmm0, xmm3
		addps		xmm1, xmm4
		addps		xmm2, xmm5

		jne			loopWeight

	doneWeight:
		add			eax, DRAWVERT_SIZE

		// horizontal add of the three row products into a single x, y, z result
		movaps		xmm6, xmm0				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm1				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm0, xmm1				// xmm0 = m2, m5, t0, t1
		addps		xmm6, xmm0				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm2				// xmm7 = m6, m7, m8, t2
		movlhps		xmm2, xmm6				// xmm2 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm2				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		// x and y are in the high quadword
		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6

		pshufd		xmm5, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm5 = m7+t2, m6+m8
		addss		xmm5, xmm6				// xmm5 = m6+m8+m7+t2
		movss		xmm6, xmm5

		// z
		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm5

		// accumulate bounds ( z in the low float, x/y in the high quadword )
		movaps		xmm7, xmm6
		minps		xmm7, tmpMin
		maxps		xmm6, tmpMax
		movaps		tmpMin, xmm7
		movaps		tmpMax, xmm6

		jl			loopVert

	done:
		pop			ebx
		mov			esi, bounds
		movaps		xmm6, tmpMin
		movaps		xmm7, tmpMax
		movhps		[esi+ 0], xmm6			// bounds[0].x, bounds[0].y
		movss		[esi+ 8], xmm6			// bounds[0].z
		movhps		[esi+12], xmm7			// bounds[1].x, bounds[1].y
		movss		[esi+20], xmm7			// bounds[1].z
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::TransformVertsAndTangents
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::TransformVertsAndTangents( idDrawVert *verts, const int numVerts, idBounds &bounds, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
	// Skins vertex position, normal and both tangents.  For each vertex the
	// weighted joint matrix rows are accumulated once ( xmm0..xmm2 ) and then
	// applied to the four base vectors ( position, normal, tangent0, tangent1 ).
	// Here the weight factor is read from the jointWeight_t stream and
	// broadcast, so 'base' holds plain (unweighted) vectors.  Bounds of the
	// transformed positions are accumulated and written to 'bounds'.
	ALIGN16( float tmpMin[4] );
	ALIGN16( float tmpMax[4] );

	assert_16_byte_aligned( joints );
	assert_16_byte_aligned( base );

	__asm {
		push		ebx
		mov			eax, numVerts

		// initialize the bounds accumulators BEFORE the early out so that
		// numVerts == 0 yields a cleared bounds instead of reading
		// uninitialized stack memory at 'done'
		movaps		xmm6, SIMD_SP_infinity
		movaps		xmm7, SIMD_SP_negInfinity
		movaps		tmpMin, xmm6
		movaps		tmpMax, xmm7

		test		eax, eax
		jz			done
		imul		eax, DRAWVERT_SIZE

		mov			ecx, verts
		mov			edx, weights
		mov			esi, base
		mov			edi, joints

		add			ecx, eax
		neg			eax

	loopVert:
		// first weight: broadcast the weight and scale the joint matrix rows
		movss		xmm0, [edx+JOINTWEIGHT_WEIGHT_OFFSET]
		mov			ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
		shufps		xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
		add			edx, JOINTWEIGHT_SIZE
		movaps		xmm1, xmm0
		movaps		xmm2, xmm0

		mulps		xmm0, [edi+ebx+ 0]		// xmm0 = m0, m1, m2, t0
		mulps		xmm1, [edi+ebx+16]		// xmm1 = m3, m4, m5, t1
		mulps		xmm2, [edi+ebx+32]		// xmm2 = m6, m7, m8, t2

		cmp			dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE

		je			doneWeight

	loopWeight:
		// accumulate additional weighted joint matrices for the same vertex
		movss		xmm3, [edx+JOINTWEIGHT_WEIGHT_OFFSET]
		mov			ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
		shufps		xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 )
		add			edx, JOINTWEIGHT_SIZE
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3

		mulps		xmm3, [edi+ebx+ 0]		// xmm3 = m0, m1, m2, t0
		mulps		xmm4, [edi+ebx+16]		// xmm4 = m3, m4, m5, t1
		mulps		xmm5, [edi+ebx+32]		// xmm5 = m6, m7, m8, t2

		cmp			dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE

		addps		xmm0, xmm3
		addps		xmm1, xmm4
		addps		xmm2, xmm5

		jne			loopWeight

	doneWeight:
		add			esi, 4*BASEVECTOR_SIZE	// four base vectors consumed per vertex
		add			eax, DRAWVERT_SIZE

		// transform vertex
		movaps		xmm3, [esi-4*BASEVECTOR_SIZE]
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3

		mulps		xmm3, xmm0
		mulps		xmm4, xmm1
		mulps		xmm5, xmm2

		movaps		xmm6, xmm3				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm4				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm3, xmm4				// xmm3 = m2, m5, t0, t1
		addps		xmm6, xmm3				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm5				// xmm7 = m6, m7, m8, t2
		movlhps		xmm5, xmm6				// xmm5 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm5				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2
		movss		xmm6, xmm7

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7

		// accumulate bounds of the transformed position
		movaps		xmm5, xmm6
		minps		xmm5, tmpMin
		maxps		xmm6, tmpMax
		movaps		tmpMin, xmm5
		movaps		tmpMax, xmm6

		// transform normal
		movaps		xmm3, [esi-3*BASEVECTOR_SIZE]
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3

		mulps		xmm3, xmm0
		mulps		xmm4, xmm1
		mulps		xmm5, xmm2

		movaps		xmm6, xmm3				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm4				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm3, xmm4				// xmm3 = m2, m5, t0, t1
		addps		xmm6, xmm3				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm5				// xmm7 = m6, m7, m8, t2
		movlhps		xmm5, xmm6				// xmm5 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm5				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7

		// transform first tangent
		movaps		xmm3, [esi-2*BASEVECTOR_SIZE]
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3

		mulps		xmm3, xmm0
		mulps		xmm4, xmm1
		mulps		xmm5, xmm2

		movaps		xmm6, xmm3				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm4				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm3, xmm4				// xmm3 = m2, m5, t0, t1
		addps		xmm6, xmm3				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm5				// xmm7 = m6, m7, m8, t2
		movlhps		xmm5, xmm6				// xmm5 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm5				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+8], xmm7

		// transform second tangent ( last use, so the accumulators in
		// xmm0..xmm2 may be destroyed here )
		movaps		xmm3, [esi-1*BASEVECTOR_SIZE]

		mulps		xmm0, xmm3
		mulps		xmm1, xmm3
		mulps		xmm2, xmm3

		movaps		xmm6, xmm0				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm1				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm0, xmm1				// xmm0 = m2, m5, t0, t1
		addps		xmm6, xmm0				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm2				// xmm7 = m6, m7, m8, t2
		movlhps		xmm2, xmm6				// xmm2 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm2				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+8], xmm7

		jl			loopVert

	done:
		pop			ebx
		mov			esi, bounds
		movaps		xmm6, tmpMin
		movaps		xmm7, tmpMax
		movhps		[esi+ 0], xmm6			// bounds[0].x, bounds[0].y
		movss		[esi+ 8], xmm6			// bounds[0].z
		movhps		[esi+12], xmm7			// bounds[1].x, bounds[1].y
		movss		[esi+20], xmm7			// bounds[1].z
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::TransformVertsAndTangentsFast
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::TransformVertsAndTangentsFast( idDrawVert *verts, const int numVerts, idBounds &bounds, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
	// Fast path of TransformVertsAndTangents: uses ONLY the first weight's
	// joint matrix for each vertex ( the remaining weights are skipped via
	// nextVertexOffset ), trading accuracy for speed.  Bounds of the
	// transformed positions are accumulated and written to 'bounds'.
	ALIGN16( float tmpMin[4] );
	ALIGN16( float tmpMax[4] );

	assert_16_byte_aligned( joints );
	assert_16_byte_aligned( base );

	__asm {
		push		ebx
		mov			eax, numVerts

		// initialize the bounds accumulators BEFORE the early out so that
		// numVerts == 0 yields a cleared bounds instead of reading
		// uninitialized stack memory at 'done'
		movaps		xmm6, SIMD_SP_infinity
		movaps		xmm7, SIMD_SP_negInfinity
		movaps		tmpMin, xmm6
		movaps		tmpMax, xmm7

		test		eax, eax
		jz			done
		imul		eax, DRAWVERT_SIZE

		mov			ecx, verts
		mov			edx, weights
		mov			esi, base
		mov			edi, joints

		add			ecx, eax
		neg			eax

	loopVert:
		mov			ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]

		add			esi, 4*BASEVECTOR_SIZE

		movaps		xmm0, [edi+ebx+ 0]		// xmm0 = m0, m1, m2, t0
		movaps		xmm1, [edi+ebx+16]		// xmm1 = m3, m4, m5, t1
		movaps		xmm2, [edi+ebx+32]		// xmm2 = m6, m7, m8, t2

		// skip the remaining weights of this vertex
		add			edx, dword ptr [edx+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET]

		add			eax, DRAWVERT_SIZE

		// transform vertex
		movaps		xmm3, [esi-4*BASEVECTOR_SIZE]
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3

		mulps		xmm3, xmm0
		mulps		xmm4, xmm1
		mulps		xmm5, xmm2

		movaps		xmm6, xmm3				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm4				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm3, xmm4				// xmm3 = m2, m5, t0, t1
		addps		xmm6, xmm3				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm5				// xmm7 = m6, m7, m8, t2
		movlhps		xmm5, xmm6				// xmm5 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm5				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2
		movss		xmm6, xmm7

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7

		// accumulate bounds of the transformed position
		movaps		xmm5, xmm6
		minps		xmm5, tmpMin
		maxps		xmm6, tmpMax
		movaps		tmpMin, xmm5
		movaps		tmpMax, xmm6

		// transform normal
		movaps		xmm3, [esi-3*BASEVECTOR_SIZE]
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3

		mulps		xmm3, xmm0
		mulps		xmm4, xmm1
		mulps		xmm5, xmm2

		movaps		xmm6, xmm3				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm4				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm3, xmm4				// xmm3 = m2, m5, t0, t1
		addps		xmm6, xmm3				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm5				// xmm7 = m6, m7, m8, t2
		movlhps		xmm5, xmm6				// xmm5 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm5				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7

		// transform first tangent
		movaps		xmm3, [esi-2*BASEVECTOR_SIZE]
		movaps		xmm4, xmm3
		movaps		xmm5, xmm3

		mulps		xmm3, xmm0
		mulps		xmm4, xmm1
		mulps		xmm5, xmm2

		movaps		xmm6, xmm3				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm4				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm3, xmm4				// xmm3 = m2, m5, t0, t1
		addps		xmm6, xmm3				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm5				// xmm7 = m6, m7, m8, t2
		movlhps		xmm5, xmm6				// xmm5 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm5				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT0_OFFSET+8], xmm7

		// transform second tangent ( last use, so the matrix rows in
		// xmm0..xmm2 may be destroyed here )
		movaps		xmm3, [esi-1*BASEVECTOR_SIZE]

		mulps		xmm0, xmm3
		mulps		xmm1, xmm3
		mulps		xmm2, xmm3

		movaps		xmm6, xmm0				// xmm6 = m0, m1, m2, t0
		unpcklps	xmm6, xmm1				// xmm6 = m0, m3, m1, m4
		unpckhps	xmm0, xmm1				// xmm0 = m2, m5, t0, t1
		addps		xmm6, xmm0				// xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

		movaps		xmm7, xmm2				// xmm7 = m6, m7, m8, t2
		movlhps		xmm2, xmm6				// xmm2 = m6, m7, m0+m2, m3+m5
		movhlps		xmm6, xmm7				// xmm6 = m8, t2, m1+t0, m4+t1
		addps		xmm6, xmm2				// xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

		movhps		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+0], xmm6

		pshufd		xmm7, xmm6, R_SHUFFLE_D( 1, 0, 2, 3 )	// xmm7 = m7+t2, m6+m8
		addss		xmm7, xmm6				// xmm7 = m6+m8+m7+t2

		movss		[ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT1_OFFSET+8], xmm7

		jl			loopVert

	done:
		pop			ebx
		mov			esi, bounds
		movaps		xmm6, tmpMin
		movaps		xmm7, tmpMax
		movhps		[esi+ 0], xmm6			// bounds[0].x, bounds[0].y
		movss		[esi+ 8], xmm6			// bounds[0].z
		movhps		[esi+12], xmm7			// bounds[1].x, bounds[1].y
		movss		[esi+20], xmm7			// bounds[1].z
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::ShadowVolume_CountFacing
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_SSE2::ShadowVolume_CountFacing( const byte *facing, const int numFaces ) {
#if 1

	// Sums the byte array 'facing' ( each entry expected to be 0 or 1 ) in
	// progressively smaller strides: 256 bytes, 16 bytes, 4 bytes, then one
	// byte at a time.  paddusb saturates, which is safe because within any
	// 256-byte group each lane accumulates at most 16 ones before being
	// widened to words/dwords.
	ALIGN16( int n[4]; )

	__asm {

		mov			eax, numFaces
		mov			edi, facing
		pxor		xmm6, xmm6
		pxor		xmm7, xmm7
		movdqa		n, xmm6				// zero the result up front so that
										// numFaces == 0 returns 0 instead of
										// reading uninitialized memory
		test		eax, eax
		jz			done

		sub			eax, 256
		jl			run16

	loop256:
		movdqa		xmm0, [edi+ 0*16]
		movdqa		xmm1, [edi+ 1*16]
		movdqa		xmm2, [edi+ 2*16]
		movdqa		xmm3, [edi+ 3*16]
		paddusb		xmm0, [edi+ 4*16]
		paddusb		xmm1, [edi+ 5*16]
		paddusb		xmm2, [edi+ 6*16]
		paddusb		xmm3, [edi+ 7*16]
		paddusb		xmm0, [edi+ 8*16]
		paddusb		xmm1, [edi+ 9*16]
		paddusb		xmm2, [edi+10*16]
		paddusb		xmm3, [edi+11*16]
		paddusb		xmm0, [edi+12*16]
		paddusb		xmm1, [edi+13*16]
		paddusb		xmm2, [edi+14*16]
		paddusb		xmm3, [edi+15*16]
		paddusb		xmm0, xmm1
		paddusb		xmm2, xmm3
		paddusb		xmm0, xmm2

		add			edi, 256
		sub			eax, 256

		// widen the byte sums to dwords and accumulate into xmm6
		movdqa		xmm1, xmm0
		punpcklbw	xmm0, xmm7
		punpckhbw	xmm1, xmm7
		paddusw		xmm0, xmm1
		movdqa		xmm1, xmm0
		punpcklwd	xmm0, xmm7
		punpckhwd	xmm1, xmm7
		paddd		xmm0, xmm1
		paddd		xmm6, xmm0

		jge			loop256

	run16:
		pxor		xmm0, xmm0
		add			eax, 256 - 16
		jl			run4

	loop16:
		paddusb		xmm0, [edi]
		add			edi, 16
		sub			eax, 16
		jge			loop16

	run4:
		add			eax, 16 - 4
		jl			run1

		pxor		xmm1, xmm1

	loop4:
		movd		xmm1, [edi]
		paddusb		xmm0, xmm1
		add			edi, 4
		sub			eax, 4
		jge			loop4

	run1:
		// widen the remaining byte sums and fold them into the dword totals
		movdqa		xmm1, xmm0
		punpcklbw	xmm0, xmm7
		punpckhbw	xmm1, xmm7
		paddusw		xmm0, xmm1
		movdqa		xmm1, xmm0
		punpcklwd	xmm0, xmm7
		punpckhwd	xmm1, xmm7
		paddd		xmm0, xmm1
		paddd		xmm6, xmm0

		movdqa		n, xmm6
		add			eax, 4-1
		jl			done

		mov			edx, dword ptr n

	loop1:
		movzx		ecx, byte ptr [edi]	// explicit size: movzx from memory is
										// ambiguous without an operand size
		add			edx, ecx
		add			edi, 1
		sub			eax, 1
		jge			loop1

		mov			dword ptr n, edx

	done:
	}

	return n[0] + n[1] + n[2] + n[3];

#else

	int i, n0, n1, n2, n3;

	n0 = n1 = n2 = n3 = 0;
	for ( i = 0; i < numFaces - 3; i += 4 ) {
		n0 += facing[i+0];
		n1 += facing[i+1];
		n2 += facing[i+2];
		n3 += facing[i+3];
	}
	for ( ; i < numFaces; i++ ) {
		n0 += facing[i];
	}
	return n0 + n1 + n2 + n3;

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::ShadowVolume_CountFacingCull
|
|
============
|
|
*/
|
|
#pragma warning( disable : 4731 ) // frame pointer register 'ebx' modified by inline assembly code
|
|
|
|
int VPCALL idSIMD_SSE2::ShadowVolume_CountFacingCull( byte *facing, const int numFaces, const int *indexes, const byte *cull ) {
#if 1

	// For each triangle, ORs a "fully culled" bit ( the AND of the cull bytes
	// of its three vertices, mapped to 0/1 ) into facing[] and returns the
	// total number of facing-or-culled triangles.  Mirrors the C fallback in
	// the #else branch below.
	ALIGN16( int n[4]; )

	__asm {
		push		ebx
		mov			eax, numFaces
		mov			esi, indexes
		mov			edi, cull
		mov			ebx, facing

		// zero the accumulator BEFORE the early out: 'done' stores xmm5 into
		// n, so numFaces == 0 must not leave xmm5 holding garbage
		pxor		xmm5, xmm5
		pxor		xmm6, xmm6

		test		eax, eax
		jz			done
		add			ebx, eax
		neg			eax

		movdqa		xmm7, SIMD_B_one

		add			eax, 4
		jg			run1

	loop4:
		// triangle 0: AND the three cull bytes into dl
		mov			ecx, dword ptr [esi+0*4]
		movzx		edx, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+1*4]
		and			dl, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+2*4]
		and			dl, byte ptr [edi+ecx]

		// triangle 1 into dh
		mov			ecx, dword ptr [esi+3*4]
		mov			dh, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+4*4]
		and			dh, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+5*4]
		and			dh, byte ptr [edi+ecx]
		movd		xmm0, edx

		// triangle 2 into dl
		mov			ecx, dword ptr [esi+6*4]
		movzx		edx, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+7*4]
		and			dl, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+8*4]
		and			dl, byte ptr [edi+ecx]

		// triangle 3 into dh
		mov			ecx, dword ptr [esi+9*4]
		mov			dh, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+10*4]
		and			dh, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+11*4]
		and			dh, byte ptr [edi+ecx]
		pinsrw		xmm0, edx, 1

		add			esi, 12*4

		// map non-zero cull results to 1 and OR into the four facing bytes
		movd		xmm1, [ebx+eax-4]
		pcmpgtb		xmm0, xmm6
		pand		xmm0, xmm7
		por			xmm1, xmm0
		movd		[ebx+eax-4], xmm1

		add			eax, 4

		// widen the four facing bytes to dwords and accumulate the count
		punpcklbw	xmm1, xmm6
		punpcklwd	xmm1, xmm6
		paddd		xmm5, xmm1

		jle			loop4

	run1:
		sub			eax, 4
		jge			done

	loop1:
		mov			ecx, dword ptr [esi+0*4]
		movzx		edx, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+1*4]
		and			dl, byte ptr [edi+ecx]
		mov			ecx, dword ptr [esi+2*4]
		and			dl, byte ptr [edi+ecx]

		// ( -c ) >> 31 maps any non-zero cull result to 1
		neg			edx
		shr			edx, 31
		movzx		ecx, byte ptr [ebx+eax]
		or			ecx, edx
		mov			byte ptr [ebx+eax], cl
		movd		xmm0, ecx
		paddd		xmm5, xmm0

		add			esi, 3*4
		add			eax, 1
		jl			loop1

	done:
		pop			ebx
		movdqa		dword ptr n, xmm5
	}

	return n[0] + n[1] + n[2] + n[3];

#else

	int i, n;

	n = 0;
	for ( i = 0; i < numFaces - 3; i += 4 ) {
		int c0 = cull[indexes[0]] & cull[indexes[ 1]] & cull[indexes[ 2]];
		int c1 = cull[indexes[3]] & cull[indexes[ 4]] & cull[indexes[ 5]];
		int c2 = cull[indexes[6]] & cull[indexes[ 7]] & cull[indexes[ 8]];
		int c3 = cull[indexes[9]] & cull[indexes[10]] & cull[indexes[11]];
		facing[i+0] |= ( (-c0) >> 31 ) & 1;
		facing[i+1] |= ( (-c1) >> 31 ) & 1;
		facing[i+2] |= ( (-c2) >> 31 ) & 1;
		facing[i+3] |= ( (-c3) >> 31 ) & 1;
		n += facing[i+0];
		n += facing[i+1];
		n += facing[i+2];
		n += facing[i+3];
		indexes += 12;
	}
	for ( ; i < numFaces; i++ ) {
		int c = cull[indexes[0]] & cull[indexes[1]] & cull[indexes[2]];
		c = ( (-c) >> 31 ) & 1;
		facing[i] |= c;
		n += facing[i];
		indexes += 3;
	}
	return n;

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::ShadowVolume_CreateSilTriangles
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_SSE2::ShadowVolume_CreateSilTriangles( int *shadowIndexes, const byte *facing, const silEdge_s *silEdges, const int numSilEdges ) {
#if 1

	// Emits two triangles ( 6 indexes ) for every silhouette edge whose two
	// adjacent faces differ in facing, branch-free: each edge always writes
	// 6 indexes, but the output pointer only advances ( by 6 * ( f1 ^ f2 ) )
	// when the edge is an actual silhouette.  Mirrors the C fallback below.
	// NOTE(review): the write addressing [ebx*8+...] with ebx = ptr >> 3
	// assumes shadowIndexes is 8-byte aligned -- confirm at the call site.
	int num;

	__asm {
		push		ebx
		mov			eax, numSilEdges
		mov			ebx, shadowIndexes
		mov			esi, facing
		mov			edi, silEdges
		shr			ebx, 3				// ebx = output pointer / 8; do this
										// BEFORE the early out so 'done'
										// ( which computes num = ebx << 3 )
										// returns 0 when numSilEdges == 0
		shl			eax, 4				// shl sets ZF last, so jz still works
		jz			done
		add			edi, eax
		neg			eax

		add			eax, 4*16
		jg			run1

	loop4:
		// sil edge 0
		mov			ecx, dword ptr [edi+eax-4*16+0]
		movzx		ecx, byte ptr [esi+ecx]
		movd		xmm2, ecx
		mov			edx, dword ptr [edi+eax-4*16+4]
		movzx		edx, byte ptr [esi+edx]
		pinsrw		xmm2, edx, 2
		movq		xmm0, qword ptr [edi+eax-4*16+8]
		paddd		xmm0, xmm0						// v1 *= 2, v2 *= 2
		pshufd		xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 1, 1 )
		xor			ecx, edx						// ecx = f1 ^ f2 ( 0 or 1 )
		pshufd		xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
		lea			edx, [ecx*2+ecx]				// advance by 3 qwords = 6 ints
		pxor		xmm0, xmm1
		add			edx, ebx
		movlps		qword ptr [ebx*8+0*4], xmm0
		pxor		xmm2, xmm0
		movhps		qword ptr [ebx*8+2*4], xmm0
		movlps		qword ptr [ebx*8+4*4], xmm2

		// sil edge 1
		mov			ecx, dword ptr [edi+eax-3*16+0]
		movzx		ecx, byte ptr [esi+ecx]
		movd		xmm3, ecx
		mov			ebx, dword ptr [edi+eax-3*16+4]
		movzx		ebx, byte ptr [esi+ebx]
		pinsrw		xmm3, ebx, 2
		movq		xmm0, qword ptr [edi+eax-3*16+8]
		paddd		xmm0, xmm0
		pshufd		xmm1, xmm3, R_SHUFFLE_PS( 2, 0, 1, 1 )
		xor			ecx, ebx
		pshufd		xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
		lea			ebx, [ecx*2+ecx]
		pxor		xmm0, xmm1
		add			ebx, edx
		movlps		qword ptr [edx*8+0*4], xmm0
		pxor		xmm3, xmm0
		movhps		qword ptr [edx*8+2*4], xmm0
		movlps		qword ptr [edx*8+4*4], xmm3

		// sil edge 2
		mov			ecx, dword ptr [edi+eax-2*16+0]
		movzx		ecx, byte ptr [esi+ecx]
		movd		xmm2, ecx
		mov			edx, dword ptr [edi+eax-2*16+4]
		movzx		edx, byte ptr [esi+edx]
		pinsrw		xmm2, edx, 2
		movq		xmm0, qword ptr [edi+eax-2*16+8]
		paddd		xmm0, xmm0
		pshufd		xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 1, 1 )
		xor			ecx, edx
		pshufd		xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
		lea			edx, [ecx*2+ecx]
		pxor		xmm0, xmm1
		add			edx, ebx
		movlps		qword ptr [ebx*8+0*4], xmm0
		pxor		xmm2, xmm0
		movhps		qword ptr [ebx*8+2*4], xmm0
		movlps		qword ptr [ebx*8+4*4], xmm2

		// sil edge 3
		mov			ecx, dword ptr [edi+eax-1*16+0]
		movzx		ecx, byte ptr [esi+ecx]
		movd		xmm3, ecx
		mov			ebx, dword ptr [edi+eax-1*16+4]
		movzx		ebx, byte ptr [esi+ebx]
		pinsrw		xmm3, ebx, 2
		movq		xmm0, qword ptr [edi+eax-1*16+8]
		paddd		xmm0, xmm0
		pshufd		xmm1, xmm3, R_SHUFFLE_PS( 2, 0, 1, 1 )
		xor			ecx, ebx
		pshufd		xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
		lea			ebx, [ecx*2+ecx]
		pxor		xmm0, xmm1
		add			ebx, edx
		movlps		qword ptr [edx*8+0*4], xmm0
		pxor		xmm3, xmm0
		movhps		qword ptr [edx*8+2*4], xmm0
		add			eax, 4*16
		movlps		qword ptr [edx*8+4*4], xmm3

		jle			loop4

	run1:
		sub			eax, 4*16
		jge			done

	loop1:
		mov			ecx, dword ptr [edi+eax+0]
		movzx		ecx, byte ptr [esi+ecx]
		movd		xmm2, ecx
		mov			edx, dword ptr [edi+eax+4]
		movzx		edx, byte ptr [esi+edx]
		pinsrw		xmm2, edx, 2
		movq		xmm0, qword ptr [edi+eax+8]
		paddd		xmm0, xmm0
		pshufd		xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 1, 1 )
		pshufd		xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 1, 0 )
		pxor		xmm0, xmm1
		movlps		qword ptr [ebx*8+0*4], xmm0
		movhps		qword ptr [ebx*8+2*4], xmm0
		pxor		xmm2, xmm0
		movlps		qword ptr [ebx*8+4*4], xmm2
		xor			ecx, edx
		lea			edx, [ecx*2+ecx]
		add			ebx, edx

		add			eax, 16
		jl			loop1

	done:
		shl			ebx, 3				// back to a byte pointer
		mov			num, ebx
		pop			ebx
	}

	// num is the final output byte address; convert to number of ints written
	return ( num - (int)shadowIndexes ) >> 2;

#else

	int i;
	const silEdge_t *sil;
	int *si;

	si = shadowIndexes;
	for ( sil = silEdges, i = numSilEdges; i > 0; i--, sil++ ) {

		int f1 = facing[sil->p1];
		int f2 = facing[sil->p2];

		if ( !( f1 ^ f2 ) ) {
			continue;
		}

		int v1 = sil->v1 << 1;
		int v2 = sil->v2 << 1;

		// set the two triangle winding orders based on facing
		// without using a poorly-predictable branch

		si[0] = v1;
		si[1] = v2 ^ f1;
		si[2] = v2 ^ f2;
		si[3] = v1 ^ f2;
		si[4] = v1 ^ f1;
		si[5] = v2 ^ 1;

		si += 6;
	}
	return si - shadowIndexes;

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::ShadowVolume_CreateCapTriangles
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_SSE2::ShadowVolume_CreateCapTriangles( int *shadowIndexes, const byte *facing, const int *indexes, const int numIndexes ) {
#if 1

	// Emits front and back cap triangles ( 6 indexes ) for every non-facing
	// triangle, branch-free: each triangle always writes 6 indexes, but the
	// output pointer only advances when facing[j] == 0 ( the
	// "sub ecx,1 / and ecx,3" sequence maps facing 0 -> advance 3 qwords,
	// facing 1 -> advance 0 ).  Mirrors the C fallback below.
	// NOTE(review): [ebx*8+...] addressing assumes shadowIndexes is 8-byte
	// aligned -- confirm at the call site.
	int num = numIndexes / 3;

	__asm {
		push		ebx
		mov			eax, numIndexes
		mov			ebx, shadowIndexes
		mov			esi, facing
		mov			edi, indexes
		shr			ebx, 3				// ebx = output pointer / 8; do this
										// BEFORE the early out so 'done'
										// ( num = ebx << 3 ) returns 0 when
										// numIndexes == 0
		shl			eax, 2				// shl sets ZF last, so jz still works
		jz			done
		add			edi, eax
		mov			eax, num
		add			esi, eax
		neg			eax

		movaps		xmm6, SIMD_DW_capTris_c0
		movaps		xmm7, SIMD_DW_capTris_c1
		movaps		xmm5, SIMD_DW_capTris_c2

		add			eax, 4
		lea			edx, [eax*2+eax]
		jg			run1

	loop4:
		// triangle 0
		movdqa		xmm0, [edi+edx*4-4*3*4+0]		// xmm0 = 0, 1, 2, 3
		paddd		xmm0, xmm0
		pshufd		xmm1, xmm0, R_SHUFFLE_PS( 2, 1, 0, 0 )	// xmm1 = 2, 1, 0, 0
		movzx		ecx, byte ptr [esi+eax-4]
		pshufd		xmm2, xmm0, R_SHUFFLE_PS( 1, 2, 1, 2 )	// xmm2 = 1, 2, 1, 2
		sub			ecx, 1
		pxor		xmm1, xmm6
		and			ecx, 3
		movlps		qword ptr [ebx*8+0*4], xmm1
		add			ecx, ebx
		movhps		qword ptr [ebx*8+2*4], xmm1
		pxor		xmm2, xmm7
		movlps		qword ptr [ebx*8+4*4], xmm2

		// triangle 1
		movdqa		xmm3, [edi+edx*4-3*3*4+4]		// xmm3 = 4, 5, 6, 7
		paddd		xmm3, xmm3
		shufps		xmm0, xmm3, R_SHUFFLE_PS( 3, 3, 1, 0 )	// xmm0 = 3 3, 5, 4
		movzx		ebx, byte ptr [esi+eax-3]
		movdqa		xmm2, xmm3						// xmm2 = 4, 5, 6, 7
		sub			ebx, 1
		pxor		xmm0, xmm5
		and			ebx, 3
		movhps		qword ptr [ecx*8+0*4], xmm0
		add			ebx, ecx
		movlps		qword ptr [ecx*8+2*4], xmm0
		pxor		xmm2, xmm7
		movlps		qword ptr [ecx*8+4*4], xmm2

		// triangle 2
		movdqa		xmm0, [edi+edx*4-1*3*4-4]		// xmm0 = 8, 9, 10, 11
		paddd		xmm0, xmm0
		shufps		xmm3, xmm0, R_SHUFFLE_PS( 2, 3, 0, 1 )	// xmm3 = 6, 7, 8, 9
		pshufd		xmm1, xmm3, R_SHUFFLE_PS( 2, 1, 0, 0 )	// xmm1 = 8, 7, 6, 6
		movzx		ecx, byte ptr [esi+eax-2]
		pshufd		xmm2, xmm3, R_SHUFFLE_PS( 1, 2, 1, 2 )	// xmm2 = 7, 8, 7, 8
		sub			ecx, 1
		pxor		xmm1, xmm6
		and			ecx, 3
		movlps		qword ptr [ebx*8+0*4], xmm1
		add			ecx, ebx
		movhps		qword ptr [ebx*8+2*4], xmm1
		pxor		xmm2, xmm7
		movlps		qword ptr [ebx*8+4*4], xmm2

		// triangle 3
		pshufd		xmm1, xmm0, R_SHUFFLE_PS( 3, 2, 1, 1 )
		movzx		ebx, byte ptr [esi+eax-1]
		pshufd		xmm2, xmm0, R_SHUFFLE_PS( 2, 3, 2, 3 )
		sub			ebx, 1
		pxor		xmm1, xmm6
		and			ebx, 3
		movlps		qword ptr [ecx*8+0*4], xmm1
		add			ebx, ecx
		movhps		qword ptr [ecx*8+2*4], xmm1
		pxor		xmm2, xmm7
		movlps		qword ptr [ecx*8+4*4], xmm2

		add			edx, 3*4
		add			eax, 4
		jle			loop4

	run1:
		sub			eax, 4
		jge			done

	loop1:
		lea			edx, [eax*2+eax]
		movdqu		xmm0, [edi+edx*4+0]
		paddd		xmm0, xmm0
		pshufd		xmm1, xmm0, R_SHUFFLE_PS( 2, 1, 0, 0 )
		pshufd		xmm2, xmm0, R_SHUFFLE_PS( 1, 2, 1, 2 )
		pxor		xmm1, xmm6
		movlps		qword ptr [ebx*8+0*4], xmm1
		pxor		xmm2, xmm7
		movhps		qword ptr [ebx*8+2*4], xmm1
		movzx		ecx, byte ptr [esi+eax]
		movlps		qword ptr [ebx*8+4*4], xmm2
		sub			ecx, 1
		and			ecx, 3
		add			ebx, ecx

		add			eax, 1
		jl			loop1

	done:
		shl			ebx, 3				// back to a byte pointer
		mov			num, ebx
		pop			ebx
	}

	// num is the final output byte address; convert to number of ints written
	return ( num - (int)shadowIndexes ) >> 2;

#else

	int i, j;
	int *si;

	si = shadowIndexes;
	for ( i = 0, j = 0; i < numIndexes; i += 3, j++ ) {
		if ( facing[j] ) {
			continue;
		}

		int i0 = indexes[i+0];
		int i1 = indexes[i+1];
		int i2 = indexes[i+2];

		i0 += i0;
		i1 += i1;
		i2 += i2;

		si[0] = i2;
		si[1] = i1;
		si[2] = i0;
		si[3] = i0 ^ 1;
		si[4] = i1 ^ 1;
		si[5] = i2 ^ 1;

		si += 6;
	}
	return si - shadowIndexes;

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::MixedSoundToSamples
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
	// Converts a float mix buffer to 16-bit samples with saturation
	// ( cvtps2dq + packssdw ), 16 samples per iteration.
	// NOTE(review): the loop relies on MIXBUFFER_SAMPLES being a multiple of
	// 16 so the assert below guarantees whole iterations -- confirm.

	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	__asm {

		mov			eax, numSamples
		test		eax, eax
		jz			done				// the loop is do-while; without this
										// early out numSamples == 0 would
										// still read 64 bytes and write 32
										// bytes past the valid buffers
		mov			edi, mixBuffer
		mov			esi, samples
		shl			eax, 2
		add			edi, eax
		neg			eax

	loop16:

		movaps		xmm0, [edi+eax+0*16]
		movaps		xmm1, [edi+eax+1*16]
		movaps		xmm2, [edi+eax+2*16]
		movaps		xmm3, [edi+eax+3*16]

		add			esi, 4*4*2

		cvtps2dq	xmm4, xmm0
		cvtps2dq	xmm5, xmm1
		cvtps2dq	xmm6, xmm2
		cvtps2dq	xmm7, xmm3

		prefetchnta	[edi+eax+128]

		packssdw	xmm4, xmm5
		packssdw	xmm6, xmm7

		add			eax, 4*16

		movlps		[esi-4*4*2], xmm4		// FIXME: should not use movlps/movhps to move integer data
		movhps		[esi-3*4*2], xmm4
		movlps		[esi-2*4*2], xmm6
		movhps		[esi-1*4*2], xmm6

		jl			loop16

	done:
	}
}
|
|
|
|
#endif /* _WINDOWS */
|