mirror of
https://github.com/dhewm/dhewm3.git
synced 2024-12-02 09:13:51 +00:00
736ec20d4d
Don't include the lazy precompiled.h everywhere, only what's required for the compilation unit. platform.h needs to be included instead to provide all essential defines and types. All includes use the relative path to the neo or the game specific root. Move all idlib related includes from idlib/Lib.h to precompiled.h. precompiled.h still exists for the MFC stuff in tools/. Add some missing header guards.
872 lines
24 KiB
C++
872 lines
24 KiB
C++
/*
|
|
===========================================================================
|
|
|
|
Doom 3 GPL Source Code
|
|
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
|
|
|
|
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
|
|
|
|
Doom 3 Source Code is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Doom 3 Source Code is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
|
|
|
|
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
|
|
|
|
===========================================================================
|
|
*/
|
|
|
|
#include "sys/platform.h"
|
|
|
|
#include "idlib/math/Simd_SSE2.h"
|
|
|
|
//===============================================================
|
|
//
|
|
// SSE2 implementation of idSIMDProcessor
|
|
//
|
|
//===============================================================
|
|
#if defined(__GNUC__) && defined(__SSE2__)
|
|
|
|
#include <emmintrin.h>
|
|
|
|
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
|
|
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE2::GetName( void ) const {
|
|
return "MMX & SSE & SSE2";
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpLT
|
|
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
int i, cnt, pre, post;
|
|
float *aligned;
|
|
__m128 xmm0, xmm1;
|
|
__m128i xmm0i;
|
|
int cnt_l;
|
|
char *src0_p;
|
|
char *constant_p;
|
|
char *dst_p;
|
|
int mask_l;
|
|
int dst_l;
|
|
|
|
/* if the float array is not aligned on a 4 byte boundary */
|
|
if ( ptrdiff_t(src0) & 3 ) {
|
|
/* unaligned memory access */
|
|
pre = 0;
|
|
cnt = count >> 2;
|
|
post = count - (cnt<<2);
|
|
|
|
/*
|
|
__asm mov edx, cnt
|
|
__asm test edx, edx
|
|
__asm je doneCmp
|
|
*/
|
|
cnt_l = cnt;
|
|
if(cnt_l != 0) {
|
|
/*
|
|
__asm push ebx
|
|
__asm neg edx
|
|
__asm mov esi, src0
|
|
__asm prefetchnta [esi+64]
|
|
__asm movss xmm1, constant
|
|
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
__asm mov edi, dst
|
|
__asm mov cl, bitNum
|
|
*/
|
|
cnt_l = -cnt_l;
|
|
src0_p = (char *) src0;
|
|
_mm_prefetch(src0_p+64, _MM_HINT_NTA);
|
|
constant_p = (char *) &constant;
|
|
xmm1 = _mm_load_ss((float *)constant_p);
|
|
xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
dst_p = (char *)dst;
|
|
/*
|
|
__asm loopNA:
|
|
*/
|
|
do {
|
|
/*
|
|
__asm movups xmm0, [esi]
|
|
__asm prefetchnta [esi+128]
|
|
__asm cmpltps xmm0, xmm1
|
|
__asm movmskps eax, xmm0 \
|
|
__asm mov ah, al
|
|
__asm shr ah, 1
|
|
__asm mov bx, ax
|
|
__asm shl ebx, 14
|
|
__asm mov bx, ax
|
|
__asm and ebx, 0x01010101
|
|
__asm shl ebx, cl
|
|
__asm or ebx, dword ptr [edi]
|
|
__asm mov dword ptr [edi], ebx
|
|
__asm add esi, 16
|
|
__asm add edi, 4
|
|
__asm inc edx
|
|
__asm jl loopNA
|
|
__asm pop ebx
|
|
*/
|
|
xmm0 = _mm_loadu_ps((float *) src0_p);
|
|
_mm_prefetch(src0_p+128, _MM_HINT_NTA);
|
|
xmm0 = _mm_cmplt_ps(xmm0, xmm1);
|
|
// Simplify using SSE2
|
|
xmm0i = (__m128i) xmm0;
|
|
xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
|
|
xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
|
|
mask_l = _mm_cvtsi128_si32(xmm0i);
|
|
// End
|
|
mask_l = mask_l & 0x01010101;
|
|
mask_l = mask_l << bitNum;
|
|
dst_l = *((int *) dst_p);
|
|
mask_l = mask_l | dst_l;
|
|
*((int *) dst_p) = mask_l;
|
|
src0_p = src0_p + 16;
|
|
dst_p = dst_p + 4;
|
|
cnt_l = cnt_l + 1;
|
|
} while (cnt_l < 0);
|
|
}
|
|
}
|
|
else {
|
|
/* aligned memory access */
|
|
aligned = (float *) ((ptrdiff_t(src0) + 15) & ~15);
|
|
if ( ptrdiff_t(aligned) > ptrdiff_t(src0) + count ) {
|
|
pre = count;
|
|
post = 0;
|
|
}
|
|
else {
|
|
pre = aligned - src0;
|
|
cnt = (count - pre) >> 2;
|
|
post = count - pre - (cnt<<2);
|
|
/*
|
|
__asm mov edx, cnt
|
|
__asm test edx, edx
|
|
__asm je doneCmp
|
|
*/
|
|
cnt_l = cnt;
|
|
if(cnt_l != 0) {
|
|
/*
|
|
__asm push ebx
|
|
__asm neg edx
|
|
__asm mov esi, aligned
|
|
__asm prefetchnta [esi+64]
|
|
__asm movss xmm1, constant
|
|
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
__asm mov edi, dst
|
|
__asm add edi, pre
|
|
__asm mov cl, bitNum
|
|
*/
|
|
cnt_l = -cnt_l;
|
|
src0_p = (char *) src0;
|
|
_mm_prefetch(src0_p+64, _MM_HINT_NTA);
|
|
constant_p = (char *) &constant;
|
|
xmm1 = _mm_load_ss((float *)constant_p);
|
|
xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
dst_p = (char *)dst;
|
|
dst_p = dst_p + pre;
|
|
/*
|
|
__asm loopA:
|
|
*/
|
|
do {
|
|
/*
|
|
__asm movaps xmm0, [esi]
|
|
__asm prefetchnta [esi+128]
|
|
__asm cmpltps xmm0, xmm1
|
|
__asm movmskps eax, xmm0 \
|
|
__asm mov ah, al
|
|
__asm shr ah, 1
|
|
__asm mov bx, ax
|
|
__asm shl ebx, 14
|
|
__asm mov bx, ax
|
|
__asm and ebx, 0x01010101
|
|
__asm shl ebx, cl
|
|
__asm or ebx, dword ptr [edi]
|
|
__asm mov dword ptr [edi], ebx
|
|
__asm add esi, 16
|
|
__asm add edi, 4
|
|
__asm inc edx
|
|
__asm jl loopA
|
|
__asm pop ebx
|
|
*/
|
|
xmm0 = _mm_load_ps((float *) src0_p);
|
|
_mm_prefetch(src0_p+128, _MM_HINT_NTA);
|
|
xmm0 = _mm_cmplt_ps(xmm0, xmm1);
|
|
// Simplify using SSE2
|
|
xmm0i = (__m128i) xmm0;
|
|
xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
|
|
xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
|
|
mask_l = _mm_cvtsi128_si32(xmm0i);
|
|
// End
|
|
mask_l = mask_l & 0x01010101;
|
|
mask_l = mask_l << bitNum;
|
|
dst_l = *((int *) dst_p);
|
|
mask_l = mask_l | dst_l;
|
|
*((int *) dst_p) = mask_l;
|
|
src0_p = src0_p + 16;
|
|
dst_p = dst_p + 4;
|
|
cnt_l = cnt_l + 1;
|
|
} while (cnt_l < 0);
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
doneCmp:
|
|
*/
|
|
float c = constant;
|
|
for ( i = 0; i < pre; i++ ) {
|
|
dst[i] |= ( src0[i] < c ) << bitNum;
|
|
}
|
|
for ( i = count - post; i < count; i++ ) {
|
|
dst[i] |= ( src0[i] < c ) << bitNum;
|
|
}
|
|
}
|
|
|
|
#elif defined(_MSC_VER)
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
|
|
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
|
|
#define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
|
|
#define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
|
|
|
|
|
|
#define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
|
|
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
|
|
#define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
|
|
|
|
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
|
|
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
|
|
|
|
ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) );
|
|
|
|
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
|
|
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
|
|
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
|
|
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE2::GetName( void ) const {
|
|
return "MMX & SSE & SSE2";
|
|
}
|
|
|
|
#if 0 // the SSE2 code is ungodly slow
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::MatX_LowerTriangularSolve
|
|
|
|
solves x in Lx = b for the n * n sub-matrix of L
|
|
if skip > 0 the first skip elements of x are assumed to be valid already
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
if ( skip >= n ) {
|
|
return;
|
|
}
|
|
|
|
lptr = L[skip];
|
|
nc = L.GetNumColumns();
|
|
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
#define NSKIP( n, s ) ((n<<3)|(s&7))
|
|
switch( NSKIP( n, skip ) ) {
|
|
case NSKIP( 1, 0 ): x[0] = b[0];
|
|
return;
|
|
case NSKIP( 2, 0 ): x[0] = b[0];
|
|
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
return;
|
|
case NSKIP( 3, 0 ): x[0] = b[0];
|
|
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
return;
|
|
case NSKIP( 4, 0 ): x[0] = b[0];
|
|
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
return;
|
|
case NSKIP( 5, 0 ): x[0] = b[0];
|
|
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
return;
|
|
case NSKIP( 6, 0 ): x[0] = b[0];
|
|
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
|
|
return;
|
|
case NSKIP( 7, 0 ): x[0] = b[0];
|
|
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
|
|
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
// process first 4 rows
|
|
switch( skip ) {
|
|
case 0: x[0] = b[0];
|
|
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
skip = 4;
|
|
}
|
|
|
|
lptr = L[skip];
|
|
|
|
__asm {
|
|
push ebx
|
|
mov eax, skip // eax = i
|
|
shl eax, 2 // eax = i*4
|
|
mov edx, n // edx = n
|
|
shl edx, 2 // edx = n*4
|
|
mov esi, x // esi = x
|
|
mov edi, lptr // edi = lptr
|
|
add esi, eax
|
|
add edi, eax
|
|
mov ebx, b // ebx = b
|
|
// aligned
|
|
looprow:
|
|
mov ecx, eax
|
|
neg ecx
|
|
cvtps2pd xmm0, [esi+ecx]
|
|
cvtps2pd xmm2, [edi+ecx]
|
|
mulpd xmm0, xmm2
|
|
cvtps2pd xmm1, [esi+ecx+8]
|
|
cvtps2pd xmm3, [edi+ecx+8]
|
|
mulpd xmm1, xmm3
|
|
add ecx, 20*4
|
|
jg donedot16
|
|
dot16:
|
|
cvtps2pd xmm2, [esi+ecx-(16*4)]
|
|
cvtps2pd xmm3, [edi+ecx-(16*4)]
|
|
cvtps2pd xmm4, [esi+ecx-(14*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(14*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm2, [esi+ecx-(12*4)]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(12*4)]
|
|
addpd xmm1, xmm4
|
|
cvtps2pd xmm4, [esi+ecx-(10*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(10*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm2, [esi+ecx-(8*4)]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(8*4)]
|
|
addpd xmm1, xmm4
|
|
cvtps2pd xmm4, [esi+ecx-(6*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(6*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm2, [esi+ecx-(4*4)]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(4*4)]
|
|
addpd xmm1, xmm4
|
|
cvtps2pd xmm4, [esi+ecx-(2*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(2*4)]
|
|
addpd xmm0, xmm2
|
|
add ecx, 16*4
|
|
mulpd xmm4, xmm5
|
|
addpd xmm1, xmm4
|
|
jle dot16
|
|
donedot16:
|
|
sub ecx, 8*4
|
|
jg donedot8
|
|
dot8:
|
|
cvtps2pd xmm2, [esi+ecx-(8*4)]
|
|
cvtps2pd xmm3, [edi+ecx-(8*4)]
|
|
cvtps2pd xmm7, [esi+ecx-(6*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(6*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm6, [esi+ecx-(4*4)]
|
|
mulpd xmm7, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(4*4)]
|
|
addpd xmm1, xmm7
|
|
cvtps2pd xmm4, [esi+ecx-(2*4)]
|
|
mulpd xmm6, xmm3
|
|
cvtps2pd xmm7, [edi+ecx-(2*4)]
|
|
addpd xmm0, xmm6
|
|
add ecx, 8*4
|
|
mulpd xmm4, xmm7
|
|
addpd xmm1, xmm4
|
|
donedot8:
|
|
sub ecx, 4*4
|
|
jg donedot4
|
|
dot4:
|
|
cvtps2pd xmm2, [esi+ecx-(4*4)]
|
|
cvtps2pd xmm3, [edi+ecx-(4*4)]
|
|
cvtps2pd xmm4, [esi+ecx-(2*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(2*4)]
|
|
addpd xmm0, xmm2
|
|
add ecx, 4*4
|
|
mulpd xmm4, xmm5
|
|
addpd xmm1, xmm4
|
|
donedot4:
|
|
addpd xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
|
|
addsd xmm0, xmm1
|
|
sub ecx, 4*4
|
|
jz dot0
|
|
add ecx, 4
|
|
jz dot1
|
|
add ecx, 4
|
|
jz dot2
|
|
//dot3:
|
|
cvtss2sd xmm1, [esi-(3*4)]
|
|
cvtss2sd xmm2, [edi-(3*4)]
|
|
mulsd xmm1, xmm2
|
|
addsd xmm0, xmm1
|
|
dot2:
|
|
cvtss2sd xmm3, [esi-(2*4)]
|
|
cvtss2sd xmm4, [edi-(2*4)]
|
|
mulsd xmm3, xmm4
|
|
addsd xmm0, xmm3
|
|
dot1:
|
|
cvtss2sd xmm5, [esi-(1*4)]
|
|
cvtss2sd xmm6, [edi-(1*4)]
|
|
mulsd xmm5, xmm6
|
|
addsd xmm0, xmm5
|
|
dot0:
|
|
cvtss2sd xmm1, [ebx+eax]
|
|
subsd xmm1, xmm0
|
|
cvtsd2ss xmm0, xmm1
|
|
movss [esi], xmm0
|
|
add eax, 4
|
|
cmp eax, edx
|
|
jge done
|
|
add esi, 4
|
|
mov ecx, nc
|
|
shl ecx, 2
|
|
add edi, ecx
|
|
add edi, 4
|
|
jmp looprow
|
|
// done
|
|
done:
|
|
pop ebx
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::MatX_LowerTriangularSolveTranspose
|
|
|
|
solves x in L'x = b for the n * n sub-matrix of L
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
lptr = L.ToFloatPtr();
|
|
nc = L.GetNumColumns();
|
|
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
switch( n ) {
|
|
case 0:
|
|
return;
|
|
case 1:
|
|
x[0] = b[0];
|
|
return;
|
|
case 2:
|
|
x[1] = b[1];
|
|
x[0] = b[0] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 3:
|
|
x[2] = b[2];
|
|
x[1] = b[1] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 4:
|
|
x[3] = b[3];
|
|
x[2] = b[2] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 5:
|
|
x[4] = b[4];
|
|
x[3] = b[3] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 6:
|
|
x[5] = b[5];
|
|
x[4] = b[4] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 7:
|
|
x[6] = b[6];
|
|
x[5] = b[5] - lptr[6*nc+5] * x[6];
|
|
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
int i, j, m;
|
|
float *xptr;
|
|
double s0;
|
|
|
|
// if the number of columns is not a multiple of 2 we're screwed for alignment.
|
|
// however, if the number of columns is a multiple of 2 but the number of to be
|
|
// processed rows is not a multiple of 2 we can still run 8 byte aligned
|
|
m = n;
|
|
if ( m & 1 ) {
|
|
m--;
|
|
x[m] = b[m];
|
|
|
|
lptr = L[m] + m - 4;
|
|
xptr = x + m;
|
|
__asm {
|
|
push ebx
|
|
mov eax, m // eax = i
|
|
mov esi, xptr // esi = xptr
|
|
mov edi, lptr // edi = lptr
|
|
mov ebx, b // ebx = b
|
|
mov edx, nc // edx = nc*sizeof(float)
|
|
shl edx, 2
|
|
process4rows_1:
|
|
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
|
|
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
|
|
xor ecx, ecx
|
|
sub eax, m
|
|
neg eax
|
|
jz done4x4_1
|
|
process4x4_1: // process 4x4 blocks
|
|
cvtps2pd xmm3, [edi]
|
|
cvtps2pd xmm4, [edi+8]
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+0]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+4]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
cvtps2pd xmm3, [edi]
|
|
mulpd xmm6, xmm7
|
|
cvtps2pd xmm4, [edi+8]
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+8]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+12]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
add ecx, 4
|
|
mulpd xmm6, xmm7
|
|
cmp ecx, eax
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
jl process4x4_1
|
|
done4x4_1: // process left over of the 4 rows
|
|
cvtps2pd xmm3, [edi]
|
|
cvtps2pd xmm4, [edi+8]
|
|
cvtss2sd xmm5, [esi+4*ecx]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
mulpd xmm4, xmm5
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
imul ecx, edx
|
|
sub edi, ecx
|
|
neg eax
|
|
|
|
add eax, m
|
|
sub eax, 4
|
|
movapd xmm1, xmm0
|
|
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
|
|
movapd xmm3, xmm2
|
|
shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
|
|
sub edi, edx
|
|
cvtsd2ss xmm7, xmm3
|
|
movss [esi-4], xmm7 // xptr[-1] = s3
|
|
movsd xmm4, xmm3
|
|
movsd xmm5, xmm3
|
|
cvtss2sd xmm7, [edi+8]
|
|
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
|
|
subsd xmm2, xmm3
|
|
cvtsd2ss xmm7, xmm2
|
|
movss [esi-8], xmm7 // xptr[-2] = s2
|
|
movsd xmm6, xmm2
|
|
sub edi, edx
|
|
subsd xmm0, xmm5
|
|
subsd xmm1, xmm4
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
|
|
subsd xmm1, xmm2
|
|
cvtsd2ss xmm7, xmm1
|
|
movss [esi-12], xmm7 // xptr[-3] = s1
|
|
subsd xmm0, xmm6
|
|
sub edi, edx
|
|
cmp eax, 4
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
|
|
subsd xmm0, xmm1
|
|
cvtsd2ss xmm7, xmm0
|
|
movss [esi-16], xmm7 // xptr[-4] = s0
|
|
jl done4rows_1
|
|
sub edi, edx
|
|
sub edi, 16
|
|
sub esi, 16
|
|
jmp process4rows_1
|
|
done4rows_1:
|
|
pop ebx
|
|
}
|
|
}
|
|
else {
|
|
lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
|
|
xptr = x + m;
|
|
__asm {
|
|
push ebx
|
|
mov eax, m // eax = i
|
|
mov esi, xptr // esi = xptr
|
|
mov edi, lptr // edi = lptr
|
|
mov ebx, b // ebx = b
|
|
mov edx, nc // edx = nc*sizeof(float)
|
|
shl edx, 2
|
|
process4rows:
|
|
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
|
|
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
|
|
sub eax, m
|
|
jz done4x4
|
|
neg eax
|
|
xor ecx, ecx
|
|
process4x4: // process 4x4 blocks
|
|
cvtps2pd xmm3, [edi]
|
|
cvtps2pd xmm4, [edi+8]
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+0]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+4]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
cvtps2pd xmm3, [edi]
|
|
mulpd xmm6, xmm7
|
|
cvtps2pd xmm4, [edi+8]
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+8]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+12]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
add ecx, 4
|
|
mulpd xmm6, xmm7
|
|
cmp ecx, eax
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
jl process4x4
|
|
imul ecx, edx
|
|
sub edi, ecx
|
|
neg eax
|
|
done4x4: // process left over of the 4 rows
|
|
add eax, m
|
|
sub eax, 4
|
|
movapd xmm1, xmm0
|
|
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
|
|
movapd xmm3, xmm2
|
|
shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
|
|
sub edi, edx
|
|
cvtsd2ss xmm7, xmm3
|
|
movss [esi-4], xmm7 // xptr[-1] = s3
|
|
movsd xmm4, xmm3
|
|
movsd xmm5, xmm3
|
|
cvtss2sd xmm7, [edi+8]
|
|
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
|
|
subsd xmm2, xmm3
|
|
cvtsd2ss xmm7, xmm2
|
|
movss [esi-8], xmm7 // xptr[-2] = s2
|
|
movsd xmm6, xmm2
|
|
sub edi, edx
|
|
subsd xmm0, xmm5
|
|
subsd xmm1, xmm4
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
|
|
subsd xmm1, xmm2
|
|
cvtsd2ss xmm7, xmm1
|
|
movss [esi-12], xmm7 // xptr[-3] = s1
|
|
subsd xmm0, xmm6
|
|
sub edi, edx
|
|
cmp eax, 4
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
|
|
subsd xmm0, xmm1
|
|
cvtsd2ss xmm7, xmm0
|
|
movss [esi-16], xmm7 // xptr[-4] = s0
|
|
jl done4rows
|
|
sub edi, edx
|
|
sub edi, 16
|
|
sub esi, 16
|
|
jmp process4rows
|
|
done4rows:
|
|
pop ebx
|
|
}
|
|
}
|
|
|
|
// process left over rows
|
|
for ( i = (m&3)-1; i >= 0; i-- ) {
|
|
s0 = b[i];
|
|
lptr = L[i+1] + i;
|
|
for ( j = i + 1; j < m; j++ ) {
|
|
s0 -= lptr[0] * x[j];
|
|
lptr += nc;
|
|
}
|
|
x[i] = s0;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::MixedSoundToSamples
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
|
|
|
|
assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
|
|
|
|
__asm {
|
|
|
|
mov eax, numSamples
|
|
mov edi, mixBuffer
|
|
mov esi, samples
|
|
shl eax, 2
|
|
add edi, eax
|
|
neg eax
|
|
|
|
loop16:
|
|
|
|
movaps xmm0, [edi+eax+0*16]
|
|
movaps xmm1, [edi+eax+1*16]
|
|
movaps xmm2, [edi+eax+2*16]
|
|
movaps xmm3, [edi+eax+3*16]
|
|
|
|
add esi, 4*4*2
|
|
|
|
cvtps2dq xmm4, xmm0
|
|
cvtps2dq xmm5, xmm1
|
|
cvtps2dq xmm6, xmm2
|
|
cvtps2dq xmm7, xmm3
|
|
|
|
prefetchnta [edi+eax+128]
|
|
|
|
packssdw xmm4, xmm5
|
|
packssdw xmm6, xmm7
|
|
|
|
add eax, 4*16
|
|
|
|
movlps [esi-4*4*2], xmm4 // FIXME: should not use movlps/movhps to move integer data
|
|
movhps [esi-3*4*2], xmm4
|
|
movlps [esi-2*4*2], xmm6
|
|
movhps [esi-1*4*2], xmm6
|
|
|
|
jl loop16
|
|
}
|
|
}
|
|
|
|
#endif /* _WIN32 */
|