mirror of
https://github.com/dhewm/dhewm3.git
synced 2024-11-27 06:32:27 +00:00
545004a04c
sizeof(float) != sizeof(long) on this ABI.
877 lines
24 KiB
C++
877 lines
24 KiB
C++
/*
|
|
===========================================================================
|
|
|
|
Doom 3 GPL Source Code
|
|
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
|
|
|
|
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
|
|
|
|
Doom 3 Source Code is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Doom 3 Source Code is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
|
|
|
|
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
|
|
|
|
===========================================================================
|
|
*/
|
|
|
|
#include "../precompiled.h"
|
|
#pragma hdrstop
|
|
|
|
#include "Simd_Generic.h"
|
|
#include "Simd_MMX.h"
|
|
#include "Simd_SSE.h"
|
|
#include "Simd_SSE2.h"
|
|
|
|
|
|
//===============================================================
|
|
//
|
|
// SSE2 implementation of idSIMDProcessor
|
|
//
|
|
//===============================================================
|
|
#if defined(MACOS_X) && defined(__i386__)
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
|
|
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE2::GetName( void ) const {
|
|
return "MMX & SSE & SSE2";
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpLT
|
|
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
int i, cnt, pre, post;
|
|
float *aligned;
|
|
__m128 xmm0, xmm1;
|
|
__m128i xmm0i;
|
|
int cnt_l;
|
|
char *src0_p;
|
|
char *constant_p;
|
|
char *dst_p;
|
|
int mask_l;
|
|
int dst_l;
|
|
|
|
/* if the float array is not aligned on a 4 byte boundary */
|
|
if ( ((int) src0) & 3 ) {
|
|
/* unaligned memory access */
|
|
pre = 0;
|
|
cnt = count >> 2;
|
|
post = count - (cnt<<2);
|
|
|
|
/*
|
|
__asm mov edx, cnt
|
|
__asm test edx, edx
|
|
__asm je doneCmp
|
|
*/
|
|
cnt_l = cnt;
|
|
if(cnt_l != 0) {
|
|
/*
|
|
__asm push ebx
|
|
__asm neg edx
|
|
__asm mov esi, src0
|
|
__asm prefetchnta [esi+64]
|
|
__asm movss xmm1, constant
|
|
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
__asm mov edi, dst
|
|
__asm mov cl, bitNum
|
|
*/
|
|
cnt_l = -cnt_l;
|
|
src0_p = (char *) src0;
|
|
_mm_prefetch(src0_p+64, _MM_HINT_NTA);
|
|
constant_p = (char *) &constant;
|
|
xmm1 = _mm_load_ss((float *)constant_p);
|
|
xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
dst_p = (char *)dst;
|
|
/*
|
|
__asm loopNA:
|
|
*/
|
|
do {
|
|
/*
|
|
__asm movups xmm0, [esi]
|
|
__asm prefetchnta [esi+128]
|
|
__asm cmpltps xmm0, xmm1
|
|
__asm movmskps eax, xmm0 \
|
|
__asm mov ah, al
|
|
__asm shr ah, 1
|
|
__asm mov bx, ax
|
|
__asm shl ebx, 14
|
|
__asm mov bx, ax
|
|
__asm and ebx, 0x01010101
|
|
__asm shl ebx, cl
|
|
__asm or ebx, dword ptr [edi]
|
|
__asm mov dword ptr [edi], ebx
|
|
__asm add esi, 16
|
|
__asm add edi, 4
|
|
__asm inc edx
|
|
__asm jl loopNA
|
|
__asm pop ebx
|
|
*/
|
|
xmm0 = _mm_loadu_ps((float *) src0_p);
|
|
_mm_prefetch(src0_p+128, _MM_HINT_NTA);
|
|
xmm0 = _mm_cmplt_ps(xmm0, xmm1);
|
|
// Simplify using SSE2
|
|
xmm0i = (__m128i) xmm0;
|
|
xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
|
|
xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
|
|
mask_l = _mm_cvtsi128_si32(xmm0i);
|
|
// End
|
|
mask_l = mask_l & 0x01010101;
|
|
mask_l = mask_l << bitNum;
|
|
dst_l = *((int *) dst_p);
|
|
mask_l = mask_l | dst_l;
|
|
*((int *) dst_p) = mask_l;
|
|
src0_p = src0_p + 16;
|
|
dst_p = dst_p + 4;
|
|
cnt_l = cnt_l + 1;
|
|
} while (cnt_l < 0);
|
|
}
|
|
}
|
|
else {
|
|
/* aligned memory access */
|
|
aligned = (float *) ((((int) src0) + 15) & ~15);
|
|
if ( (int)aligned > ((int)src0) + count ) {
|
|
pre = count;
|
|
post = 0;
|
|
}
|
|
else {
|
|
pre = aligned - src0;
|
|
cnt = (count - pre) >> 2;
|
|
post = count - pre - (cnt<<2);
|
|
/*
|
|
__asm mov edx, cnt
|
|
__asm test edx, edx
|
|
__asm je doneCmp
|
|
*/
|
|
cnt_l = cnt;
|
|
if(cnt_l != 0) {
|
|
/*
|
|
__asm push ebx
|
|
__asm neg edx
|
|
__asm mov esi, aligned
|
|
__asm prefetchnta [esi+64]
|
|
__asm movss xmm1, constant
|
|
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
__asm mov edi, dst
|
|
__asm add edi, pre
|
|
__asm mov cl, bitNum
|
|
*/
|
|
cnt_l = -cnt_l;
|
|
src0_p = (char *) src0;
|
|
_mm_prefetch(src0_p+64, _MM_HINT_NTA);
|
|
constant_p = (char *) &constant;
|
|
xmm1 = _mm_load_ss((float *)constant_p);
|
|
xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
dst_p = (char *)dst;
|
|
dst_p = dst_p + pre;
|
|
/*
|
|
__asm loopA:
|
|
*/
|
|
do {
|
|
/*
|
|
__asm movaps xmm0, [esi]
|
|
__asm prefetchnta [esi+128]
|
|
__asm cmpltps xmm0, xmm1
|
|
__asm movmskps eax, xmm0 \
|
|
__asm mov ah, al
|
|
__asm shr ah, 1
|
|
__asm mov bx, ax
|
|
__asm shl ebx, 14
|
|
__asm mov bx, ax
|
|
__asm and ebx, 0x01010101
|
|
__asm shl ebx, cl
|
|
__asm or ebx, dword ptr [edi]
|
|
__asm mov dword ptr [edi], ebx
|
|
__asm add esi, 16
|
|
__asm add edi, 4
|
|
__asm inc edx
|
|
__asm jl loopA
|
|
__asm pop ebx
|
|
*/
|
|
xmm0 = _mm_load_ps((float *) src0_p);
|
|
_mm_prefetch(src0_p+128, _MM_HINT_NTA);
|
|
xmm0 = _mm_cmplt_ps(xmm0, xmm1);
|
|
// Simplify using SSE2
|
|
xmm0i = (__m128i) xmm0;
|
|
xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
|
|
xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
|
|
mask_l = _mm_cvtsi128_si32(xmm0i);
|
|
// End
|
|
mask_l = mask_l & 0x01010101;
|
|
mask_l = mask_l << bitNum;
|
|
dst_l = *((int *) dst_p);
|
|
mask_l = mask_l | dst_l;
|
|
*((int *) dst_p) = mask_l;
|
|
src0_p = src0_p + 16;
|
|
dst_p = dst_p + 4;
|
|
cnt_l = cnt_l + 1;
|
|
} while (cnt_l < 0);
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
doneCmp:
|
|
*/
|
|
float c = constant;
|
|
for ( i = 0; i < pre; i++ ) {
|
|
dst[i] |= ( src0[i] < c ) << bitNum;
|
|
}
|
|
for ( i = count - post; i < count; i++ ) {
|
|
dst[i] |= ( src0[i] < c ) << bitNum;
|
|
}
|
|
}
|
|
|
|
#elif defined(_WIN32)
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
|
|
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
|
|
#define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
|
|
#define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
|
|
|
|
|
|
#define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
|
|
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
|
|
#define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
|
|
|
|
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
|
|
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
|
|
|
|
ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) );
|
|
|
|
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
|
|
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
|
|
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
|
|
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE2::GetName( void ) const {
|
|
return "MMX & SSE & SSE2";
|
|
}
|
|
|
|
#if 0 // the SSE2 code is ungodly slow
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::MatX_LowerTriangularSolve
|
|
|
|
solves x in Lx = b for the n * n sub-matrix of L
|
|
if skip > 0 the first skip elements of x are assumed to be valid already
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
if ( skip >= n ) {
|
|
return;
|
|
}
|
|
|
|
lptr = L[skip];
|
|
nc = L.GetNumColumns();
|
|
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
#define NSKIP( n, s ) ((n<<3)|(s&7))
|
|
switch( NSKIP( n, skip ) ) {
|
|
case NSKIP( 1, 0 ): x[0] = b[0];
|
|
return;
|
|
case NSKIP( 2, 0 ): x[0] = b[0];
|
|
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
return;
|
|
case NSKIP( 3, 0 ): x[0] = b[0];
|
|
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
return;
|
|
case NSKIP( 4, 0 ): x[0] = b[0];
|
|
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
return;
|
|
case NSKIP( 5, 0 ): x[0] = b[0];
|
|
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
return;
|
|
case NSKIP( 6, 0 ): x[0] = b[0];
|
|
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
|
|
return;
|
|
case NSKIP( 7, 0 ): x[0] = b[0];
|
|
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
|
|
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
// process first 4 rows
|
|
switch( skip ) {
|
|
case 0: x[0] = b[0];
|
|
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
skip = 4;
|
|
}
|
|
|
|
lptr = L[skip];
|
|
|
|
__asm {
|
|
push ebx
|
|
mov eax, skip // eax = i
|
|
shl eax, 2 // eax = i*4
|
|
mov edx, n // edx = n
|
|
shl edx, 2 // edx = n*4
|
|
mov esi, x // esi = x
|
|
mov edi, lptr // edi = lptr
|
|
add esi, eax
|
|
add edi, eax
|
|
mov ebx, b // ebx = b
|
|
// aligned
|
|
looprow:
|
|
mov ecx, eax
|
|
neg ecx
|
|
cvtps2pd xmm0, [esi+ecx]
|
|
cvtps2pd xmm2, [edi+ecx]
|
|
mulpd xmm0, xmm2
|
|
cvtps2pd xmm1, [esi+ecx+8]
|
|
cvtps2pd xmm3, [edi+ecx+8]
|
|
mulpd xmm1, xmm3
|
|
add ecx, 20*4
|
|
jg donedot16
|
|
dot16:
|
|
cvtps2pd xmm2, [esi+ecx-(16*4)]
|
|
cvtps2pd xmm3, [edi+ecx-(16*4)]
|
|
cvtps2pd xmm4, [esi+ecx-(14*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(14*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm2, [esi+ecx-(12*4)]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(12*4)]
|
|
addpd xmm1, xmm4
|
|
cvtps2pd xmm4, [esi+ecx-(10*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(10*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm2, [esi+ecx-(8*4)]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(8*4)]
|
|
addpd xmm1, xmm4
|
|
cvtps2pd xmm4, [esi+ecx-(6*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(6*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm2, [esi+ecx-(4*4)]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(4*4)]
|
|
addpd xmm1, xmm4
|
|
cvtps2pd xmm4, [esi+ecx-(2*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(2*4)]
|
|
addpd xmm0, xmm2
|
|
add ecx, 16*4
|
|
mulpd xmm4, xmm5
|
|
addpd xmm1, xmm4
|
|
jle dot16
|
|
donedot16:
|
|
sub ecx, 8*4
|
|
jg donedot8
|
|
dot8:
|
|
cvtps2pd xmm2, [esi+ecx-(8*4)]
|
|
cvtps2pd xmm3, [edi+ecx-(8*4)]
|
|
cvtps2pd xmm7, [esi+ecx-(6*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(6*4)]
|
|
addpd xmm0, xmm2
|
|
cvtps2pd xmm6, [esi+ecx-(4*4)]
|
|
mulpd xmm7, xmm5
|
|
cvtps2pd xmm3, [edi+ecx-(4*4)]
|
|
addpd xmm1, xmm7
|
|
cvtps2pd xmm4, [esi+ecx-(2*4)]
|
|
mulpd xmm6, xmm3
|
|
cvtps2pd xmm7, [edi+ecx-(2*4)]
|
|
addpd xmm0, xmm6
|
|
add ecx, 8*4
|
|
mulpd xmm4, xmm7
|
|
addpd xmm1, xmm4
|
|
donedot8:
|
|
sub ecx, 4*4
|
|
jg donedot4
|
|
dot4:
|
|
cvtps2pd xmm2, [esi+ecx-(4*4)]
|
|
cvtps2pd xmm3, [edi+ecx-(4*4)]
|
|
cvtps2pd xmm4, [esi+ecx-(2*4)]
|
|
mulpd xmm2, xmm3
|
|
cvtps2pd xmm5, [edi+ecx-(2*4)]
|
|
addpd xmm0, xmm2
|
|
add ecx, 4*4
|
|
mulpd xmm4, xmm5
|
|
addpd xmm1, xmm4
|
|
donedot4:
|
|
addpd xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
|
|
addsd xmm0, xmm1
|
|
sub ecx, 4*4
|
|
jz dot0
|
|
add ecx, 4
|
|
jz dot1
|
|
add ecx, 4
|
|
jz dot2
|
|
//dot3:
|
|
cvtss2sd xmm1, [esi-(3*4)]
|
|
cvtss2sd xmm2, [edi-(3*4)]
|
|
mulsd xmm1, xmm2
|
|
addsd xmm0, xmm1
|
|
dot2:
|
|
cvtss2sd xmm3, [esi-(2*4)]
|
|
cvtss2sd xmm4, [edi-(2*4)]
|
|
mulsd xmm3, xmm4
|
|
addsd xmm0, xmm3
|
|
dot1:
|
|
cvtss2sd xmm5, [esi-(1*4)]
|
|
cvtss2sd xmm6, [edi-(1*4)]
|
|
mulsd xmm5, xmm6
|
|
addsd xmm0, xmm5
|
|
dot0:
|
|
cvtss2sd xmm1, [ebx+eax]
|
|
subsd xmm1, xmm0
|
|
cvtsd2ss xmm0, xmm1
|
|
movss [esi], xmm0
|
|
add eax, 4
|
|
cmp eax, edx
|
|
jge done
|
|
add esi, 4
|
|
mov ecx, nc
|
|
shl ecx, 2
|
|
add edi, ecx
|
|
add edi, 4
|
|
jmp looprow
|
|
// done
|
|
done:
|
|
pop ebx
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::MatX_LowerTriangularSolveTranspose
|
|
|
|
solves x in L'x = b for the n * n sub-matrix of L
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
lptr = L.ToFloatPtr();
|
|
nc = L.GetNumColumns();
|
|
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
switch( n ) {
|
|
case 0:
|
|
return;
|
|
case 1:
|
|
x[0] = b[0];
|
|
return;
|
|
case 2:
|
|
x[1] = b[1];
|
|
x[0] = b[0] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 3:
|
|
x[2] = b[2];
|
|
x[1] = b[1] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 4:
|
|
x[3] = b[3];
|
|
x[2] = b[2] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 5:
|
|
x[4] = b[4];
|
|
x[3] = b[3] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 6:
|
|
x[5] = b[5];
|
|
x[4] = b[4] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 7:
|
|
x[6] = b[6];
|
|
x[5] = b[5] - lptr[6*nc+5] * x[6];
|
|
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
int i, j, m;
|
|
float *xptr;
|
|
double s0;
|
|
|
|
// if the number of columns is not a multiple of 2 we're screwed for alignment.
|
|
// however, if the number of columns is a multiple of 2 but the number of to be
|
|
// processed rows is not a multiple of 2 we can still run 8 byte aligned
|
|
m = n;
|
|
if ( m & 1 ) {
|
|
m--;
|
|
x[m] = b[m];
|
|
|
|
lptr = L[m] + m - 4;
|
|
xptr = x + m;
|
|
__asm {
|
|
push ebx
|
|
mov eax, m // eax = i
|
|
mov esi, xptr // esi = xptr
|
|
mov edi, lptr // edi = lptr
|
|
mov ebx, b // ebx = b
|
|
mov edx, nc // edx = nc*sizeof(float)
|
|
shl edx, 2
|
|
process4rows_1:
|
|
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
|
|
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
|
|
xor ecx, ecx
|
|
sub eax, m
|
|
neg eax
|
|
jz done4x4_1
|
|
process4x4_1: // process 4x4 blocks
|
|
cvtps2pd xmm3, [edi]
|
|
cvtps2pd xmm4, [edi+8]
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+0]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+4]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
cvtps2pd xmm3, [edi]
|
|
mulpd xmm6, xmm7
|
|
cvtps2pd xmm4, [edi+8]
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+8]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+12]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
add ecx, 4
|
|
mulpd xmm6, xmm7
|
|
cmp ecx, eax
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
jl process4x4_1
|
|
done4x4_1: // process left over of the 4 rows
|
|
cvtps2pd xmm3, [edi]
|
|
cvtps2pd xmm4, [edi+8]
|
|
cvtss2sd xmm5, [esi+4*ecx]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
mulpd xmm4, xmm5
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
imul ecx, edx
|
|
sub edi, ecx
|
|
neg eax
|
|
|
|
add eax, m
|
|
sub eax, 4
|
|
movapd xmm1, xmm0
|
|
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
|
|
movapd xmm3, xmm2
|
|
shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
|
|
sub edi, edx
|
|
cvtsd2ss xmm7, xmm3
|
|
movss [esi-4], xmm7 // xptr[-1] = s3
|
|
movsd xmm4, xmm3
|
|
movsd xmm5, xmm3
|
|
cvtss2sd xmm7, [edi+8]
|
|
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
|
|
subsd xmm2, xmm3
|
|
cvtsd2ss xmm7, xmm2
|
|
movss [esi-8], xmm7 // xptr[-2] = s2
|
|
movsd xmm6, xmm2
|
|
sub edi, edx
|
|
subsd xmm0, xmm5
|
|
subsd xmm1, xmm4
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
|
|
subsd xmm1, xmm2
|
|
cvtsd2ss xmm7, xmm1
|
|
movss [esi-12], xmm7 // xptr[-3] = s1
|
|
subsd xmm0, xmm6
|
|
sub edi, edx
|
|
cmp eax, 4
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
|
|
subsd xmm0, xmm1
|
|
cvtsd2ss xmm7, xmm0
|
|
movss [esi-16], xmm7 // xptr[-4] = s0
|
|
jl done4rows_1
|
|
sub edi, edx
|
|
sub edi, 16
|
|
sub esi, 16
|
|
jmp process4rows_1
|
|
done4rows_1:
|
|
pop ebx
|
|
}
|
|
}
|
|
else {
|
|
lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
|
|
xptr = x + m;
|
|
__asm {
|
|
push ebx
|
|
mov eax, m // eax = i
|
|
mov esi, xptr // esi = xptr
|
|
mov edi, lptr // edi = lptr
|
|
mov ebx, b // ebx = b
|
|
mov edx, nc // edx = nc*sizeof(float)
|
|
shl edx, 2
|
|
process4rows:
|
|
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
|
|
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
|
|
sub eax, m
|
|
jz done4x4
|
|
neg eax
|
|
xor ecx, ecx
|
|
process4x4: // process 4x4 blocks
|
|
cvtps2pd xmm3, [edi]
|
|
cvtps2pd xmm4, [edi+8]
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+0]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+4]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
cvtps2pd xmm3, [edi]
|
|
mulpd xmm6, xmm7
|
|
cvtps2pd xmm4, [edi+8]
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
add edi, edx
|
|
cvtss2sd xmm5, [esi+4*ecx+8]
|
|
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm3, xmm5
|
|
cvtps2pd xmm1, [edi]
|
|
mulpd xmm4, xmm5
|
|
cvtps2pd xmm6, [edi+8]
|
|
subpd xmm0, xmm3
|
|
subpd xmm2, xmm4
|
|
add edi, edx
|
|
cvtss2sd xmm7, [esi+4*ecx+12]
|
|
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
|
|
mulpd xmm1, xmm7
|
|
add ecx, 4
|
|
mulpd xmm6, xmm7
|
|
cmp ecx, eax
|
|
subpd xmm0, xmm1
|
|
subpd xmm2, xmm6
|
|
jl process4x4
|
|
imul ecx, edx
|
|
sub edi, ecx
|
|
neg eax
|
|
done4x4: // process left over of the 4 rows
|
|
add eax, m
|
|
sub eax, 4
|
|
movapd xmm1, xmm0
|
|
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
|
|
movapd xmm3, xmm2
|
|
shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
|
|
sub edi, edx
|
|
cvtsd2ss xmm7, xmm3
|
|
movss [esi-4], xmm7 // xptr[-1] = s3
|
|
movsd xmm4, xmm3
|
|
movsd xmm5, xmm3
|
|
cvtss2sd xmm7, [edi+8]
|
|
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
|
|
subsd xmm2, xmm3
|
|
cvtsd2ss xmm7, xmm2
|
|
movss [esi-8], xmm7 // xptr[-2] = s2
|
|
movsd xmm6, xmm2
|
|
sub edi, edx
|
|
subsd xmm0, xmm5
|
|
subsd xmm1, xmm4
|
|
cvtss2sd xmm7, [edi+4]
|
|
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
|
|
subsd xmm1, xmm2
|
|
cvtsd2ss xmm7, xmm1
|
|
movss [esi-12], xmm7 // xptr[-3] = s1
|
|
subsd xmm0, xmm6
|
|
sub edi, edx
|
|
cmp eax, 4
|
|
cvtss2sd xmm7, [edi]
|
|
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
|
|
subsd xmm0, xmm1
|
|
cvtsd2ss xmm7, xmm0
|
|
movss [esi-16], xmm7 // xptr[-4] = s0
|
|
jl done4rows
|
|
sub edi, edx
|
|
sub edi, 16
|
|
sub esi, 16
|
|
jmp process4rows
|
|
done4rows:
|
|
pop ebx
|
|
}
|
|
}
|
|
|
|
// process left over rows
|
|
for ( i = (m&3)-1; i >= 0; i-- ) {
|
|
s0 = b[i];
|
|
lptr = L[i+1] + i;
|
|
for ( j = i + 1; j < m; j++ ) {
|
|
s0 -= lptr[0] * x[j];
|
|
lptr += nc;
|
|
}
|
|
x[i] = s0;
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE2::MixedSoundToSamples
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
|
|
|
|
assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
|
|
|
|
__asm {
|
|
|
|
mov eax, numSamples
|
|
mov edi, mixBuffer
|
|
mov esi, samples
|
|
shl eax, 2
|
|
add edi, eax
|
|
neg eax
|
|
|
|
loop16:
|
|
|
|
movaps xmm0, [edi+eax+0*16]
|
|
movaps xmm1, [edi+eax+1*16]
|
|
movaps xmm2, [edi+eax+2*16]
|
|
movaps xmm3, [edi+eax+3*16]
|
|
|
|
add esi, 4*4*2
|
|
|
|
cvtps2dq xmm4, xmm0
|
|
cvtps2dq xmm5, xmm1
|
|
cvtps2dq xmm6, xmm2
|
|
cvtps2dq xmm7, xmm3
|
|
|
|
prefetchnta [edi+eax+128]
|
|
|
|
packssdw xmm4, xmm5
|
|
packssdw xmm6, xmm7
|
|
|
|
add eax, 4*16
|
|
|
|
movlps [esi-4*4*2], xmm4 // FIXME: should not use movlps/movhps to move integer data
|
|
movhps [esi-3*4*2], xmm4
|
|
movlps [esi-2*4*2], xmm6
|
|
movhps [esi-1*4*2], xmm6
|
|
|
|
jl loop16
|
|
}
|
|
}
|
|
|
|
#endif /* _WIN32 */
|