quadrilateralcowboy/idlib/math/Simd_SSE2.cpp
Ethan Lee 69420a703a Port Linux/macOS to 64-bit, using new CMake build system.
Huge shoutout to dhewm3 and RBDOOM-3-BFG for doing 99% of this work before us!
2020-10-22 13:16:42 -04:00

877 lines
25 KiB
C++

/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 GPL Source Code (?Doom 3 Source Code?).
Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
#include "Simd_SSE2.h"
//===============================================================
//
// SSE2 implementation of idSIMDProcessor
//
//===============================================================
#if defined(MACOS_X) && defined(__i386__)
#include <xmmintrin.h>
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
/*
============
idSIMD_SSE2::GetName
============
*/
const char * idSIMD_SSE2::GetName( void ) const {
return "MMX & SSE & SSE2";
}
/*
============
idSIMD_SSE::CmpLT
dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
int i, cnt, pre, post;
float *aligned;
__m128 xmm0, xmm1;
__m128i xmm0i;
int cnt_l;
char *src0_p;
char *constant_p;
char *dst_p;
int mask_l;
int dst_l;
/* if the float array is not aligned on a 4 byte boundary */
if ( ((int) src0) & 3 ) {
/* unaligned memory access */
pre = 0;
cnt = count >> 2;
post = count - (cnt<<2);
/*
__asm mov edx, cnt
__asm test edx, edx
__asm je doneCmp
*/
cnt_l = cnt;
if(cnt_l != 0) {
/*
__asm push ebx
__asm neg edx
__asm mov esi, src0
__asm prefetchnta [esi+64]
__asm movss xmm1, constant
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
__asm mov edi, dst
__asm mov cl, bitNum
*/
cnt_l = -cnt_l;
src0_p = (char *) src0;
_mm_prefetch(src0_p+64, _MM_HINT_NTA);
constant_p = (char *) &constant;
xmm1 = _mm_load_ss((float *)constant_p);
xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
dst_p = (char *)dst;
/*
__asm loopNA:
*/
do {
/*
__asm movups xmm0, [esi]
__asm prefetchnta [esi+128]
__asm cmpltps xmm0, xmm1
__asm movmskps eax, xmm0 \
__asm mov ah, al
__asm shr ah, 1
__asm mov bx, ax
__asm shl ebx, 14
__asm mov bx, ax
__asm and ebx, 0x01010101
__asm shl ebx, cl
__asm or ebx, dword ptr [edi]
__asm mov dword ptr [edi], ebx
__asm add esi, 16
__asm add edi, 4
__asm inc edx
__asm jl loopNA
__asm pop ebx
*/
xmm0 = _mm_loadu_ps((float *) src0_p);
_mm_prefetch(src0_p+128, _MM_HINT_NTA);
xmm0 = _mm_cmplt_ps(xmm0, xmm1);
// Simplify using SSE2
xmm0i = (__m128i) xmm0;
xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
mask_l = _mm_cvtsi128_si32(xmm0i);
// End
mask_l = mask_l & 0x01010101;
mask_l = mask_l << bitNum;
dst_l = *((int *) dst_p);
mask_l = mask_l | dst_l;
*((int *) dst_p) = mask_l;
src0_p = src0_p + 16;
dst_p = dst_p + 4;
cnt_l = cnt_l + 1;
} while (cnt_l < 0);
}
}
else {
/* aligned memory access */
aligned = (float *) ((((int) src0) + 15) & ~15);
if ( (int)aligned > ((int)src0) + count ) {
pre = count;
post = 0;
}
else {
pre = aligned - src0;
cnt = (count - pre) >> 2;
post = count - pre - (cnt<<2);
/*
__asm mov edx, cnt
__asm test edx, edx
__asm je doneCmp
*/
cnt_l = cnt;
if(cnt_l != 0) {
/*
__asm push ebx
__asm neg edx
__asm mov esi, aligned
__asm prefetchnta [esi+64]
__asm movss xmm1, constant
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
__asm mov edi, dst
__asm add edi, pre
__asm mov cl, bitNum
*/
cnt_l = -cnt_l;
src0_p = (char *) src0;
_mm_prefetch(src0_p+64, _MM_HINT_NTA);
constant_p = (char *) &constant;
xmm1 = _mm_load_ss((float *)constant_p);
xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
dst_p = (char *)dst;
dst_p = dst_p + pre;
/*
__asm loopA:
*/
do {
/*
__asm movaps xmm0, [esi]
__asm prefetchnta [esi+128]
__asm cmpltps xmm0, xmm1
__asm movmskps eax, xmm0 \
__asm mov ah, al
__asm shr ah, 1
__asm mov bx, ax
__asm shl ebx, 14
__asm mov bx, ax
__asm and ebx, 0x01010101
__asm shl ebx, cl
__asm or ebx, dword ptr [edi]
__asm mov dword ptr [edi], ebx
__asm add esi, 16
__asm add edi, 4
__asm inc edx
__asm jl loopA
__asm pop ebx
*/
xmm0 = _mm_load_ps((float *) src0_p);
_mm_prefetch(src0_p+128, _MM_HINT_NTA);
xmm0 = _mm_cmplt_ps(xmm0, xmm1);
// Simplify using SSE2
xmm0i = (__m128i) xmm0;
xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
mask_l = _mm_cvtsi128_si32(xmm0i);
// End
mask_l = mask_l & 0x01010101;
mask_l = mask_l << bitNum;
dst_l = *((int *) dst_p);
mask_l = mask_l | dst_l;
*((int *) dst_p) = mask_l;
src0_p = src0_p + 16;
dst_p = dst_p + 4;
cnt_l = cnt_l + 1;
} while (cnt_l < 0);
}
}
}
/*
doneCmp:
*/
float c = constant;
for ( i = 0; i < pre; i++ ) {
dst[i] |= ( src0[i] < c ) << bitNum;
}
for ( i = count - post; i < count; i++ ) {
dst[i] |= ( src0[i] < c ) << bitNum;
}
}
#elif defined(_WIN32)
#include <xmmintrin.h>
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
#define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
#define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
#define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) );
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
/*
============
idSIMD_SSE2::GetName
============
*/
const char * idSIMD_SSE2::GetName( void ) const {
return "MMX & SSE & SSE2";
}
#if 0 // the SSE2 code is ungodly slow
/*
============
idSIMD_SSE2::MatX_LowerTriangularSolve
solves x in Lx = b for the n * n sub-matrix of L
if skip > 0 the first skip elements of x are assumed to be valid already
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
int nc;
const float *lptr;
if ( skip >= n ) {
return;
}
lptr = L[skip];
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
#define NSKIP( n, s ) ((n<<3)|(s&7))
switch( NSKIP( n, skip ) ) {
case NSKIP( 1, 0 ): x[0] = b[0];
return;
case NSKIP( 2, 0 ): x[0] = b[0];
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
return;
case NSKIP( 3, 0 ): x[0] = b[0];
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
return;
case NSKIP( 4, 0 ): x[0] = b[0];
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
return;
case NSKIP( 5, 0 ): x[0] = b[0];
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
return;
case NSKIP( 6, 0 ): x[0] = b[0];
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
return;
case NSKIP( 7, 0 ): x[0] = b[0];
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
return;
}
return;
}
// process first 4 rows
switch( skip ) {
case 0: x[0] = b[0];
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
skip = 4;
}
lptr = L[skip];
__asm {
push ebx
mov eax, skip // eax = i
shl eax, 2 // eax = i*4
mov edx, n // edx = n
shl edx, 2 // edx = n*4
mov esi, x // esi = x
mov edi, lptr // edi = lptr
add esi, eax
add edi, eax
mov ebx, b // ebx = b
// aligned
looprow:
mov ecx, eax
neg ecx
cvtps2pd xmm0, [esi+ecx]
cvtps2pd xmm2, [edi+ecx]
mulpd xmm0, xmm2
cvtps2pd xmm1, [esi+ecx+8]
cvtps2pd xmm3, [edi+ecx+8]
mulpd xmm1, xmm3
add ecx, 20*4
jg donedot16
dot16:
cvtps2pd xmm2, [esi+ecx-(16*4)]
cvtps2pd xmm3, [edi+ecx-(16*4)]
cvtps2pd xmm4, [esi+ecx-(14*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(14*4)]
addpd xmm0, xmm2
cvtps2pd xmm2, [esi+ecx-(12*4)]
mulpd xmm4, xmm5
cvtps2pd xmm3, [edi+ecx-(12*4)]
addpd xmm1, xmm4
cvtps2pd xmm4, [esi+ecx-(10*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(10*4)]
addpd xmm0, xmm2
cvtps2pd xmm2, [esi+ecx-(8*4)]
mulpd xmm4, xmm5
cvtps2pd xmm3, [edi+ecx-(8*4)]
addpd xmm1, xmm4
cvtps2pd xmm4, [esi+ecx-(6*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(6*4)]
addpd xmm0, xmm2
cvtps2pd xmm2, [esi+ecx-(4*4)]
mulpd xmm4, xmm5
cvtps2pd xmm3, [edi+ecx-(4*4)]
addpd xmm1, xmm4
cvtps2pd xmm4, [esi+ecx-(2*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(2*4)]
addpd xmm0, xmm2
add ecx, 16*4
mulpd xmm4, xmm5
addpd xmm1, xmm4
jle dot16
donedot16:
sub ecx, 8*4
jg donedot8
dot8:
cvtps2pd xmm2, [esi+ecx-(8*4)]
cvtps2pd xmm3, [edi+ecx-(8*4)]
cvtps2pd xmm7, [esi+ecx-(6*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(6*4)]
addpd xmm0, xmm2
cvtps2pd xmm6, [esi+ecx-(4*4)]
mulpd xmm7, xmm5
cvtps2pd xmm3, [edi+ecx-(4*4)]
addpd xmm1, xmm7
cvtps2pd xmm4, [esi+ecx-(2*4)]
mulpd xmm6, xmm3
cvtps2pd xmm7, [edi+ecx-(2*4)]
addpd xmm0, xmm6
add ecx, 8*4
mulpd xmm4, xmm7
addpd xmm1, xmm4
donedot8:
sub ecx, 4*4
jg donedot4
dot4:
cvtps2pd xmm2, [esi+ecx-(4*4)]
cvtps2pd xmm3, [edi+ecx-(4*4)]
cvtps2pd xmm4, [esi+ecx-(2*4)]
mulpd xmm2, xmm3
cvtps2pd xmm5, [edi+ecx-(2*4)]
addpd xmm0, xmm2
add ecx, 4*4
mulpd xmm4, xmm5
addpd xmm1, xmm4
donedot4:
addpd xmm0, xmm1
movaps xmm1, xmm0
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
addsd xmm0, xmm1
sub ecx, 4*4
jz dot0
add ecx, 4
jz dot1
add ecx, 4
jz dot2
//dot3:
cvtss2sd xmm1, [esi-(3*4)]
cvtss2sd xmm2, [edi-(3*4)]
mulsd xmm1, xmm2
addsd xmm0, xmm1
dot2:
cvtss2sd xmm3, [esi-(2*4)]
cvtss2sd xmm4, [edi-(2*4)]
mulsd xmm3, xmm4
addsd xmm0, xmm3
dot1:
cvtss2sd xmm5, [esi-(1*4)]
cvtss2sd xmm6, [edi-(1*4)]
mulsd xmm5, xmm6
addsd xmm0, xmm5
dot0:
cvtss2sd xmm1, [ebx+eax]
subsd xmm1, xmm0
cvtsd2ss xmm0, xmm1
movss [esi], xmm0
add eax, 4
cmp eax, edx
jge done
add esi, 4
mov ecx, nc
shl ecx, 2
add edi, ecx
add edi, 4
jmp looprow
// done
done:
pop ebx
}
}
/*
============
idSIMD_SSE2::MatX_LowerTriangularSolveTranspose
solves x in L'x = b for the n * n sub-matrix of L
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
int nc;
const float *lptr;
lptr = L.ToFloatPtr();
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
switch( n ) {
case 0:
return;
case 1:
x[0] = b[0];
return;
case 2:
x[1] = b[1];
x[0] = b[0] - lptr[1*nc+0] * x[1];
return;
case 3:
x[2] = b[2];
x[1] = b[1] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 4:
x[3] = b[3];
x[2] = b[2] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 5:
x[4] = b[4];
x[3] = b[3] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 6:
x[5] = b[5];
x[4] = b[4] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 7:
x[6] = b[6];
x[5] = b[5] - lptr[6*nc+5] * x[6];
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
}
return;
}
int i, j, m;
float *xptr;
double s0;
// if the number of columns is not a multiple of 2 we're screwed for alignment.
// however, if the number of columns is a multiple of 2 but the number of to be
// processed rows is not a multiple of 2 we can still run 8 byte aligned
m = n;
if ( m & 1 ) {
m--;
x[m] = b[m];
lptr = L[m] + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows_1:
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
xor ecx, ecx
sub eax, m
neg eax
jz done4x4_1
process4x4_1: // process 4x4 blocks
cvtps2pd xmm3, [edi]
cvtps2pd xmm4, [edi+8]
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+0]
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+4]
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
mulpd xmm1, xmm7
cvtps2pd xmm3, [edi]
mulpd xmm6, xmm7
cvtps2pd xmm4, [edi+8]
subpd xmm0, xmm1
subpd xmm2, xmm6
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+8]
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+12]
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
mulpd xmm1, xmm7
add ecx, 4
mulpd xmm6, xmm7
cmp ecx, eax
subpd xmm0, xmm1
subpd xmm2, xmm6
jl process4x4_1
done4x4_1: // process left over of the 4 rows
cvtps2pd xmm3, [edi]
cvtps2pd xmm4, [edi+8]
cvtss2sd xmm5, [esi+4*ecx]
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
mulpd xmm3, xmm5
mulpd xmm4, xmm5
subpd xmm0, xmm3
subpd xmm2, xmm4
imul ecx, edx
sub edi, ecx
neg eax
add eax, m
sub eax, 4
movapd xmm1, xmm0
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
movapd xmm3, xmm2
shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
sub edi, edx
cvtsd2ss xmm7, xmm3
movss [esi-4], xmm7 // xptr[-1] = s3
movsd xmm4, xmm3
movsd xmm5, xmm3
cvtss2sd xmm7, [edi+8]
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
cvtss2sd xmm7, [edi+4]
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
cvtss2sd xmm7, [edi]
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
subsd xmm2, xmm3
cvtsd2ss xmm7, xmm2
movss [esi-8], xmm7 // xptr[-2] = s2
movsd xmm6, xmm2
sub edi, edx
subsd xmm0, xmm5
subsd xmm1, xmm4
cvtss2sd xmm7, [edi+4]
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
cvtss2sd xmm7, [edi]
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
subsd xmm1, xmm2
cvtsd2ss xmm7, xmm1
movss [esi-12], xmm7 // xptr[-3] = s1
subsd xmm0, xmm6
sub edi, edx
cmp eax, 4
cvtss2sd xmm7, [edi]
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
subsd xmm0, xmm1
cvtsd2ss xmm7, xmm0
movss [esi-16], xmm7 // xptr[-4] = s0
jl done4rows_1
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows_1
done4rows_1:
pop ebx
}
}
else {
lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows:
cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3]
sub eax, m
jz done4x4
neg eax
xor ecx, ecx
process4x4: // process 4x4 blocks
cvtps2pd xmm3, [edi]
cvtps2pd xmm4, [edi+8]
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+0]
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+4]
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
mulpd xmm1, xmm7
cvtps2pd xmm3, [edi]
mulpd xmm6, xmm7
cvtps2pd xmm4, [edi+8]
subpd xmm0, xmm1
subpd xmm2, xmm6
add edi, edx
cvtss2sd xmm5, [esi+4*ecx+8]
shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
mulpd xmm3, xmm5
cvtps2pd xmm1, [edi]
mulpd xmm4, xmm5
cvtps2pd xmm6, [edi+8]
subpd xmm0, xmm3
subpd xmm2, xmm4
add edi, edx
cvtss2sd xmm7, [esi+4*ecx+12]
shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
mulpd xmm1, xmm7
add ecx, 4
mulpd xmm6, xmm7
cmp ecx, eax
subpd xmm0, xmm1
subpd xmm2, xmm6
jl process4x4
imul ecx, edx
sub edi, ecx
neg eax
done4x4: // process left over of the 4 rows
add eax, m
sub eax, 4
movapd xmm1, xmm0
shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
movapd xmm3, xmm2
shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
sub edi, edx
cvtsd2ss xmm7, xmm3
movss [esi-4], xmm7 // xptr[-1] = s3
movsd xmm4, xmm3
movsd xmm5, xmm3
cvtss2sd xmm7, [edi+8]
mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
cvtss2sd xmm7, [edi+4]
mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
cvtss2sd xmm7, [edi]
mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
subsd xmm2, xmm3
cvtsd2ss xmm7, xmm2
movss [esi-8], xmm7 // xptr[-2] = s2
movsd xmm6, xmm2
sub edi, edx
subsd xmm0, xmm5
subsd xmm1, xmm4
cvtss2sd xmm7, [edi+4]
mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
cvtss2sd xmm7, [edi]
mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
subsd xmm1, xmm2
cvtsd2ss xmm7, xmm1
movss [esi-12], xmm7 // xptr[-3] = s1
subsd xmm0, xmm6
sub edi, edx
cmp eax, 4
cvtss2sd xmm7, [edi]
mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
subsd xmm0, xmm1
cvtsd2ss xmm7, xmm0
movss [esi-16], xmm7 // xptr[-4] = s0
jl done4rows
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows
done4rows:
pop ebx
}
}
// process left over rows
for ( i = (m&3)-1; i >= 0; i-- ) {
s0 = b[i];
lptr = L[i+1] + i;
for ( j = i + 1; j < m; j++ ) {
s0 -= lptr[0] * x[j];
lptr += nc;
}
x[i] = s0;
}
}
#endif
/*
============
idSIMD_SSE2::MixedSoundToSamples
============
*/
void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
__asm {
mov eax, numSamples
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add edi, eax
neg eax
loop16:
movaps xmm0, [edi+eax+0*16]
movaps xmm1, [edi+eax+1*16]
movaps xmm2, [edi+eax+2*16]
movaps xmm3, [edi+eax+3*16]
add esi, 4*4*2
cvtps2dq xmm4, xmm0
cvtps2dq xmm5, xmm1
cvtps2dq xmm6, xmm2
cvtps2dq xmm7, xmm3
prefetchnta [edi+eax+128]
packssdw xmm4, xmm5
packssdw xmm6, xmm7
add eax, 4*16
movlps [esi-4*4*2], xmm4 // FIXME: should not use movlps/movhps to move integer data
movhps [esi-3*4*2], xmm4
movlps [esi-2*4*2], xmm6
movhps [esi-1*4*2], xmm6
jl loop16
}
}
#endif /* _WIN32 */