/* =========================================================================== Doom 3 GPL Source Code Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company. This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code"). Doom 3 Source Code is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Doom 3 Source Code is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Doom 3 Source Code. If not, see . In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below. If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA. =========================================================================== */ #include "../precompiled.h" #pragma hdrstop #include "Simd_Generic.h" #include "Simd_MMX.h" #include "Simd_SSE.h" #include "Simd_SSE2.h" //=============================================================== // // SSE2 implementation of idSIMDProcessor // //=============================================================== #if defined(MACOS_X) && defined(__i386__) #include #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 )) #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) /* ============ idSIMD_SSE2::GetName ============ */ const char * idSIMD_SSE2::GetName( void ) const { return "MMX & SSE & SSE2"; } /* ============ idSIMD_SSE::CmpLT dst[i] |= ( src0[i] < constant ) << bitNum; ============ */ void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { int i, cnt, pre, post; float *aligned; __m128 xmm0, xmm1; __m128i xmm0i; int cnt_l; char *src0_p; char *constant_p; char *dst_p; int mask_l; int dst_l; /* if the float array is not aligned on a 4 byte boundary */ if ( ((int) src0) & 3 ) { /* unaligned memory access */ pre = 0; cnt = count >> 2; post = count - (cnt<<2); /* __asm mov edx, cnt __asm test edx, edx __asm je doneCmp */ cnt_l = cnt; if(cnt_l != 0) { /* __asm push ebx __asm neg edx __asm mov esi, src0 __asm prefetchnta [esi+64] __asm movss xmm1, constant __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) __asm mov edi, dst __asm mov cl, bitNum */ cnt_l = -cnt_l; src0_p = (char *) src0; _mm_prefetch(src0_p+64, _MM_HINT_NTA); constant_p = (char *) &constant; xmm1 = _mm_load_ss((float *)constant_p); xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )); dst_p = (char *)dst; /* __asm loopNA: */ do { /* __asm movups xmm0, [esi] __asm prefetchnta [esi+128] __asm cmpltps xmm0, xmm1 __asm movmskps eax, xmm0 \ __asm mov ah, al __asm shr ah, 1 __asm mov bx, ax __asm shl ebx, 14 __asm mov bx, ax __asm and ebx, 0x01010101 __asm shl ebx, cl __asm or ebx, dword ptr [edi] __asm mov dword ptr [edi], ebx __asm add esi, 16 __asm add edi, 4 __asm inc edx __asm jl loopNA __asm pop ebx */ xmm0 = _mm_loadu_ps((float *) src0_p); _mm_prefetch(src0_p+128, _MM_HINT_NTA); xmm0 = _mm_cmplt_ps(xmm0, xmm1); // Simplify using SSE2 xmm0i = (__m128i) xmm0; xmm0i = _mm_packs_epi32(xmm0i, xmm0i); xmm0i = _mm_packs_epi16(xmm0i, xmm0i); mask_l = _mm_cvtsi128_si32(xmm0i); // End mask_l = mask_l & 0x01010101; mask_l = mask_l << bitNum; dst_l = *((int *) dst_p); mask_l = mask_l | dst_l; *((int *) dst_p) = mask_l; src0_p = src0_p + 16; dst_p = dst_p + 4; cnt_l = cnt_l + 1; } while (cnt_l < 0); } } else { /* aligned memory access */ aligned = (float *) ((((int) src0) + 15) & ~15); if ( (int)aligned > ((int)src0) + count ) { pre = count; post = 0; } else { pre = aligned - src0; cnt = (count - pre) >> 2; post = count - pre - (cnt<<2); /* __asm mov edx, cnt __asm test edx, edx __asm je doneCmp */ cnt_l = cnt; if(cnt_l != 0) { /* __asm push ebx __asm neg edx __asm mov esi, aligned __asm prefetchnta [esi+64] __asm movss xmm1, constant __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) __asm mov edi, dst __asm add edi, pre __asm mov cl, bitNum */ cnt_l = -cnt_l; src0_p = (char *) src0; _mm_prefetch(src0_p+64, _MM_HINT_NTA); constant_p = (char *) &constant; xmm1 = _mm_load_ss((float *)constant_p); xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )); dst_p = (char *)dst; dst_p = dst_p + pre; /* __asm loopA: */ do { /* __asm movaps xmm0, [esi] __asm prefetchnta [esi+128] __asm cmpltps xmm0, xmm1 __asm movmskps eax, xmm0 \ __asm mov ah, al __asm shr ah, 1 __asm mov bx, ax __asm shl ebx, 14 __asm mov bx, ax __asm and ebx, 0x01010101 __asm shl ebx, cl __asm or ebx, dword ptr [edi] __asm mov dword ptr [edi], ebx __asm add esi, 16 __asm add edi, 4 __asm inc edx __asm jl loopA __asm pop ebx */ xmm0 = _mm_load_ps((float *) src0_p); _mm_prefetch(src0_p+128, _MM_HINT_NTA); xmm0 = _mm_cmplt_ps(xmm0, xmm1); // Simplify using SSE2 xmm0i = (__m128i) xmm0; xmm0i = _mm_packs_epi32(xmm0i, xmm0i); xmm0i = _mm_packs_epi16(xmm0i, xmm0i); mask_l = _mm_cvtsi128_si32(xmm0i); // End mask_l = mask_l & 0x01010101; mask_l = mask_l << bitNum; dst_l = *((int *) dst_p); mask_l = mask_l | dst_l; *((int *) dst_p) = mask_l; src0_p = src0_p + 16; dst_p = dst_p + 4; cnt_l = cnt_l + 1; } while (cnt_l < 0); } } } /* doneCmp: */ float c = constant; for ( i = 0; i < pre; i++ ) { dst[i] |= ( src0[i] < c ) << bitNum; } for ( i = count - post; i < count; i++ ) { dst[i] |= ( src0[i] < c ) << bitNum; } } #elif defined(_WIN32) #include #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 )) #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 )) #define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 )) #define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 )) #define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT } #define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 } #define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT } ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 ); ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 ); ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 ); ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) ); ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) ); ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) ); ALIGN4_INIT1( float SIMD_SP_zero, 0.0f ); ALIGN4_INIT1( float SIMD_SP_one, 1.0f ); ALIGN4_INIT1( float SIMD_SP_two, 2.0f ); ALIGN4_INIT1( float SIMD_SP_three, 3.0f ); ALIGN4_INIT1( float SIMD_SP_four, 4.0f ); ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) ); ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f ); ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI ); ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI ); ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI ); ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI ); ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY ); /* ============ idSIMD_SSE2::GetName ============ */ const char * idSIMD_SSE2::GetName( void ) const { return "MMX & SSE & SSE2"; } #if 0 // the SSE2 code is ungodly slow /* ============ idSIMD_SSE2::MatX_LowerTriangularSolve solves x in Lx = b for the n * n sub-matrix of L if skip > 0 the first skip elements of x are assumed to be valid already L has to be a lower triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) { int nc; const float *lptr; if ( skip >= n ) { return; } lptr = L[skip]; nc = L.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { #define NSKIP( n, s ) ((n<<3)|(s&7)) switch( NSKIP( n, skip ) ) { case NSKIP( 1, 0 ): x[0] = b[0]; return; case NSKIP( 2, 0 ): x[0] = b[0]; case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; return; case NSKIP( 3, 0 ): x[0] = b[0]; case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; return; case NSKIP( 4, 0 ): x[0] = b[0]; case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; return; case NSKIP( 5, 0 ): x[0] = b[0]; case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; return; case NSKIP( 6, 0 ): x[0] = b[0]; case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; return; case NSKIP( 7, 0 ): x[0] = b[0]; case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5]; return; } return; } // process first 4 rows switch( skip ) { case 0: x[0] = b[0]; case 1: x[1] = b[1] - lptr[1*nc+0] * x[0]; case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; skip = 4; } lptr = L[skip]; __asm { push ebx mov eax, skip // eax = i shl eax, 2 // eax = i*4 mov edx, n // edx = n shl edx, 2 // edx = n*4 mov esi, x // esi = x mov edi, lptr // edi = lptr add esi, eax add edi, eax mov ebx, b // ebx = b // aligned looprow: mov ecx, eax neg ecx cvtps2pd xmm0, [esi+ecx] cvtps2pd xmm2, [edi+ecx] mulpd xmm0, xmm2 cvtps2pd xmm1, [esi+ecx+8] cvtps2pd xmm3, [edi+ecx+8] mulpd xmm1, xmm3 add ecx, 20*4 jg donedot16 dot16: cvtps2pd xmm2, [esi+ecx-(16*4)] cvtps2pd xmm3, [edi+ecx-(16*4)] cvtps2pd xmm4, [esi+ecx-(14*4)] mulpd xmm2, xmm3 cvtps2pd xmm5, [edi+ecx-(14*4)] addpd xmm0, xmm2 cvtps2pd xmm2, [esi+ecx-(12*4)] mulpd xmm4, xmm5 cvtps2pd xmm3, [edi+ecx-(12*4)] addpd xmm1, xmm4 cvtps2pd xmm4, [esi+ecx-(10*4)] mulpd xmm2, xmm3 cvtps2pd xmm5, [edi+ecx-(10*4)] addpd xmm0, xmm2 cvtps2pd xmm2, [esi+ecx-(8*4)] mulpd xmm4, xmm5 cvtps2pd xmm3, [edi+ecx-(8*4)] addpd xmm1, xmm4 cvtps2pd xmm4, [esi+ecx-(6*4)] mulpd xmm2, xmm3 cvtps2pd xmm5, [edi+ecx-(6*4)] addpd xmm0, xmm2 cvtps2pd xmm2, [esi+ecx-(4*4)] mulpd xmm4, xmm5 cvtps2pd xmm3, [edi+ecx-(4*4)] addpd xmm1, xmm4 cvtps2pd xmm4, [esi+ecx-(2*4)] mulpd xmm2, xmm3 cvtps2pd xmm5, [edi+ecx-(2*4)] addpd xmm0, xmm2 add ecx, 16*4 mulpd xmm4, xmm5 addpd xmm1, xmm4 jle dot16 donedot16: sub ecx, 8*4 jg donedot8 dot8: cvtps2pd xmm2, [esi+ecx-(8*4)] cvtps2pd xmm3, [edi+ecx-(8*4)] cvtps2pd xmm7, [esi+ecx-(6*4)] mulpd xmm2, xmm3 cvtps2pd xmm5, [edi+ecx-(6*4)] addpd xmm0, xmm2 cvtps2pd xmm6, [esi+ecx-(4*4)] mulpd xmm7, xmm5 cvtps2pd xmm3, [edi+ecx-(4*4)] addpd xmm1, xmm7 cvtps2pd xmm4, [esi+ecx-(2*4)] mulpd xmm6, xmm3 cvtps2pd xmm7, [edi+ecx-(2*4)] addpd xmm0, xmm6 add ecx, 8*4 mulpd xmm4, xmm7 addpd xmm1, xmm4 donedot8: sub ecx, 4*4 jg donedot4 dot4: cvtps2pd xmm2, [esi+ecx-(4*4)] cvtps2pd xmm3, [edi+ecx-(4*4)] cvtps2pd xmm4, [esi+ecx-(2*4)] mulpd xmm2, xmm3 cvtps2pd xmm5, [edi+ecx-(2*4)] addpd xmm0, xmm2 add ecx, 4*4 mulpd xmm4, xmm5 addpd xmm1, xmm4 donedot4: addpd xmm0, xmm1 movaps xmm1, xmm0 shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 0 ) addsd xmm0, xmm1 sub ecx, 4*4 jz dot0 add ecx, 4 jz dot1 add ecx, 4 jz dot2 //dot3: cvtss2sd xmm1, [esi-(3*4)] cvtss2sd xmm2, [edi-(3*4)] mulsd xmm1, xmm2 addsd xmm0, xmm1 dot2: cvtss2sd xmm3, [esi-(2*4)] cvtss2sd xmm4, [edi-(2*4)] mulsd xmm3, xmm4 addsd xmm0, xmm3 dot1: cvtss2sd xmm5, [esi-(1*4)] cvtss2sd xmm6, [edi-(1*4)] mulsd xmm5, xmm6 addsd xmm0, xmm5 dot0: cvtss2sd xmm1, [ebx+eax] subsd xmm1, xmm0 cvtsd2ss xmm0, xmm1 movss [esi], xmm0 add eax, 4 cmp eax, edx jge done add esi, 4 mov ecx, nc shl ecx, 2 add edi, ecx add edi, 4 jmp looprow // done done: pop ebx } } /* ============ idSIMD_SSE2::MatX_LowerTriangularSolveTranspose solves x in L'x = b for the n * n sub-matrix of L L has to be a lower triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) { int nc; const float *lptr; lptr = L.ToFloatPtr(); nc = L.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { switch( n ) { case 0: return; case 1: x[0] = b[0]; return; case 2: x[1] = b[1]; x[0] = b[0] - lptr[1*nc+0] * x[1]; return; case 3: x[2] = b[2]; x[1] = b[1] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 4: x[3] = b[3]; x[2] = b[2] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 5: x[4] = b[4]; x[3] = b[3] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 6: x[5] = b[5]; x[4] = b[4] - lptr[5*nc+4] * x[5]; x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 7: x[6] = b[6]; x[5] = b[5] - lptr[6*nc+5] * x[6]; x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5]; x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; } return; } int i, j, m; float *xptr; double s0; // if the number of columns is not a multiple of 2 we're screwed for alignment. // however, if the number of columns is a multiple of 2 but the number of to be // processed rows is not a multiple of 2 we can still run 8 byte aligned m = n; if ( m & 1 ) { m--; x[m] = b[m]; lptr = L[m] + m - 4; xptr = x + m; __asm { push ebx mov eax, m // eax = i mov esi, xptr // esi = xptr mov edi, lptr // edi = lptr mov ebx, b // ebx = b mov edx, nc // edx = nc*sizeof(float) shl edx, 2 process4rows_1: cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1] cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3] xor ecx, ecx sub eax, m neg eax jz done4x4_1 process4x4_1: // process 4x4 blocks cvtps2pd xmm3, [edi] cvtps2pd xmm4, [edi+8] add edi, edx cvtss2sd xmm5, [esi+4*ecx+0] shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 ) mulpd xmm3, xmm5 cvtps2pd xmm1, [edi] mulpd xmm4, xmm5 cvtps2pd xmm6, [edi+8] subpd xmm0, xmm3 subpd xmm2, xmm4 add edi, edx cvtss2sd xmm7, [esi+4*ecx+4] shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 ) mulpd xmm1, xmm7 cvtps2pd xmm3, [edi] mulpd xmm6, xmm7 cvtps2pd xmm4, [edi+8] subpd xmm0, xmm1 subpd xmm2, xmm6 add edi, edx cvtss2sd xmm5, [esi+4*ecx+8] shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 ) mulpd xmm3, xmm5 cvtps2pd xmm1, [edi] mulpd xmm4, xmm5 cvtps2pd xmm6, [edi+8] subpd xmm0, xmm3 subpd xmm2, xmm4 add edi, edx cvtss2sd xmm7, [esi+4*ecx+12] shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 ) mulpd xmm1, xmm7 add ecx, 4 mulpd xmm6, xmm7 cmp ecx, eax subpd xmm0, xmm1 subpd xmm2, xmm6 jl process4x4_1 done4x4_1: // process left over of the 4 rows cvtps2pd xmm3, [edi] cvtps2pd xmm4, [edi+8] cvtss2sd xmm5, [esi+4*ecx] shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 ) mulpd xmm3, xmm5 mulpd xmm4, xmm5 subpd xmm0, xmm3 subpd xmm2, xmm4 imul ecx, edx sub edi, ecx neg eax add eax, m sub eax, 4 movapd xmm1, xmm0 shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 ) movapd xmm3, xmm2 shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 ) sub edi, edx cvtsd2ss xmm7, xmm3 movss [esi-4], xmm7 // xptr[-1] = s3 movsd xmm4, xmm3 movsd xmm5, xmm3 cvtss2sd xmm7, [edi+8] mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3 cvtss2sd xmm7, [edi+4] mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3 cvtss2sd xmm7, [edi] mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3 subsd xmm2, xmm3 cvtsd2ss xmm7, xmm2 movss [esi-8], xmm7 // xptr[-2] = s2 movsd xmm6, xmm2 sub edi, edx subsd xmm0, xmm5 subsd xmm1, xmm4 cvtss2sd xmm7, [edi+4] mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2 cvtss2sd xmm7, [edi] mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2 subsd xmm1, xmm2 cvtsd2ss xmm7, xmm1 movss [esi-12], xmm7 // xptr[-3] = s1 subsd xmm0, xmm6 sub edi, edx cmp eax, 4 cvtss2sd xmm7, [edi] mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1 subsd xmm0, xmm1 cvtsd2ss xmm7, xmm0 movss [esi-16], xmm7 // xptr[-4] = s0 jl done4rows_1 sub edi, edx sub edi, 16 sub esi, 16 jmp process4rows_1 done4rows_1: pop ebx } } else { lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4; xptr = x + m; __asm { push ebx mov eax, m // eax = i mov esi, xptr // esi = xptr mov edi, lptr // edi = lptr mov ebx, b // ebx = b mov edx, nc // edx = nc*sizeof(float) shl edx, 2 process4rows: cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1] cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-4], b[i-3] sub eax, m jz done4x4 neg eax xor ecx, ecx process4x4: // process 4x4 blocks cvtps2pd xmm3, [edi] cvtps2pd xmm4, [edi+8] add edi, edx cvtss2sd xmm5, [esi+4*ecx+0] shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 ) mulpd xmm3, xmm5 cvtps2pd xmm1, [edi] mulpd xmm4, xmm5 cvtps2pd xmm6, [edi+8] subpd xmm0, xmm3 subpd xmm2, xmm4 add edi, edx cvtss2sd xmm7, [esi+4*ecx+4] shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 ) mulpd xmm1, xmm7 cvtps2pd xmm3, [edi] mulpd xmm6, xmm7 cvtps2pd xmm4, [edi+8] subpd xmm0, xmm1 subpd xmm2, xmm6 add edi, edx cvtss2sd xmm5, [esi+4*ecx+8] shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 ) mulpd xmm3, xmm5 cvtps2pd xmm1, [edi] mulpd xmm4, xmm5 cvtps2pd xmm6, [edi+8] subpd xmm0, xmm3 subpd xmm2, xmm4 add edi, edx cvtss2sd xmm7, [esi+4*ecx+12] shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 ) mulpd xmm1, xmm7 add ecx, 4 mulpd xmm6, xmm7 cmp ecx, eax subpd xmm0, xmm1 subpd xmm2, xmm6 jl process4x4 imul ecx, edx sub edi, ecx neg eax done4x4: // process left over of the 4 rows add eax, m sub eax, 4 movapd xmm1, xmm0 shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 ) movapd xmm3, xmm2 shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 ) sub edi, edx cvtsd2ss xmm7, xmm3 movss [esi-4], xmm7 // xptr[-1] = s3 movsd xmm4, xmm3 movsd xmm5, xmm3 cvtss2sd xmm7, [edi+8] mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3 cvtss2sd xmm7, [edi+4] mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3 cvtss2sd xmm7, [edi] mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3 subsd xmm2, xmm3 cvtsd2ss xmm7, xmm2 movss [esi-8], xmm7 // xptr[-2] = s2 movsd xmm6, xmm2 sub edi, edx subsd xmm0, xmm5 subsd xmm1, xmm4 cvtss2sd xmm7, [edi+4] mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2 cvtss2sd xmm7, [edi] mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2 subsd xmm1, xmm2 cvtsd2ss xmm7, xmm1 movss [esi-12], xmm7 // xptr[-3] = s1 subsd xmm0, xmm6 sub edi, edx cmp eax, 4 cvtss2sd xmm7, [edi] mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1 subsd xmm0, xmm1 cvtsd2ss xmm7, xmm0 movss [esi-16], xmm7 // xptr[-4] = s0 jl done4rows sub edi, edx sub edi, 16 sub esi, 16 jmp process4rows done4rows: pop ebx } } // process left over rows for ( i = (m&3)-1; i >= 0; i-- ) { s0 = b[i]; lptr = L[i+1] + i; for ( j = i + 1; j < m; j++ ) { s0 -= lptr[0] * x[j]; lptr += nc; } x[i] = s0; } } #endif /* ============ idSIMD_SSE2::MixedSoundToSamples ============ */ void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) { assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 ); __asm { mov eax, numSamples mov edi, mixBuffer mov esi, samples shl eax, 2 add edi, eax neg eax loop16: movaps xmm0, [edi+eax+0*16] movaps xmm1, [edi+eax+1*16] movaps xmm2, [edi+eax+2*16] movaps xmm3, [edi+eax+3*16] add esi, 4*4*2 cvtps2dq xmm4, xmm0 cvtps2dq xmm5, xmm1 cvtps2dq xmm6, xmm2 cvtps2dq xmm7, xmm3 prefetchnta [edi+eax+128] packssdw xmm4, xmm5 packssdw xmm6, xmm7 add eax, 4*16 movlps [esi-4*4*2], xmm4 // FIXME: should not use movlps/movhps to move integer data movhps [esi-3*4*2], xmm4 movlps [esi-2*4*2], xmm6 movhps [esi-1*4*2], xmm6 jl loop16 } } #endif /* _WIN32 */