/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms.
You should have received a copy of these additional terms immediately following
the terms and conditions of the GNU General Public License which accompanied the
Doom 3 Source Code.  If not, please request a copy in writing from id Software
at the address below.

If you have questions concerning this license or the applicable additional
terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc.,
Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/

#include "sys/platform.h"
#include "idlib/geometry/DrawVert.h"

#include "idlib/math/Simd_SSE.h"

//===============================================================
//                                                        M
//  SSE implementation of idSIMDProcessor                MrE
//                                                        E
//===============================================================

#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			(0*4)
#define DRAWVERT_ST_OFFSET			(3*4)
#define DRAWVERT_NORMAL_OFFSET		(5*4)
#define DRAWVERT_TANGENT0_OFFSET	(8*4)
#define DRAWVERT_TANGENT1_OFFSET	(11*4)
#define DRAWVERT_COLOR_OFFSET		(14*4)

#if defined(__GNUC__) && defined(__SSE__)

#include <xmmintrin.h>

#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))

/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName( void ) const {
	return "MMX & SSE";
}

/*
============
idSIMD_SSE::Dot

  dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	/*
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
	*/
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;	// Declare 8 xmm registers.
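	/*
		For reference, the scalar equivalent of this routine (a readability
		sketch only; the intrinsics below are the actual implementation):

			for ( int i = 0; i < count; i++ ) {
				dst[i] = constant[0] * src[i].xyz[0]
						+ constant[1] * src[i].xyz[1]
						+ constant[2] * src[i].xyz[2]
						+ constant[3];
			}

		The SSE path handles four verts per iteration: it gathers the x, y and
		z components of four idDrawVerts into three registers and evaluates
		all four plane distances with three mulps and three addps.
	*/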
	int count_l4 = count;					// count_l4 = eax
	int count_l1 = count;					// count_l1 = edx
	char *constant_p = (char *)&constant;	// constant_p = edi
	char *src_p = (char *) src;				// src_p = esi
	char *dst_p = (char *) dst;				// dst_p = ecx

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET );

	/*
		and			eax, ~3
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
	*/
	count_l4 = count_l4 & ~3;
	xmm4 = _mm_load_ss((float *) (constant_p));
	xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm5 = _mm_load_ss((float *) (constant_p + 4));
	xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm6 = _mm_load_ss((float *) (constant_p + 8));
	xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm7 = _mm_load_ss((float *) (constant_p + 12));
	xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));

	/*
		jz			startVert1
	*/
	if(count_l4 != 0) {
		/*
			imul		eax, DRAWVERT_SIZE
			add			esi, eax
			neg			eax
		*/
		count_l4 = count_l4 * DRAWVERT_SIZE;
		src_p = src_p + count_l4;
		count_l4 = -count_l4;
		/*
			loopVert4:
		*/
		do {
			/*
				movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 3,  X,  X,  X
				movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 2,  X,  X,  X
				movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 3,  X,  0,  1
				movaps		xmm1, xmm0												// 3,  X,  0,  1
			*/
			xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));			// 3,  X,  X,  X
			xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));			// 2,  X,  X,  X
			xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));	// 3,  X,  0,  1
			xmm1 = xmm0;																					// 3,  X,  0,  1
			/*
				movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 4,  5,  0,  1
				shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					// 2,  X,  4,  5
			*/
			xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4));	// 4,  5,  0,  1
			xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ));									// 2,  X,  4,  5
			/*
				movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 9,  X,  X,  X
				movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// 9,  X,  6,  7
				shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					// 0,  3,  6,  9
			*/
			xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));			// 9,  X,  X,  X
			xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0));	// 9,  X,  6,  7
			xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ));									// 0,  3,  6,  9
			/*
				movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
				shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					// 1,  4,  7, 10
			*/
			xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4));		// 10, 11,  6,  7
			xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ));									// 1,  4,  7, 10
			/*
				movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
				shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					// 2,  5,  8, 11
			*/
			xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8));		// 10, 11,  8,  X
			xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ));									// 2,  5,  8, 11
			/*
				add			ecx, 16
				add			eax, 4*DRAWVERT_SIZE
			*/
			dst_p = dst_p + 16;
			count_l4 = count_l4 + 4*DRAWVERT_SIZE;
			/*
				mulps		xmm0, xmm4
				mulps		xmm1, xmm5
				mulps		xmm2, xmm6
				addps		xmm0, xmm7
				addps		xmm0, xmm1
				addps
xmm0, xmm2 */ xmm0 = _mm_mul_ps(xmm0, xmm4); xmm1 = _mm_mul_ps(xmm1, xmm5); xmm2 = _mm_mul_ps(xmm2, xmm6); xmm0 = _mm_add_ps(xmm0, xmm7); xmm0 = _mm_add_ps(xmm0, xmm1); xmm0 = _mm_add_ps(xmm0, xmm2); /* movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loopVert4 */ _mm_storel_pi((__m64 *) (dst_p-16+0), xmm0); _mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0); } while(count_l4 < 0); } /* startVert1: and edx, 3 jz done */ count_l1 = count_l1 & 3; if(count_l1 != 0) { /* loopVert1: movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 addss xmm0, xmm7 add ecx, 4 addss xmm0, xmm1 add eax, DRAWVERT_SIZE addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loopVert1 */ do { xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0)); xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4)); xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8)); xmm0 = _mm_mul_ss(xmm0, xmm4); xmm1 = _mm_mul_ss(xmm1, xmm5); xmm2 = _mm_mul_ss(xmm2, xmm6); xmm0 = _mm_add_ss(xmm0, xmm7); dst_p = dst_p + 4; xmm0 = _mm_add_ss(xmm0, xmm1); count_l4 = count_l4 + DRAWVERT_SIZE; xmm0 = _mm_add_ss(xmm0, xmm2); count_l1 = count_l1 - 1; _mm_store_ss((float *) (dst_p-4), xmm0); } while( count_l1 != 0); } /* done: */ } /* ============ idSIMD_SSE::MinMax ============ */ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET ); __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; char *indexes_p; char *src_p; int count_l; int edx; char *min_p; char *max_p; /* movss xmm0, idMath::INFINITY xorps xmm1, xmm1 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) subps xmm1, xmm0 movaps xmm2, xmm0 movaps xmm3, xmm1 */ xmm0 = _mm_load_ss(&idMath::INFINITY); // To satisfy the compiler use xmm0 instead. 
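	/*
		Note on the loop below: two independent min chains (xmm0, xmm2) and two
		max chains (xmm1, xmm3) are seeded with +INF and -INF and updated in
		alternation, then merged after the loop. The two chains keep the vector
		components in different lane orders, which the final shuffles reconcile
		before storing. A scalar sketch of the same result (reference only):

			min.Set(  idMath::INFINITY,  idMath::INFINITY,  idMath::INFINITY );
			max.Set( -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY );
			for ( int i = 0; i < count; i++ ) {
				const idVec3 &v = src[indexes[i]].xyz;
				for ( int j = 0; j < 3; j++ ) {
					if ( v[j] < min[j] ) { min[j] = v[j]; }
					if ( v[j] > max[j] ) { max[j] = v[j]; }
				}
			}
	*/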
xmm1 = _mm_xor_ps(xmm0, xmm0); xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )); xmm1 = _mm_sub_ps(xmm1, xmm0); xmm2 = xmm0; xmm3 = xmm1; /* mov edi, indexes mov esi, src mov eax, count and eax, ~3 jz done4 */ indexes_p = (char *) indexes; src_p = (char *) src; count_l = count; count_l = count_l & ~3; if(count_l != 0) { /* shl eax, 2 add edi, eax neg eax */ count_l = count_l << 2; indexes_p = indexes_p + count_l; count_l = -count_l; /* loop4: // prefetchnta [edi+128] // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] */ do { /* mov edx, [edi+eax+0] imul edx, DRAWVERT_SIZE movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm4 maxps xmm1, xmm4 */ edx = *((int*)(indexes_p+count_l+0)); edx = edx * DRAWVERT_SIZE; xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8)); xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) ); xmm0 = _mm_min_ps(xmm0, xmm4); xmm1 = _mm_max_ps(xmm1, xmm4); /* mov edx, [edi+eax+4] imul edx, DRAWVERT_SIZE movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0] movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4] minps xmm2, xmm5 maxps xmm3, xmm5 */ edx = *((int*)(indexes_p+count_l+4)); edx = edx * DRAWVERT_SIZE; xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0)); xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) ); xmm2 = _mm_min_ps(xmm2, xmm5); xmm3 = _mm_max_ps(xmm3, xmm5); /* mov edx, [edi+eax+8] imul edx, DRAWVERT_SIZE movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8] movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm6 maxps xmm1, xmm6 */ edx = *((int*)(indexes_p+count_l+8)); edx = edx * DRAWVERT_SIZE; xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8)); xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) ); xmm0 = _mm_min_ps(xmm0, xmm6); xmm1 = _mm_max_ps(xmm1, xmm6); /* mov edx, [edi+eax+12] imul edx, DRAWVERT_SIZE movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0] movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4] minps xmm2, xmm7 maxps xmm3, xmm7 */ edx = *((int*)(indexes_p+count_l+12)); edx = edx * DRAWVERT_SIZE; xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0)); xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) ); xmm2 = _mm_min_ps(xmm2, xmm7); xmm3 = _mm_max_ps(xmm3, xmm7); /* add eax, 4*4 jl loop4 */ count_l = count_l + 4*4; } while (count_l < 0); } /* done4: mov eax, count and eax, 3 jz done1 */ count_l = count; count_l = count_l & 3; if(count_l != 0) { /* shl eax, 2 add edi, eax neg eax */ count_l = count_l << 2; indexes_p = indexes_p + count_l; count_l = -count_l; /* loop1: */ do{ /* mov edx, [edi+eax+0] imul edx, DRAWVERT_SIZE; movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm4 maxps xmm1, xmm4 */ edx = *((int*)(indexes_p+count_l+0)); edx = edx * DRAWVERT_SIZE; xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8)); xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) ); xmm0 = _mm_min_ps(xmm0, xmm4); xmm1 = _mm_max_ps(xmm1, xmm4); /* add eax, 4 jl loop1 */ count_l = count_l + 4; } while (count_l < 0); } /* done1: shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) minps xmm0, xmm2 maxps xmm1, xmm3 mov esi, min movhps [esi], xmm0 movss [esi+8], xmm0 mov edi, max movhps [edi], xmm1 movss [edi+8], xmm1 */ xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )); xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )); xmm0 = _mm_min_ps(xmm0, xmm2); xmm1 = 
_mm_max_ps(xmm1, xmm3);
	min_p = (char *) &min;
	_mm_storeh_pi((__m64 *)(min_p), xmm0);
	_mm_store_ss((float *)(min_p+8), xmm0);
	max_p = (char *) &max;
	_mm_storeh_pi((__m64 *)(max_p), xmm1);
	_mm_store_ss((float *)(max_p+8), xmm1);
}

/*
============
idSIMD_SSE::Dot

  dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	int count_l4;
	int count_l1;
	char *constant_p;
	char *src_p;
	char *dst_p;
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	// DG: GCC and clang warn about xmm1-4 maybe being used uninitialized below.
	//     according to https://stackoverflow.com/a/18749079 the initialization
	//     code is generated anyway, so make it explicit to shut up the warning
	xmm1 = _mm_setzero_ps();
	xmm2 = _mm_setzero_ps();
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();

	/*
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3
	*/
	count_l4 = count;
	constant_p = (char *) &constant;
	count_l1 = count_l4;
	src_p = (char *) src;
	dst_p = (char *) dst;
	count_l4 = count_l4 & ~3;

	/*
		movss		xmm5, [edi+0]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+4]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+8]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
	*/
	xmm5 = _mm_load_ss((float *) (constant_p+0));
	xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm6 = _mm_load_ss((float *) (constant_p+4));
	xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
	xmm7 = _mm_load_ss((float *) (constant_p+8));
	xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));

	/*
		jz			startVert1
	*/
	if (count_l4 != 0) {
		/*
			imul		eax, 16
			add			esi, eax
			neg			eax
		*/
		count_l4 = count_l4 * 16;
		src_p = src_p + count_l4;
		count_l4 = -count_l4;
		/*
			loopVert4:
		*/
		do {
			/*
				movlps		xmm1, [esi+eax+ 0]
				movlps		xmm3, [esi+eax+ 8]
				movhps		xmm1, [esi+eax+16]
				movhps		xmm3, [esi+eax+24]
				movlps		xmm2, [esi+eax+32]
				movlps		xmm4, [esi+eax+40]
				movhps		xmm2, [esi+eax+48]
				movhps		xmm4, [esi+eax+56]
				movaps		xmm0, xmm1
				shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
				shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
				movaps		xmm2, xmm3
				shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
				shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
			*/
			xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
			xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
			xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
			xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
			xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
			xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
			xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
			xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
			xmm0 = xmm1;
			xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
			xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
			xmm2 = xmm3;
			xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
			xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
			/*
				add			ecx, 16
				add			eax, 4*16
			*/
			dst_p = dst_p + 16;
			count_l4 = count_l4 + 4*16;
			/*
				mulps		xmm0, xmm5
				mulps		xmm1, xmm6
				mulps		xmm2, xmm7
				addps		xmm0, xmm3
				addps		xmm0, xmm1
				addps		xmm0, xmm2
			*/
			xmm0 = _mm_mul_ps(xmm0, xmm5);
			xmm1 = _mm_mul_ps(xmm1, xmm6);
			xmm2 = _mm_mul_ps(xmm2, xmm7);
			xmm0 = _mm_add_ps(xmm0, xmm3);
			xmm0 = _mm_add_ps(xmm0, xmm1);
			xmm0 = _mm_add_ps(xmm0, xmm2);
			/*
				movlps		[ecx-16+0], xmm0
				movhps		[ecx-16+8], xmm0
				jl			loopVert4
			*/
			_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
			_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
		} while (count_l4 < 0);
	}

	/*
		startVert1:
		and			edx,
3
		jz			done
	*/
	count_l1 = count_l1 & 3;
	if(count_l1 != 0) {
		/*
			loopVert1:
		*/
		do {
			/*
				movss		xmm0, [esi+eax+0]
				movss		xmm1, [esi+eax+4]
				movss		xmm2, [esi+eax+8]
				mulss		xmm0, xmm5
				mulss		xmm1, xmm6
				mulss		xmm2, xmm7
				addss		xmm0, [esi+eax+12]
				add			ecx, 4
				addss		xmm0, xmm1
				add			eax, 16
				addss		xmm0, xmm2
				dec			edx
				movss		[ecx-4], xmm0
				jnz			loopVert1
			*/
			xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
			xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
			xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
			xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
			xmm0 = _mm_mul_ss(xmm0, xmm5);
			xmm1 = _mm_mul_ss(xmm1, xmm6);
			xmm2 = _mm_mul_ss(xmm2, xmm7);
			xmm0 = _mm_add_ss(xmm0, xmm3);
			dst_p = dst_p + 4;
			xmm0 = _mm_add_ss(xmm0, xmm1);
			count_l4 = count_l4 + 16;
			xmm0 = _mm_add_ss(xmm0, xmm2);
			count_l1 = count_l1 - 1;
			_mm_store_ss((float *) (dst_p-4), xmm0);
		} while (count_l1 != 0);
	}
	/*
		done:
	*/
}

#elif defined(_MSC_VER) && defined(_M_IX86)

#include <xmmintrin.h>

#include "idlib/geometry/JointTransform.h"
#include "idlib/math/Vector.h"
#include "idlib/math/Matrix.h"
#include "idlib/math/Quat.h"
#include "idlib/math/Plane.h"
#include "renderer/Model.h"

#define SHUFFLEPS( x, y, z, w )		(( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w )	(( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))

// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
	__asm	movaps		reg4, reg2								/* reg4 =  8,  9, 10, 11 */ \
	__asm	unpcklps	reg2, reg3								/* reg2 =  8, 12,  9, 13 */ \
	__asm	unpckhps	reg4, reg3								/* reg4 = 10, 14, 11, 15 */ \
	__asm	movaps		reg3, reg0								/* reg3 =  0,  1,  2,  3 */ \
	__asm	unpcklps	reg0, reg1								/* reg0 =  0,  4,  1,  5 */ \
	__asm	unpckhps	reg3, reg1								/* reg3 =  2,  6,  3,  7 */ \
	__asm	movaps		reg1, reg0								/* reg1 =  0,  4,  1,  5 */ \
	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg0 =  0,  4,  8, 12 */ \
	__asm	shufps		reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg1 =  1,  5,  9, 13 */ \
	__asm	movaps		reg2, reg3								/* reg2 =  2,  6,  3,  7 */ \
	__asm	shufps		reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 )	/* reg2 =  2,  6, 10, 14 */ \
	__asm	shufps		reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 )	/* reg3 =  3,  7, 11, 15 */

// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
	__asm	movlps		reg1, [address+ 0]						/* reg1 =  0,  1,  X,  X */ \
	__asm	movlps		reg3, [address+ 8]						/* reg3 =  2,  3,  X,  X */ \
	__asm	movhps		reg1, [address+16]						/* reg1 =  0,  1,  4,  5 */ \
	__asm	movhps		reg3, [address+24]						/* reg3 =  2,  3,  6,  7 */ \
	__asm	movlps		reg2, [address+32]						/* reg2 =  8,  9,  X,  X */ \
	__asm	movlps		reg4, [address+40]						/* reg4 = 10, 11,  X,  X */ \
	__asm	movhps		reg2, [address+48]						/* reg2 =  8,  9, 12, 13 */ \
	__asm	movhps		reg4, [address+56]						/* reg4 = 10, 11, 14, 15 */ \
	__asm	movaps		reg0, reg1								/* reg0 =  0,  1,  4,  5 */ \
	__asm	shufps		reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 )	/* reg0 =  0,  4,  8, 12 */ \
	__asm	shufps		reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 )	/* reg1 =  1,  5,  9, 13 */ \
	__asm	movaps		reg2, reg3								/* reg2 =  2,  3,  6,  7 */ \
	__asm	shufps		reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 )	/* reg2 =  2,  6, 10, 14 */ \
	__asm	shufps		reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 )	/* reg3 =  3,  7, 11, 15 */

// transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
	__asm	movaps		reg4, reg0								/* reg4 =  0,  4,  8, 12 */ \
	__asm	unpcklps	reg0, reg1								/* reg0 =  0,  1,  4,  5 */ \
	__asm	unpckhps	reg4, reg1								/* reg4 =  8,  9, 12, 13 */ \
	__asm	movaps		reg1, \
reg2 /* reg1 = 2, 6, 10, 14 */ \ __asm unpcklps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \ __asm unpckhps reg1, reg3 /* reg1 = 10, 11, 14, 15 */ \ __asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \ __asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \ __asm movhps [address+16], reg0 /* mem1 = 4, 5, X, X */ \ __asm movhps [address+24], reg2 /* mem1 = 4, 5, 6, 7 */ \ __asm movlps [address+32], reg4 /* mem2 = 8, 9, X, X */ \ __asm movlps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ \ __asm movhps [address+48], reg4 /* mem3 = 12, 13, X, X */ \ __asm movhps [address+56], reg1 /* mem3 = 12, 13, 14, 15 */ // transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary) #define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \ __asm movaps reg3, reg2 /* reg3 = 8, 9, 10, 11 */ \ __asm shufps reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg3 = 10, 11, 4, 5 */ \ __asm shufps reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 ) /* reg2 = 8, 9, 2, 3 */ \ __asm shufps reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg1 = 6, 7, 0, 1 */ \ __asm movaps reg0, reg1 /* reg0 = 6, 7, 0, 1 */ \ __asm shufps reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg0 = 0, 6, 3, 9 */ \ __asm shufps reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 ) /* reg1 = 1, 7, 4, 10 */ \ __asm shufps reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg2 = 2, 8, 5, 11 */ // transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary) #define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \ __asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \ __asm movlps reg2, [address+ 8] /* reg2 = 2, 3, X, X */ \ __asm movlps reg3, [address+16] /* reg3 = 4, 5, X, X */ \ __asm movhps reg1, [address+24] /* reg1 = 0, 1, 6, 7 */ \ __asm movhps reg2, [address+32] /* reg2 = 2, 3, 8, 9 */ \ __asm movhps reg3, [address+40] /* reg3 = 4, 5, 10, 11 */ \ __asm movaps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \ __asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg0 = 0, 6, 3, 9 */ \ __asm shufps reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 ) /* reg1 = 1, 7, 4, 10 */ \ __asm shufps reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg2 = 2, 8, 5, 11 */ // transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary) #define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \ __asm movhlps reg3, reg0 /* reg3 = 3, 9, X, X */ \ __asm unpcklps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \ __asm unpckhps reg1, reg2 /* reg1 = 4, 5, 10, 11 */ \ __asm unpcklps reg2, reg3 /* reg2 = 2, 3, 8, 9 */ \ __asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \ __asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \ __asm movlps [address+16], reg1 /* mem1 = 4, 5, X, X */ \ __asm movhps [address+24], reg0 /* mem1 = 4, 5, 6, 7 */ \ __asm movhps [address+32], reg2 /* mem2 = 8, 9, X, X */ \ __asm movhps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ // with alignment #define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST ) #define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\ __asm mov ecx,DST \ __asm shr ecx,2 \ __asm mov ebx,COUNT \ __asm neg ecx \ __asm mov edx,SRC0 \ __asm and ecx,3 \ __asm mov esi,SRC1 \ __asm sub ebx,ecx \ __asm jge noUnderFlow \ __asm xor ebx,ebx \ __asm mov ecx,COUNT \ __asm noUnderFlow: \ __asm mov PRE,ecx \ __asm mov eax,ebx \ __asm mov edi,DST \ __asm and eax,8-1 \ __asm mov POST,eax \ __asm and ebx,0xfffffff8 \ __asm jle done \ __asm shl ebx,2 \ __asm 
lea ecx,[ecx*4+ebx] \ __asm neg ebx \ __asm add edx,ecx \ __asm add esi,ecx \ __asm add edi,ecx \ __asm mov eax,edx \ __asm or eax,esi // without alignment (pre==0) #define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST ) #define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\ __asm mov eax,COUNT \ __asm mov PRE,0 \ __asm and eax,8-1 \ __asm mov ebx,COUNT \ __asm mov POST,eax \ __asm and ebx,0xfffffff8 \ __asm je done \ __asm shl ebx,2 \ __asm mov edx,SRC0 \ __asm mov esi,SRC1 \ __asm mov edi,DST \ __asm add edx,ebx \ __asm add esi,ebx \ __asm add edi,ebx \ __asm mov eax,edx \ __asm or eax,esi \ __asm or eax,edi \ __asm neg ebx \ /* when OPER is called: edx = s0 esi = s1 edi = d ebx = index*4 xmm0 & xmm1 must not be trashed */ #define KMOVDS1( DST, SRC0 ) \ __asm movss xmm2,SRC0 \ __asm movss DST,xmm2 #define KMOVDS4( DST, SRC0 ) \ __asm movups xmm2,SRC0 \ __asm movups DST,xmm2 #define KMINDS1( DST, SRC0 ) \ __asm movss xmm2,SRC0 \ __asm minss DST,xmm2 #define KMAXDS1( DST, SRC0 ) \ __asm movss xmm2,SRC0 \ __asm maxss DST,xmm2 // general ALU operation #define KALUDSS1( OP, DST, SRC0, SRC1 ) \ __asm movss xmm2,SRC0 \ __asm OP##ss xmm2,SRC1 \ __asm movss DST,xmm2 #define KALUDSS4( OP, DST, SRC0, SRC1 ) \ __asm movups xmm2,SRC0 \ __asm movups xmm3,SRC1 \ __asm OP##ps xmm2,xmm3 \ __asm movups DST,xmm2 #define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST,SRC0,SRC1 ) #define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST,SRC0,SRC1 ) #define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST,SRC0,SRC1 ) #define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST,SRC0,SRC1 ) #define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST,SRC0,SRC1 ) #define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST,SRC0,SRC1 ) #define KDIVDSS1( DST, SRC0, SRC1 ) \ __asm movss xmm2,SRC1 \ __asm rcpss xmm3,xmm2 \ __asm mulss xmm2,xmm3 \ __asm mulss xmm2,xmm3 \ __asm addss xmm3,xmm3 \ __asm subss xmm3,xmm2 \ __asm mulss xmm3,SRC0 \ __asm movss DST,xmm3 #define KDIVDSS4( DST, SRC0, SRC1 ) \ __asm movups xmm2,SRC1 \ __asm rcpps xmm3,xmm2 \ __asm mulps xmm2,xmm3 \ __asm mulps xmm2,xmm3 \ __asm addps xmm3,xmm3 \ __asm subps xmm3,xmm2 \ __asm movups xmm2,SRC0 \ __asm mulps xmm3,xmm2 \ __asm movups DST,xmm3 #define KF2IDS1( SRC0 ) \ __asm movss xmm2,SRC0 \ __asm cvttps2pi mm2,xmm2 \ __asm movd [edi+ebx],mm2 #define KF2IDS4( SRC0 ) \ __asm movups xmm2,SRC0 \ __asm cvttps2pi mm2,xmm2 \ __asm movq [edi+ebx+0],mm2 \ __asm shufps xmm2,xmm2,SHUFFLEPS(1,0,3,2) \ __asm cvttps2pi mm2,xmm2 \ __asm movq [edi+ebx+8],mm2 #define KISQRTDS1( DST,SRC0 ) \ __asm movss xmm2,SRC0 \ __asm rsqrtss xmm3,xmm2 \ __asm mulss xmm2,xmm3 \ __asm mulss xmm2,xmm3 \ __asm subss xmm2,xmm1 \ __asm mulss xmm3,xmm0 \ __asm mulss xmm3,xmm2 \ __asm movss DST,xmm3 #define KISQRTDS4( DST,SRC0 ) \ __asm movups xmm2,SRC0 \ __asm rsqrtps xmm3,xmm2 \ __asm mulps xmm2,xmm3 \ __asm mulps xmm2,xmm3 \ __asm subps xmm2,xmm1 \ __asm mulps xmm3,xmm0 \ __asm mulps xmm3,xmm2 \ __asm movups DST,xmm3 // this is used in vector4 implementation to shift constant V4 #define KANDREGDSV( DST, SRC0, VALUE ) \ __asm mov DST,SRC0 \ __asm and DST,VALUE // this is used in vector4 code to operate with float arrays as sources #define KEXPANDFLOAT( DST, SRC ) \ __asm movss DST,SRC \ __asm shufps DST,DST,0 #define KADDDS1( DST,SRC ) KADDDSS1( DST,DST,SRC ) 
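// Note on the KDIVDSS kernels above: rcpss/rcpps only give an ~12-bit
// approximation r0 of 1/x, so one Newton-Raphson step
//
//     r1 = r0 * ( 2 - x * r0 )  ==  2*r0 - x*r0*r0
//
// (the mul/mul/add/sub sequence) roughly doubles the precision before the
// numerator is multiplied in. A scalar sketch of KDIVDSS1, for reference:
//
//     float r = rcp_approx( x );     // rcpss
//     r = 2.0f * r - x * r * r;      // one refinement step
//     dst = src0 * r;                // ~= src0 / x
//
// The KISQRTDS kernels refine rsqrtss the same way, computing
// 0.5 * r0 * ( 3 - x*r0*r0 ) with the constants 3 and -0.5 expected in
// xmm1 and xmm0 (hence the "xmm0 & xmm1 must not be trashed" rule above).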
#define KADDDS4( DST,SRC ) KADDDSS4( DST,DST,SRC ) #define KSUBDS1( DST,SRC ) KSUBDSS1( DST,DST,SRC ) #define KSUBDS4( DST,SRC ) KSUBDSS4( DST,DST,SRC ) #define KMULDS1( DST,SRC ) KMULDSS1( DST,DST,SRC ) #define KMULDS4( DST,SRC ) KMULDSS4( DST,DST,SRC ) #define KDIVDS1( DST,SRC ) KDIVDSS1( DST,DST,SRC ) #define KDIVDS4( DST,SRC ) KDIVDSS4( DST,DST,SRC ) // handles pre & post leftovers #define KFLOATOPER( OPER, OPER4, COUNT ) \ __asm mov ecx,pre \ __asm mov ebx,COUNT \ __asm cmp ebx,ecx \ __asm cmovl ecx,COUNT \ __asm test ecx,ecx \ __asm je preDone \ __asm xor ebx,ebx \ __asm lpPre: \ OPER \ __asm add ebx,4 \ __asm dec ecx \ __asm jg lpPre \ __asm preDone: \ __asm mov ecx,post \ __asm mov ebx,COUNT \ __asm sub ebx,ecx \ __asm shl ebx,2 \ __asm cmp ecx,4 \ __asm jl post4Done \ OPER4 \ __asm sub ecx,4 \ __asm add ebx,4*4 \ __asm post4Done: \ __asm test ecx,ecx \ __asm je postDone \ __asm lpPost: \ OPER \ __asm add ebx,4 \ __asm dec ecx \ __asm jg lpPost \ __asm postDone: // operate on a constant and a float array #define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \ int pre,post; \ __asm movss xmm0,CONSTANT \ __asm shufps xmm0,xmm0,0 \ KFLOATINITDS( DST, SRC, COUNT, pre, post ) \ __asm and eax,15 \ __asm jne lpNA \ __asm jmp lpA \ __asm align 16 \ __asm lpA: \ __asm prefetchnta [edx+ebx+64] \ __asm movaps xmm1,xmm0 \ __asm movaps xmm2,xmm0 \ __asm ALUOP##ps xmm1,[edx+ebx] \ __asm ALUOP##ps xmm2,[edx+ebx+16] \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpA \ __asm jmp done \ __asm align 16 \ __asm lpNA: \ __asm prefetchnta [edx+ebx+64] \ __asm movaps xmm1,xmm0 \ __asm movaps xmm2,xmm0 \ __asm movups xmm3,[edx+ebx] \ __asm movups xmm4,[edx+ebx+16] \ __asm ALUOP##ps xmm1,xmm3 \ __asm ALUOP##ps xmm2,xmm4 \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpNA \ __asm done: \ __asm mov edx,SRC \ __asm mov edi,DST \ __asm KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \ __asm KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT ) // operate on two float arrays #define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \ int pre,post; \ KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \ __asm and eax,15 \ __asm jne lpNA \ __asm jmp lpA \ __asm align 16 \ __asm lpA: \ __asm movaps xmm1,[edx+ebx] \ __asm movaps xmm2,[edx+ebx+16] \ __asm ALUOP##ps xmm1,[esi+ebx] \ __asm ALUOP##ps xmm2,[esi+ebx+16] \ __asm prefetchnta [edx+ebx+64] \ __asm prefetchnta [esi+ebx+64] \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpA \ __asm jmp done \ __asm align 16 \ __asm lpNA: \ __asm movups xmm1,[edx+ebx] \ __asm movups xmm2,[edx+ebx+16] \ __asm movups xmm3,[esi+ebx] \ __asm movups xmm4,[esi+ebx+16] \ __asm prefetchnta [edx+ebx+64] \ __asm prefetchnta [esi+ebx+64] \ __asm ALUOP##ps xmm1,xmm3 \ __asm ALUOP##ps xmm2,xmm4 \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpNA \ __asm done: \ __asm mov edx,SRC0 \ __asm mov esi,SRC1 \ __asm mov edi,DST \ KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \ KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT ) #define DRAWVERT_SIZE 60 #define JOINTQUAT_SIZE (7*4) #define JOINTMAT_SIZE (4*3*4) #define JOINTWEIGHT_SIZE (4*4) #define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT } #define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 } #define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, 
INIT, INIT, INIT, INIT } ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 ); ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 ); ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) ); ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) ); ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) ); ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) ); ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 ); ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) ); ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) ); ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) ); ALIGN4_INIT1( unsigned int SIMD_SP_not, 0xFFFFFFFF ); ALIGN4_INIT1( float SIMD_SP_zero, 0.0f ); ALIGN4_INIT1( float SIMD_SP_half, 0.5f ); ALIGN4_INIT1( float SIMD_SP_one, 1.0f ); ALIGN4_INIT1( float SIMD_SP_two, 2.0f ); ALIGN4_INIT1( float SIMD_SP_three, 3.0f ); ALIGN4_INIT1( float SIMD_SP_four, 4.0f ); ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) ); ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f ); ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI ); ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI ); ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI ); ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI ); ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY ); ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f ); ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f ); ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f ); ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f ); ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f ); ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f ); ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f ); ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f ); ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f ); ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f ); ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f ); ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f ); ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f ); ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f ); ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f ); ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f ); ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f ); ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f ); ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f ); ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f ); ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f ); ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f ); /* ============ SSE_InvSqrt ============ */ float SSE_InvSqrt( float x ) { float y; __asm { movss xmm0, x rsqrtss xmm1, xmm0 mulss xmm0, xmm1 mulss xmm0, xmm1 subss xmm0, SIMD_SP_rsqrt_c0 mulss xmm1, SIMD_SP_rsqrt_c1 mulss xmm0, xmm1 movss y, xmm0 } return y; } /* ============ SSE_InvSqrt4 ============ */ void SSE_InvSqrt4( float x[4] ) { __asm { mov edi, x movaps xmm0, [edi] rsqrtps xmm1, xmm0 mulps xmm0, xmm1 mulps xmm0, xmm1 subps xmm0, SIMD_SP_rsqrt_c0 mulps xmm1, SIMD_SP_rsqrt_c1 mulps xmm0, xmm1 movaps [edi], xmm0 } } /* ============ SSE_SinZeroHalfPI The angle must be between zero and half PI. 
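  Implementation note: with s = a*a, both branches evaluate the same odd
  minimax polynomial in Horner form,

      sin(a) ~= a * ( 1 + s*( c4 + s*( c3 + s*( c2 + s*( c1 + s*c0 ) ) ) ) )

  where c0..c4 are the SIMD_SP_sin_c0..SIMD_SP_sin_c4 constants above.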
============ */ float SSE_SinZeroHalfPI( float a ) { #if 1 float t; assert( a >= 0.0f && a <= idMath::HALF_PI ); __asm { movss xmm0, a movss xmm1, xmm0 mulss xmm1, xmm1 movss xmm2, SIMD_SP_sin_c0 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c1 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c2 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c3 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c4 mulss xmm2, xmm1 addss xmm2, SIMD_SP_one mulss xmm2, xmm0 movss t, xmm2 } return t; #else float s, t; assert( a >= 0.0f && a <= idMath::HALF_PI ); s = a * a; t = -2.39e-08f; t *= s; t += 2.7526e-06f; t *= s; t += -1.98409e-04f; t *= s; t += 8.3333315e-03f; t *= s; t += -1.666666664e-01f; t *= s; t += 1.0f; t *= a; return t; #endif } /* ============ SSE_Sin4ZeroHalfPI The angle must be between zero and half PI. ============ */ void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) { __asm { mov edi, a mov esi, s movaps xmm0, [edi] movaps xmm1, xmm0 mulps xmm1, xmm1 movaps xmm2, SIMD_SP_sin_c0 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c1 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c2 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c3 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c4 mulps xmm2, xmm1 addps xmm2, SIMD_SP_one mulps xmm2, xmm0 movaps [esi], xmm2 } } /* ============ SSE_Sin ============ */ float SSE_Sin( float a ) { #if 1 float t; __asm { movss xmm1, a movss xmm2, xmm1 movss xmm3, xmm1 mulss xmm2, SIMD_SP_oneOverTwoPI cvttss2si ecx, xmm2 cmpltss xmm3, SIMD_SP_zero andps xmm3, SIMD_SP_one cvtsi2ss xmm2, ecx subss xmm2, xmm3 mulss xmm2, SIMD_SP_twoPI subss xmm1, xmm2 movss xmm0, SIMD_SP_PI // xmm0 = PI subss xmm0, xmm1 // xmm0 = PI - a movss xmm1, xmm0 // xmm1 = PI - a andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a ) movss xmm2, xmm0 // xmm2 = PI - a xorps xmm2, xmm1 // xmm2 = fabs( PI - a ) cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000 movss xmm3, SIMD_SP_PI // xmm3 = PI xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a ) andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f xorps xmm0, xmm2 addps xmm0, xmm3 movss xmm1, xmm0 mulss xmm1, xmm1 movss xmm2, SIMD_SP_sin_c0 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c1 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c2 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c3 mulss xmm2, xmm1 addss xmm2, SIMD_SP_sin_c4 mulss xmm2, xmm1 addss xmm2, SIMD_SP_one mulss xmm2, xmm0 movss t, xmm2 } return t; #else float s, t; if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) { a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI; } a = idMath::PI - a; if ( fabs( a ) >= idMath::HALF_PI ) { a = ( ( a < 0.0f ) ? 
-idMath::PI : idMath::PI ) - a; } s = a * a; t = -2.39e-08f; t *= s; t += 2.7526e-06f; t *= s; t += -1.98409e-04f; t *= s; t += 8.3333315e-03f; t *= s; t += -1.666666664e-01f; t *= s; t += 1.0f; t *= a; return t; #endif } /* ============ SSE_Sin4 ============ */ void SSE_Sin4( float a[4], float s[4] ) { __asm { mov edi, a mov esi, s movaps xmm1, [edi] movaps xmm2, xmm1 mulps xmm2, SIMD_SP_oneOverTwoPI movhlps xmm3, xmm2 cvttss2si ecx, xmm2 cvtsi2ss xmm2, ecx cvttss2si edx, xmm3 cvtsi2ss xmm3, edx shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 ) shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 ) cvttss2si ecx, xmm2 cvtsi2ss xmm2, ecx cvttss2si edx, xmm3 cvtsi2ss xmm3, edx shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 ) movaps xmm3, xmm1 cmpltps xmm3, SIMD_SP_zero andps xmm3, SIMD_SP_one subps xmm2, xmm3 mulps xmm2, SIMD_SP_twoPI subps xmm1, xmm2 movaps xmm0, SIMD_SP_PI // xmm0 = PI subps xmm0, xmm1 // xmm0 = PI - a movaps xmm1, xmm0 // xmm1 = PI - a andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a ) movaps xmm2, xmm0 // xmm2 = PI - a xorps xmm2, xmm1 // xmm2 = fabs( PI - a ) cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000 movaps xmm3, SIMD_SP_PI // xmm3 = PI xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a ) andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f xorps xmm0, xmm2 addps xmm0, xmm3 movaps xmm1, xmm0 mulps xmm1, xmm1 movaps xmm2, SIMD_SP_sin_c0 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c1 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c2 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c3 mulps xmm2, xmm1 addps xmm2, SIMD_SP_sin_c4 mulps xmm2, xmm1 addps xmm2, SIMD_SP_one mulps xmm2, xmm0 movaps [esi], xmm2 } } /* ============ SSE_CosZeroHalfPI The angle must be between zero and half PI. ============ */ float SSE_CosZeroHalfPI( float a ) { #if 1 float t; assert( a >= 0.0f && a <= idMath::HALF_PI ); __asm { movss xmm0, a mulss xmm0, xmm0 movss xmm1, SIMD_SP_cos_c0 mulss xmm1, xmm0 addss xmm1, SIMD_SP_cos_c1 mulss xmm1, xmm0 addss xmm1, SIMD_SP_cos_c2 mulss xmm1, xmm0 addss xmm1, SIMD_SP_cos_c3 mulss xmm1, xmm0 addss xmm1, SIMD_SP_cos_c4 mulss xmm1, xmm0 addss xmm1, SIMD_SP_one movss t, xmm1 } return t; #else float s, t; assert( a >= 0.0f && a <= idMath::HALF_PI ); s = a * a; t = -2.605e-07f; t *= s; t += 2.47609e-05f; t *= s; t += -1.3888397e-03f; t *= s; t += 4.16666418e-02f; t *= s; t += -4.999999963e-01f; t *= s; t += 1.0f; return t; #endif } /* ============ SSE_Cos4ZeroHalfPI The angle must be between zero and half PI. 
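  Implementation note: same scheme as the sine routines, but with the even
  cosine polynomial, s = a*a:

      cos(a) ~= 1 + s*( c4 + s*( c3 + s*( c2 + s*( c1 + s*c0 ) ) ) )

  using SIMD_SP_cos_c0..SIMD_SP_cos_c4 (c4 ~= -0.5, matching the leading
  Taylor terms 1 - a*a/2 + ...).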
============
*/
void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
	__asm {
		mov			edi, a
		mov			esi, c
		movaps		xmm0, [edi]
		mulps		xmm0, xmm0
		movaps		xmm1, SIMD_SP_cos_c0
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c1
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c2
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c3
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c4
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_one
		movaps		[esi], xmm1		// the polynomial is accumulated in xmm1
	}
}

/*
============
SSE_Cos
============
*/
float SSE_Cos( float a ) {
#if 1
	float t;

	__asm {
		movss		xmm1, a
		movss		xmm2, xmm1
		movss		xmm3, xmm1
		mulss		xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2
		cmpltss		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		cvtsi2ss	xmm2, ecx
		subss		xmm2, xmm3
		mulss		xmm2, SIMD_SP_twoPI
		subss		xmm1, xmm2

		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
		subss		xmm0, xmm1					// xmm0 = PI - a
		movss		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movss		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		mulss		xmm0, xmm0
		movss		xmm1, SIMD_SP_cos_c0
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c1
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c2
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c3
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c4
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_one
		xorps		xmm2, SIMD_SP_signBitMask
		xorps		xmm1, xmm2
		movss		t, xmm1
	}
	return t;
#else
	float s, t, d;

	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
	}
	a = idMath::PI - a;
	if ( fabs( a ) >= idMath::HALF_PI ) {
		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
		d = 1.0f;
	} else {
		d = -1.0f;
	}
	s = a * a;
	t = -2.605e-07f;
	t *= s;
	t += 2.47609e-05f;
	t *= s;
	t += -1.3888397e-03f;
	t *= s;
	t += 4.16666418e-02f;
	t *= s;
	t += -4.999999963e-01f;
	t *= s;
	t += 1.0f;
	t *= d;
	return t;
#endif
}

/*
============
SSE_Cos4
============
*/
void SSE_Cos4( float a[4], float c[4] ) {
	__asm {
		mov			edi, a
		mov			esi, c
		movaps		xmm1, [edi]
		movaps		xmm2, xmm1
		mulps		xmm2, SIMD_SP_oneOverTwoPI
		movhlps		xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
		movaps		xmm3, xmm1
		cmpltps		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		subps		xmm2, xmm3
		mulps		xmm2, SIMD_SP_twoPI
		subps		xmm1, xmm2

		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
		subps		xmm0, xmm1					// xmm0 = PI - a
		movaps		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movaps		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		mulps		xmm0, xmm0
		movaps		xmm1, SIMD_SP_cos_c0
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c1
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c2
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c3
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c4
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_one
		xorps		xmm2, SIMD_SP_signBitMask
		xorps		xmm1, xmm2
		movaps		[esi], xmm1
	}
}

/*
============
SSE_SinCos
============
*/
void SSE_SinCos( float a, float &s, float &c ) {
	__asm {
		mov			edi, s
		mov			esi, c
		movss		xmm1, a
		movss		xmm2, xmm1
		movss		xmm3, xmm1
		mulss		xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2
		cmpltss		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		cvtsi2ss	xmm2, ecx
		subss		xmm2, xmm3
		mulss		xmm2, SIMD_SP_twoPI
		subss		xmm1, xmm2

		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
		subss		xmm0, xmm1					// xmm0 = PI - a
		movss		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movss		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		movss		xmm1, xmm0
		mulss		xmm1, xmm1
		movss		xmm3, SIMD_SP_sin_c0
		movss		xmm4, SIMD_SP_cos_c0
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c1
		addss		xmm4, SIMD_SP_cos_c1
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c2
		addss		xmm4, SIMD_SP_cos_c2
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c3
		addss		xmm4, SIMD_SP_cos_c3
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c4
		addss		xmm4, SIMD_SP_cos_c4
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_one
		addss		xmm4, SIMD_SP_one
		mulss		xmm3, xmm0					// xmm3 = sine
		xorps		xmm2, SIMD_SP_signBitMask
		xorps		xmm4, xmm2					// xmm4 = cosine
		movss		[edi], xmm3
		movss		[esi], xmm4
	}
}

/*
============
SSE_SinCos4
============
*/
void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
	__asm {
		mov			eax, a
		mov			edi, s
		mov			esi, c
		movaps		xmm1, [eax]
		movaps		xmm2, xmm1
		mulps		xmm2, SIMD_SP_oneOverTwoPI
		movhlps		xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
		movaps		xmm3, xmm1
		cmpltps		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		subps		xmm2, xmm3
		mulps		xmm2, SIMD_SP_twoPI
		subps		xmm1, xmm2

		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
		subps		xmm0, xmm1					// xmm0 = PI - a
		movaps		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBitMask	// xmm1 = signbit( PI - a )
		movaps		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBitMask	// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3					// xmm0 = the range-reduced angle

		movaps		xmm1, xmm0
		mulps		xmm1, xmm1
		movaps		xmm3, SIMD_SP_sin_c0
		movaps		xmm4, SIMD_SP_cos_c0
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c1
		addps		xmm4, SIMD_SP_cos_c1
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c2
		addps		xmm4, SIMD_SP_cos_c2
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c3
		addps		xmm4, SIMD_SP_cos_c3
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c4
		addps		xmm4, SIMD_SP_cos_c4
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_one
		addps		xmm4, SIMD_SP_one
		mulps		xmm3, xmm0
		xorps		xmm2, SIMD_SP_signBitMask
		xorps		xmm4, xmm2
		movaps		[edi], xmm3
		movaps		[esi], xmm4
	}
}

/*
============
SSE_ATanPositive

  Both 'x' and 'y' must be positive.
============
*/
float SSE_ATanPositive( float y, float x ) {
#if 1
	float t;

	assert( y >= 0.0f && x >= 0.0f );

	__asm {
		movss		xmm0, x
		movss		xmm3, xmm0
		movss		xmm1, y
		minss		xmm0, xmm1
		maxss		xmm1, xmm3
		cmpeqss		xmm3, xmm0
		rcpss		xmm2, xmm1
		mulss		xmm1, xmm2
		mulss		xmm1, xmm2
		addss		xmm2, xmm2
		subss		xmm2, xmm1					// xmm2 = 1 / y or 1 / x
		mulss		xmm0, xmm2					// xmm0 = x / y or y / x
		movss		xmm1, xmm3
		andps		xmm1, SIMD_SP_signBitMask
		xorps		xmm0, xmm1					// xmm0 = -x / y or y / x
		andps		xmm3, SIMD_SP_halfPI		// xmm3 = HALF_PI or 0.0f
		movss		xmm1, xmm0
		mulss		xmm1, xmm1					// xmm1 = s
		movss		xmm2, SIMD_SP_atan_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c5
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c6
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c7
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0
		addss		xmm2, xmm3
		movss		t, xmm2
	}
	return t;
#else
	float a, d, s, t;

	assert( y >= 0.0f && x >= 0.0f );

	if ( y > x ) {
		a = -x / y;
		d = idMath::HALF_PI;
	} else {
		a = y / x;
		d = 0.0f;
	}
	s = a * a;
	t = 0.0028662257f;
	t *= s;
	t += -0.0161657367f;
	t *= s;
	t += 0.0429096138f;
	t *= s;
	t += -0.0752896400f;
	t *= s;
	t += 0.1065626393f;
	t *= s;
	t += -0.1420889944f;
	t *= s;
	t += 0.1999355085f;
	t *= s;
	t += -0.3333314528f;
	t *= s;
	t += 1.0f;
	t *= a;
	t += d;
	return t;
#endif
}

/*
============
SSE_ATan4Positive

  Both 'x' and 'y' must be positive.
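  Reduction used by all the atan routines: let b = min(x,y) / max(x,y), so b
  lies in [0,1], then evaluate an odd polynomial approximation of atan(b);
  when y > x the identity atan(y/x) = HALF_PI - atan(x/y) is applied, which
  is what the cmpeq mask and the sign-bit juggling select between.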
============ */ void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) { __asm { mov esi, x mov edi, y mov edx, at movaps xmm0, [esi] movaps xmm3, xmm0 movaps xmm1, [edi] minps xmm0, xmm1 maxps xmm1, xmm3 cmpeqps xmm3, xmm0 rcpps xmm2, xmm1 mulps xmm1, xmm2 mulps xmm1, xmm2 addps xmm2, xmm2 subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x mulps xmm0, xmm2 // xmm0 = x / y or y / x movaps xmm1, xmm3 andps xmm1, SIMD_SP_signBitMask xorps xmm0, xmm1 // xmm0 = -x / y or y / x andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f movaps xmm1, xmm0 mulps xmm1, xmm1 // xmm1 = s movaps xmm2, SIMD_SP_atan_c0 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c1 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c2 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c3 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c4 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c5 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c6 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c7 mulps xmm2, xmm1 addps xmm2, SIMD_SP_one mulps xmm2, xmm0 addps xmm2, xmm3 movaps [edx], xmm2 } } /* ============ SSE_ATan ============ */ float SSE_ATan( float y, float x ) { #if 1 float t; __asm { movss xmm0, x movss xmm3, xmm0 movss xmm4, xmm0 andps xmm0, SIMD_SP_absMask movss xmm1, y xorps xmm4, xmm1 andps xmm1, SIMD_SP_absMask andps xmm4, SIMD_SP_signBitMask minss xmm0, xmm1 maxss xmm1, xmm3 cmpeqss xmm3, xmm0 rcpss xmm2, xmm1 mulss xmm1, xmm2 mulss xmm1, xmm2 addss xmm2, xmm2 subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x mulss xmm0, xmm2 // xmm0 = x / y or y / x xorps xmm0, xmm4 movss xmm1, xmm3 andps xmm1, SIMD_SP_signBitMask xorps xmm0, xmm1 // xmm0 = -x / y or y / x orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f movss xmm1, xmm0 mulss xmm1, xmm1 // xmm1 = s movss xmm2, SIMD_SP_atan_c0 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c1 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c2 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c3 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c4 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c5 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c6 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c7 mulss xmm2, xmm1 addss xmm2, SIMD_SP_one mulss xmm2, xmm0 addss xmm2, xmm3 movss t, xmm2 } return t; #else float a, d, s, t; if ( fabs( y ) > fabs( x ) ) { a = -x / y; d = idMath::HALF_PI; *((unsigned int *)&d) ^= ( *((unsigned int *)&x) ^ *((unsigned int *)&y) ) & (1<<31); } else { a = y / x; d = 0.0f; } s = a * a; t = 0.0028662257f; t *= s; t += -0.0161657367f; t *= s; t += 0.0429096138f; t *= s; t += -0.0752896400f; t *= s; t += 0.1065626393f; t *= s; t += -0.1420889944f; t *= s; t += 0.1999355085f; t *= s; t += -0.3333314528f; t *= s; t += 1.0f; t *= a; t += d; return t; #endif } /* ============ SSE_ATan4 ============ */ void SSE_ATan4( float y[4], float x[4], float at[4] ) { __asm { mov esi, x mov edi, y mov edx, at movaps xmm0, [esi] movaps xmm3, xmm0 movaps xmm4, xmm0 andps xmm0, SIMD_SP_absMask movaps xmm1, [edi] xorps xmm4, xmm1 andps xmm1, SIMD_SP_absMask andps xmm4, SIMD_SP_signBitMask minps xmm0, xmm1 maxps xmm1, xmm3 cmpeqps xmm3, xmm0 rcpps xmm2, xmm1 mulps xmm1, xmm2 mulps xmm1, xmm2 addps xmm2, xmm2 subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x mulps xmm0, xmm2 // xmm0 = x / y or y / x xorps xmm0, xmm4 movaps xmm1, xmm3 andps xmm1, SIMD_SP_signBitMask xorps xmm0, xmm1 // xmm0 = -x / y or y / x orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f movaps xmm1, xmm0 mulps xmm1, xmm1 // xmm1 = s movaps xmm2, SIMD_SP_atan_c0 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c1 mulps xmm2, xmm1 addps xmm2, 
SIMD_SP_atan_c2 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c3 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c4 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c5 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c6 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c7 mulps xmm2, xmm1 addps xmm2, SIMD_SP_one mulps xmm2, xmm0 addps xmm2, xmm3 movaps [edx], xmm2 } } /* ============ SSE_TestTrigonometry ============ */ void SSE_TestTrigonometry( void ) { int i; float a, s1, s2, c1, c2; for ( i = 0; i < 100; i++ ) { a = i * idMath::HALF_PI / 100.0f; s1 = sin( a ); s2 = SSE_SinZeroHalfPI( a ); if ( fabs( s1 - s2 ) > 1e-7f ) { assert( 0 ); } c1 = cos( a ); c2 = SSE_CosZeroHalfPI( a ); if ( fabs( c1 - c2 ) > 1e-7f ) { assert( 0 ); } } for ( i = -200; i < 200; i++ ) { a = i * idMath::TWO_PI / 100.0f; s1 = sin( a ); s2 = SSE_Sin( a ); if ( fabs( s1 - s2 ) > 1e-6f ) { assert( 0 ); } c1 = cos( a ); c2 = SSE_Cos( a ); if ( fabs( c1 - c2 ) > 1e-6f ) { assert( 0 ); } SSE_SinCos( a, s2, c2 ); if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) { assert( 0 ); } } } /* ============ idSIMD_SSE::GetName ============ */ const char * idSIMD_SSE::GetName( void ) const { return "MMX & SSE"; } /* ============ idSIMD_SSE::Add dst[i] = constant + src[i]; ============ */ void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) { KFLOAT_CA( add, dst, src, constant, count ) } /* ============ idSIMD_SSE::Add dst[i] = src0[i] + src1[i]; ============ */ void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) { KFLOAT_AA( add, dst, src0, src1, count ) } /* ============ idSIMD_SSE::Sub dst[i] = constant - src[i]; ============ */ void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) { KFLOAT_CA( sub, dst, src, constant, count ) } /* ============ idSIMD_SSE::Sub dst[i] = src0[i] - src1[i]; ============ */ void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) { KFLOAT_AA( sub, dst, src0, src1, count ) } /* ============ idSIMD_SSE::Mul dst[i] = constant * src[i]; ============ */ void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) { KFLOAT_CA( mul, dst, src, constant, count ) } /* ============ idSIMD_SSE::Mul dst[i] = src0[i] * src1[i]; ============ */ void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) { KFLOAT_AA( mul, dst, src0, src1, count ) } /* ============ idSIMD_SSE::Div dst[i] = constant / src[i]; ============ */ void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) { int pre, post; // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x)); __asm { movss xmm1,constant shufps xmm1,xmm1,0 KFLOATINITDS( dst, src, count, pre, post ) and eax,15 jne lpNA jmp lpA align 16 lpA: movaps xmm2,[edx+ebx] movaps xmm3,[edx+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [edx+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 mulps xmm4,xmm1 mulps xmm5,xmm1 movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpA jmp done align 16 lpNA: movups xmm2,[edx+ebx] movups xmm3,[edx+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [edx+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 mulps xmm4,xmm1 mulps xmm5,xmm1 movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpNA 
done: mov edx,src mov edi,dst KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ), KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count ) } } /* ============ idSIMD_SSE::Div dst[i] = src0[i] / src1[i]; ============ */ void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) { int pre,post; // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x)); __asm { KFLOATINITDSS( dst, src0, src1, count, pre, post ) and eax,15 jne lpNA jmp lpA align 16 lpA: movaps xmm2,[esi+ebx] movaps xmm3,[esi+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [esi+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 mulps xmm4,[edx+ebx] mulps xmm5,[edx+ebx+16] movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpA jmp done align 16 lpNA: movups xmm2,[esi+ebx] movups xmm3,[esi+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [esi+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 movups xmm2,[edx+ebx] movups xmm3,[edx+ebx+16] mulps xmm4,xmm2 mulps xmm5,xmm3 movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpNA done: mov edx,src0 mov esi,src1 mov edi,dst KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ), KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count ) } } /* ============ Simd_MulAdd assumes count >= 7 ============ */ static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) { __asm mov esi, dst __asm mov edi, src __asm mov eax, count __asm shl eax, 2 __asm mov ecx, esi __asm mov edx, eax __asm or ecx, edi __asm fld constant __asm and ecx, 15 __asm jz SimdMulAdd16 __asm and ecx, 3 __asm jnz SimdMulAdd8 __asm mov ecx, esi __asm xor ecx, edi __asm and ecx, 15 __asm jnz MulAdd8 __asm mov ecx, esi __asm and ecx, 15 __asm neg ecx __asm add ecx, 16 __asm sub eax, ecx __asm add edi, ecx __asm add esi, ecx __asm neg ecx __asm mov edx, eax __asm loopPreMulAdd16: __asm fld st __asm fmul dword ptr [edi+ecx] __asm fadd dword ptr [esi+ecx] __asm fstp dword ptr [esi+ecx] __asm add ecx, 4 __asm jl loopPreMulAdd16 __asm SimdMulAdd16: __asm and eax, ~15 __asm movss xmm1, constant __asm shufps xmm1, xmm1, 0x00 __asm add esi, eax __asm add edi, eax __asm neg eax __asm align 16 __asm loopMulAdd16: __asm movaps xmm0, [edi+eax] __asm mulps xmm0, xmm1 __asm addps xmm0, [esi+eax] __asm movaps [esi+eax], xmm0 __asm add eax, 16 __asm jl loopMulAdd16 __asm jmp postMulAdd __asm MulAdd8: __asm mov ecx, esi __asm and ecx, 7 __asm jz SimdMulAdd8 __asm sub eax, ecx __asm add esi, ecx __asm add edi, ecx __asm neg ecx __asm mov edx, eax __asm loopPreMulAdd8: __asm fld st __asm fmul dword ptr [edi+ecx] __asm fadd dword ptr [esi+ecx] __asm fstp dword ptr [esi+ecx] __asm add ecx, 4 __asm jl loopPreMulAdd8 __asm SimdMulAdd8: __asm and eax, ~15 __asm movss xmm1, constant __asm shufps xmm1, xmm1, 0x00 __asm add esi, eax __asm add edi, eax __asm neg eax __asm align 16 __asm loopMulAdd8: __asm movlps xmm0, [edi+eax] __asm movhps xmm0, [edi+eax+8] __asm mulps xmm0, xmm1 __asm movlps xmm2, [esi+eax] __asm movhps xmm2, [esi+eax+8] __asm addps xmm0, xmm2 __asm movlps [esi+eax], xmm0 __asm movhps [esi+eax+8], xmm0 __asm add eax, 16 __asm jl loopMulAdd8 __asm jmp postMulAdd __asm postMulAdd: __asm and edx, 15 __asm jz MulAddDone __asm add esi, edx __asm add edi, edx __asm neg edx __asm loopPostMulAdd: __asm fld st __asm fmul dword ptr [edi+edx] __asm fadd dword ptr [esi+edx] __asm fstp dword ptr [esi+edx] 
	__asm add edx, 4
	__asm jl loopPostMulAdd
	__asm MulAddDone:
	__asm fstp st
}

#define MULADD_FEW( OPER ) \
	switch( count ) { \
		case 0: \
			return; \
		case 1: \
			dst[0] OPER c * src[0]; \
			return; \
		case 2: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
			return; \
		case 3: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
			return; \
		case 4: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			return; \
		case 5: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; \
			return; \
		case 6: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
			return; \
		case 7: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
			return; \
		case 8: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			return; \
		case 9: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; \
			return; \
		case 10: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
			return; \
		case 11: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
			return; \
	}

/*
============
idSIMD_SSE::MulAdd

  dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	float c = constant;
	MULADD_FEW( += )
	Simd_MulAdd( dst, constant, src, count );
}

/*
============
idSIMD_SSE::MulAdd

  dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] += src0[i] * src1[i];
	}
}

/*
============
idSIMD_SSE::MulSub

  dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
	float c = constant;
	MULADD_FEW( -= )
	Simd_MulAdd( dst, -constant, src, count );
}

/*
============
idSIMD_SSE::MulSub

  dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] -= src0[i] * src1[i];
	}
}

/*
============
idSIMD_SSE::Dot

  dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	__asm {
		mov eax, count
		mov edi, constant
		mov edx, eax
		mov esi, src
		mov ecx, dst
		and eax, ~3
		movss xmm4, [edi+0]
		shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss xmm5, [edi+4]
		shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss xmm6, [edi+8]
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		jz done4
		imul eax, 12
		add esi, eax
		neg eax
	loop4:
		movlps
xmm1, [esi+eax+ 0] movlps xmm2, [esi+eax+ 8] movlps xmm3, [esi+eax+16] movhps xmm1, [esi+eax+24] movhps xmm2, [esi+eax+32] movhps xmm3, [esi+eax+40] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 ) shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) add ecx, 16 add eax, 4*12 mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 ) movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loop4 done4: and edx, 3 jz done1 loop1: movss xmm0, [esi+eax+0] movss xmm1, [esi+eax+4] movss xmm2, [esi+eax+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 add ecx, 4 addss xmm0, xmm1 add eax, 12 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loop1 done1: } } /* ============ idSIMD_SSE::Dot dst[i] = constant * src[i].Normal() + src[i][3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) { __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm5, [edi+0] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [edi+4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm7, [edi+8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) jz startVert1 imul eax, 16 add esi, eax neg eax loopVert4: movlps xmm1, [esi+eax+ 0] movlps xmm3, [esi+eax+ 8] movhps xmm1, [esi+eax+16] movhps xmm3, [esi+eax+24] movlps xmm2, [esi+eax+32] movlps xmm4, [esi+eax+40] movhps xmm2, [esi+eax+48] movhps xmm4, [esi+eax+56] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) movaps xmm2, xmm3 shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) add ecx, 16 add eax, 4*16 mulps xmm0, xmm5 mulps xmm1, xmm6 mulps xmm2, xmm7 addps xmm0, xmm3 addps xmm0, xmm1 addps xmm0, xmm2 movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loopVert4 startVert1: and edx, 3 jz done loopVert1: movss xmm0, [esi+eax+0] movss xmm1, [esi+eax+4] movss xmm2, [esi+eax+8] mulss xmm0, xmm5 mulss xmm1, xmm6 mulss xmm2, xmm7 addss xmm0, [esi+eax+12] add ecx, 4 addss xmm0, xmm1 add eax, 16 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loopVert1 done: } } /* ============ idSIMD_SSE::Dot dst[i] = constant * src[i].xyz; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); // 0, 1, 2 // 3, 4, 5 // 6, 7, 8 // 9, 10, 11 __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm5, [edi+4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [edi+8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) jz startVert1 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert4: movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1 movaps xmm1, xmm0 // 3, X, 0, 1 movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1 shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5 movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7 shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9 movlps xmm3, 
[esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7 shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10 movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11 add ecx, 16 add eax, 4*DRAWVERT_SIZE mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loopVert4 startVert1: and edx, 3 jz done loopVert1: movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 add ecx, 4 addss xmm0, xmm1 add eax, DRAWVERT_SIZE addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loopVert1 done: } } /* ============ idSIMD_SSE::Dot dst[i] = constant.Normal() * src[i] + constant[3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) { __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm5, [edi+4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [edi+8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm7, [edi+12] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) jz done4 imul eax, 12 add esi, eax neg eax loop4: movlps xmm1, [esi+eax+ 0] movlps xmm2, [esi+eax+ 8] movlps xmm3, [esi+eax+16] movhps xmm1, [esi+eax+24] movhps xmm2, [esi+eax+32] movhps xmm3, [esi+eax+40] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 ) shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) add ecx, 16 add eax, 4*12 mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm7 addps xmm0, xmm1 addps xmm0, xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 ) movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loop4 done4: and edx, 3 jz done1 loop1: movss xmm0, [esi+eax+0] movss xmm1, [esi+eax+4] movss xmm2, [esi+eax+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 addss xmm0, xmm7 add ecx, 4 addss xmm0, xmm1 add eax, 12 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loop1 done1: } } /* ============ idSIMD_SSE::Dot dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) { #define SINGLE_OP(SRC, DEST) \ __asm movlps xmm0,[SRC] \ __asm movlps xmm1,[SRC+8] \ __asm mulps xmm0,xmm4 \ __asm mulps xmm1,xmm5 \ __asm addps xmm0,xmm1 \ __asm movaps xmm1,xmm0 \ __asm shufps xmm1,xmm1,SHUFFLEPS(1,1,1,1) \ __asm addss xmm0,xmm1 \ __asm movss [DEST],xmm0 \ __asm add SRC,16 \ __asm add DEST,4 #define DUAL_OP(SRC, DEST) \ __asm movlps xmm0,[SRC] \ __asm movlps xmm1,[SRC+8] \ __asm movhps xmm0,[SRC+16] \ __asm movhps xmm1,[SRC+24] \ __asm mulps xmm0,xmm4 \ __asm mulps xmm1,xmm5 \ __asm addps xmm0,xmm1 \ __asm shufps xmm1,xmm0,SHUFFLEPS(2,0,1,0) \ __asm shufps xmm0,xmm0,SHUFFLEPS(3,1,2,0) \ __asm addps xmm0,xmm1 \ __asm movhps [DEST],xmm0 \ __asm add SRC,32 \ __asm add DEST,8 __asm { mov edx, dst mov eax, src mov ebx, constant mov ecx, count movlps xmm4, [ebx] shufps xmm4, xmm4, SHUFFLEPS(1,0,1,0) movlps xmm5, [ebx+8] shufps xmm5, xmm5, SHUFFLEPS(1,0,1,0) xorps xmm0, xmm0 xorps xmm1, xmm1 _lpAlignDest: test edx, 0x0f jz _destAligned SINGLE_OP(eax,edx) dec ecx jnz _lpAlignDest jmp _vpExit _destAligned: push ecx cmp ecx, 4 jl _post and ecx, ~3 shl ecx, 2 lea eax, [eax+ecx*4] add edx, 
ecx neg ecx movlps xmm0, [eax+ecx*4] movhps xmm0, [eax+ecx*4+16] movlps xmm2, [eax+ecx*4+32] movhps xmm2, [eax+ecx*4+48] jmp _lpStart align 16 _lp: prefetchnta [eax+ecx*4+128] addps xmm1, xmm0 movlps xmm0, [eax+ecx*4] movhps xmm0, [eax+ecx*4+16] movlps xmm2, [eax+ecx*4+32] movhps xmm2, [eax+ecx*4+48] movaps [edx+ecx-16],xmm1 _lpStart: movlps xmm1, [eax+ecx*4+8] movhps xmm1, [eax+ecx*4+24] movlps xmm3, [eax+ecx*4+40] movhps xmm3, [eax+ecx*4+56] add ecx, 16 mulps xmm1, xmm5 mulps xmm2, xmm4 mulps xmm3, xmm5 addps xmm2, xmm3 // y3+w3 x3+z3 y2+w2 x2+z2 mulps xmm0, xmm4 addps xmm0, xmm1 // y1+w1 x1+z1 y0+w0 x0+z0 movaps xmm1, xmm0 shufps xmm0, xmm2, SHUFFLEPS(2,0,2,0) // x3+z3 x2+z2 x1+z1 x0+z0 shufps xmm1, xmm2, SHUFFLEPS(3,1,3,1) // y3+w3 y2+w2 y1+w1 y0+w0 js _lp addps xmm1, xmm0 movaps [edx+ecx-16], xmm1 _post: pop ecx and ecx, 0x3 cmp ecx, 2 jl _post1 DUAL_OP(eax,edx) sub ecx, 2 _post1: cmp ecx, 1 jne _vpExit SINGLE_OP(eax,edx) _vpExit: } #undef DUAL_OP #undef SINGLE_OP } /* ============ idSIMD_SSE::Dot dst[i] = constant.Normal() * src[i].xyz + constant[3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); // 0, 1, 2 // 3, 4, 5 // 6, 7, 8 // 9, 10, 11 __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm5, [edi+4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [edi+8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm7, [edi+12] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) jz startVert1 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert4: movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1 movaps xmm1, xmm0 // 3, X, 0, 1 movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1 shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5 movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7 shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9 movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7 shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10 movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11 add ecx, 16 add eax, 4*DRAWVERT_SIZE mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm7 addps xmm0, xmm1 addps xmm0, xmm2 movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loopVert4 startVert1: and edx, 3 jz done loopVert1: movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 addss xmm0, xmm7 add ecx, 4 addss xmm0, xmm1 add eax, DRAWVERT_SIZE addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loopVert1 done: } } /* ============ idSIMD_SSE::Dot dst[i] = src0[i] * src1[i]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) { __asm { mov eax, count mov edi, src0 mov edx, eax mov esi, src1 mov ecx, dst and eax, ~3 jz done4 imul eax, 12 add edi, eax add esi, eax neg eax loop4: movlps xmm0, 
[esi+eax] // 0, 1, X, X movlps xmm3, [edi+eax] // 0, 1, X, X movlps xmm1, [esi+eax+8] // 2, 3, X, X movlps xmm4, [edi+eax+8] // 2, 3, X, X movhps xmm0, [esi+eax+24] // 0, 1, 6, 7 movhps xmm3, [edi+eax+24] // 0, 1, 6, 7 movhps xmm1, [esi+eax+32] // 2, 3, 8, 9 movhps xmm4, [edi+eax+32] // 2, 3, 8, 9 movlps xmm2, [esi+eax+16] // 4, 5, X, X movlps xmm5, [edi+eax+16] // 4, 5, X, X movhps xmm2, [esi+eax+40] // 4, 5, 10, 11 movhps xmm5, [edi+eax+40] // 4, 5, 10, 11 add ecx, 16 add eax, 48 mulps xmm0, xmm3 mulps xmm1, xmm4 mulps xmm2, xmm5 movaps xmm7, xmm0 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) // 0, 6, 3, 9 shufps xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 ) // 1, 7, 4, 10 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) // 2, 8, 5, 11 addps xmm7, xmm0 addps xmm7, xmm1 shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 ) movlps [ecx-16+0], xmm7 movhps [ecx-16+8], xmm7 jl loop4 done4: and edx, 3 jz done1 loop1: movss xmm0, [esi+eax+0] movss xmm3, [edi+eax+0] movss xmm1, [esi+eax+4] movss xmm4, [edi+eax+4] movss xmm2, [esi+eax+8] movss xmm5, [edi+eax+8] mulss xmm0, xmm3 mulss xmm1, xmm4 mulss xmm2, xmm5 add ecx, 4 addss xmm0, xmm1 add eax, 12 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loop1 done1: } } /* ============ idSIMD_SSE::Dot dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ... ============ */ void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) { switch( count ) { case 0: dot = 0.0f; return; case 1: dot = src1[0] * src2[0]; return; case 2: dot = src1[0] * src2[0] + src1[1] * src2[1]; return; case 3: dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2]; return; default: __asm { mov ecx, src1 mov edx, src2 mov eax, ecx or eax, edx and eax, 15 jz alignedDot // unaligned mov eax, count shr eax, 2 shl eax, 4 add ecx, eax add edx, eax neg eax movups xmm0, [ecx+eax] movups xmm1, [edx+eax] mulps xmm0, xmm1 add eax, 16 jz doneDot loopUnalignedDot: movups xmm1, [ecx+eax] movups xmm2, [edx+eax] mulps xmm1, xmm2 addps xmm0, xmm1 add eax, 16 jl loopUnalignedDot jmp doneDot // aligned alignedDot: mov eax, count shr eax, 2 shl eax, 4 add ecx, eax add edx, eax neg eax movaps xmm0, [ecx+eax] movaps xmm1, [edx+eax] mulps xmm0, xmm1 add eax, 16 jz doneDot loopAlignedDot: movaps xmm1, [ecx+eax] movaps xmm2, [edx+eax] mulps xmm1, xmm2 addps xmm0, xmm1 add eax, 16 jl loopAlignedDot doneDot: } switch( count & 3 ) { case 1: __asm { movss xmm1, [ecx] movss xmm2, [edx] mulss xmm1, xmm2 addss xmm0, xmm1 } break; case 2: __asm { xorps xmm2, xmm2 movlps xmm1, [ecx] movlps xmm2, [edx] mulps xmm1, xmm2 addps xmm0, xmm1 } break; case 3: __asm { movss xmm1, [ecx] movhps xmm1, [ecx+4] movss xmm2, [edx] movhps xmm2, [edx+4] mulps xmm1, xmm2 addps xmm0, xmm1 } break; } __asm { movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm1 mov eax, dot movss [eax], xmm0 } return; } } // // cmpeqps == Equal // cmpneqps != Not Equal // cmpltps < Less Than // cmpnltps >= Not Less Than // cmpnleps > Not Less Or Equal // #define FLIP not al #define NOFLIP #define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \ int i, cnt, pre, post; \ float *aligned; \ \ /* if the float array is not aligned on a 4 byte boundary */ \ if ( ((int) SRC0) & 3 ) { \ /* unaligned memory access */ \ pre = 0; \ cnt = COUNT >> 2; \ post = COUNT - (cnt<<2); \ __asm mov edx, cnt \ __asm test edx, edx \ __asm je doneCmp \ __asm push ebx \ __asm neg edx \ __asm mov esi, SRC0 \ __asm prefetchnta [esi+64] \ __asm movss xmm1, 
CONSTANT \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mov edi, DST \ __asm mov ecx, 0x01010101 \ __asm loopNA: \ __asm movups xmm0, [esi] \ __asm prefetchnta [esi+128] \ __asm CMPSIMD xmm0, xmm1 \ __asm movmskps eax, xmm0 \ __asm DOFLIP \ __asm mov ah, al \ __asm shr ah, 1 \ __asm mov bx, ax \ __asm shl ebx, 14 \ __asm mov bx, ax \ __asm and ebx, ecx \ __asm mov dword ptr [edi], ebx \ __asm add esi, 16 \ __asm add edi, 4 \ __asm inc edx \ __asm jl loopNA \ __asm pop ebx \ } \ else { \ /* aligned memory access */ \ aligned = (float *) ((((int) SRC0) + 15) & ~15); \ if ( (int)aligned > ((int)src0) + COUNT ) { \ pre = COUNT; \ post = 0; \ } \ else { \ pre = aligned - SRC0; \ cnt = (COUNT - pre) >> 2; \ post = COUNT - pre - (cnt<<2); \ __asm mov edx, cnt \ __asm test edx, edx \ __asm je doneCmp \ __asm push ebx \ __asm neg edx \ __asm mov esi, aligned \ __asm prefetchnta [esi+64] \ __asm movss xmm1, CONSTANT \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mov edi, DST \ __asm add edi, pre \ __asm mov ecx, 0x01010101 \ __asm loopA: \ __asm movaps xmm0, [esi] \ __asm prefetchnta [esi+128] \ __asm CMPSIMD xmm0, xmm1 \ __asm movmskps eax, xmm0 \ __asm DOFLIP \ __asm mov ah, al \ __asm shr ah, 1 \ __asm mov bx, ax \ __asm shl ebx, 14 \ __asm mov bx, ax \ __asm and ebx, ecx \ __asm mov dword ptr [edi], ebx \ __asm add esi, 16 \ __asm add edi, 4 \ __asm inc edx \ __asm jl loopA \ __asm pop ebx \ } \ } \ doneCmp: \ double c = constant; \ for ( i = 0; i < pre; i++ ) { \ dst[i] = src0[i] CMP c; \ } \ for ( i = count - post; i < count; i++ ) { \ dst[i] = src0[i] CMP c; \ } #define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \ int i, cnt, pre, post; \ float *aligned; \ \ /* if the float array is not aligned on a 4 byte boundary */ \ if ( ((int) SRC0) & 3 ) { \ /* unaligned memory access */ \ pre = 0; \ cnt = COUNT >> 2; \ post = COUNT - (cnt<<2); \ __asm mov edx, cnt \ __asm test edx, edx \ __asm je doneCmp \ __asm push ebx \ __asm neg edx \ __asm mov esi, SRC0 \ __asm prefetchnta [esi+64] \ __asm movss xmm1, CONSTANT \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mov edi, DST \ __asm mov cl, bitNum \ __asm loopNA: \ __asm movups xmm0, [esi] \ __asm prefetchnta [esi+128] \ __asm CMPSIMD xmm0, xmm1 \ __asm movmskps eax, xmm0 \ __asm DOFLIP \ __asm mov ah, al \ __asm shr ah, 1 \ __asm mov bx, ax \ __asm shl ebx, 14 \ __asm mov bx, ax \ __asm and ebx, 0x01010101 \ __asm shl ebx, cl \ __asm or ebx, dword ptr [edi] \ __asm mov dword ptr [edi], ebx \ __asm add esi, 16 \ __asm add edi, 4 \ __asm inc edx \ __asm jl loopNA \ __asm pop ebx \ } \ else { \ /* aligned memory access */ \ aligned = (float *) ((((int) SRC0) + 15) & ~15); \ if ( (int)aligned > ((int)src0) + COUNT ) { \ pre = COUNT; \ post = 0; \ } \ else { \ pre = aligned - SRC0; \ cnt = (COUNT - pre) >> 2; \ post = COUNT - pre - (cnt<<2); \ __asm mov edx, cnt \ __asm test edx, edx \ __asm je doneCmp \ __asm push ebx \ __asm neg edx \ __asm mov esi, aligned \ __asm prefetchnta [esi+64] \ __asm movss xmm1, CONSTANT \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mov edi, DST \ __asm add edi, pre \ __asm mov cl, bitNum \ __asm loopA: \ __asm movaps xmm0, [esi] \ __asm prefetchnta [esi+128] \ __asm CMPSIMD xmm0, xmm1 \ __asm movmskps eax, xmm0 \ __asm DOFLIP \ __asm mov ah, al \ __asm shr ah, 1 \ __asm mov bx, ax \ __asm shl ebx, 14 \ __asm mov bx, ax \ __asm and ebx, 0x01010101 \ __asm shl ebx, cl \ __asm or ebx, dword ptr [edi] \ __asm mov dword ptr [edi], ebx \ 
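	/* movmskps above packed the four compare-result sign bits into al;  */ \
	/* the mov/shr/shl/and shuffle then spreads those 4 bits out to one  */ \
	/* byte each (a 0 or 1 per float), shl by cl = bitNum moves them to  */ \
	/* the requested bit, and the or accumulates them into the dst bytes. */ \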
__asm add esi, 16 \ __asm add edi, 4 \ __asm inc edx \ __asm jl loopA \ __asm pop ebx \ } \ } \ doneCmp: \ float c = constant; \ for ( i = 0; i < pre; i++ ) { \ dst[i] |= ( src0[i] CMP c ) << BITNUM; \ } \ for ( i = count - post; i < count; i++ ) { \ dst[i] |= ( src0[i] CMP c ) << BITNUM; \ } /* ============ idSIMD_SSE::CmpGT dst[i] = src0[i] > constant; ============ */ void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) { COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP ) } /* ============ idSIMD_SSE::CmpGT dst[i] |= ( src0[i] > constant ) << bitNum; ============ */ void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP ) } /* ============ idSIMD_SSE::CmpGE dst[i] = src0[i] >= constant; ============ */ void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) { COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP ) } /* ============ idSIMD_SSE::CmpGE dst[i] |= ( src0[i] >= constant ) << bitNum; ============ */ void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP ) } /* ============ idSIMD_SSE::CmpLT dst[i] = src0[i] < constant; ============ */ void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) { COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP ) } /* ============ idSIMD_SSE::CmpLT dst[i] |= ( src0[i] < constant ) << bitNum; ============ */ void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP ) } /* ============ idSIMD_SSE::CmpLE dst[i] = src0[i] <= constant; ============ */ void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) { COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP ) } /* ============ idSIMD_SSE::CmpLE dst[i] |= ( src0[i] <= constant ) << bitNum; ============ */ void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP ) } /* ============ idSIMD_SSE::MinMax ============ */ void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) { int i, pre, post; min = idMath::INFINITY; max = -idMath::INFINITY; __asm { push ebx mov eax, min mov ebx, max movss xmm0, [eax] movss xmm1, [ebx] shufps xmm0, xmm0, 0 shufps xmm1, xmm1, 0 KFLOATINITS( src, count, pre, post ) and eax, 15 jz lpA jmp lpNA align 16 lpNA: movups xmm2, [edx+ebx] movups xmm3, [edx+ebx+16] minps xmm0, xmm2 maxps xmm1, xmm2 prefetchnta [edx+ebx+64] minps xmm0, xmm3 maxps xmm1, xmm3 add ebx, 16*2 jl lpNA jmp done2 lpA: movaps xmm2, [edx+ebx] movaps xmm3, [edx+ebx+16] minps xmm0, xmm2 maxps xmm1, xmm2 prefetchnta [edx+ebx+64] minps xmm0, xmm3 maxps xmm1, xmm3 add ebx, 16*2 jl lpA jmp done2 align 16 done2: movaps xmm2, xmm0 movaps xmm3, xmm1 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 ) minss xmm0, xmm2 maxss xmm1, xmm3 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 ) minss xmm0, xmm2 maxss xmm1, xmm3 
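		// Horizontal reduction: xmm0 and xmm1 each hold four per-lane partial
		// min/max values.  R_SHUFFLEPS( 1, 2, 3, 0 ) rotates the xmm2/xmm3
		// copies one lane per round, so the third rotate+minss/maxss round
		// below finishes folding the lanes together and the minimum (maximum)
		// over all four lanes ends up in the low float.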
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 ) minss xmm0, xmm2 maxss xmm1, xmm3 mov eax, min mov ebx, max movss [eax], xmm0 movss [ebx], xmm1 done: pop ebx } for ( i = 0; i < pre; i++ ) { float tmp = src[i]; if ( tmp > max ) { max = tmp; } if ( tmp < min ) { min = tmp; } } for ( i = count - post; i < count; i++ ) { float tmp = src[i]; if ( tmp > max ) { max = tmp; } if ( tmp < min ) { min = tmp; } } } /* ============ idSIMD_SSE::MinMax ============ */ void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) { __asm { mov eax, count test eax, eax movss xmm0, idMath::INFINITY xorps xmm1, xmm1 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) subps xmm1, xmm0 jz done mov ecx, eax and ecx, 1 mov esi, src jz startLoop movlps xmm2, [esi] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) dec eax add esi, 2*4 minps xmm0, xmm2 maxps xmm1, xmm2 startLoop: imul eax, 2*4 add esi, eax neg eax loopVert: movlps xmm2, [esi+eax] movhps xmm2, [esi+eax+8] add eax, 4*4 minps xmm0, xmm2 maxps xmm1, xmm2 jl loopVert done: movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 ) minps xmm0, xmm2 mov esi, min movlps [esi], xmm0 movaps xmm3, xmm1 shufps xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 ) maxps xmm1, xmm3 mov edi, max movlps [edi], xmm1 } } /* ============ idSIMD_SSE::MinMax ============ */ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) { __asm { movss xmm0, idMath::INFINITY xorps xmm1, xmm1 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) subps xmm1, xmm0 movaps xmm2, xmm0 movaps xmm3, xmm1 mov esi, src mov eax, count and eax, ~3 jz done4 imul eax, 12 add esi, eax neg eax loop4: // prefetchnta [esi+4*12] movss xmm4, [esi+eax+0*12+8] movhps xmm4, [esi+eax+0*12+0] minps xmm0, xmm4 maxps xmm1, xmm4 movss xmm5, [esi+eax+1*12+0] movhps xmm5, [esi+eax+1*12+4] minps xmm2, xmm5 maxps xmm3, xmm5 movss xmm6, [esi+eax+2*12+8] movhps xmm6, [esi+eax+2*12+0] minps xmm0, xmm6 maxps xmm1, xmm6 movss xmm7, [esi+eax+3*12+0] movhps xmm7, [esi+eax+3*12+4] minps xmm2, xmm7 maxps xmm3, xmm7 add eax, 4*12 jl loop4 done4: mov eax, count and eax, 3 jz done1 imul eax, 12 add esi, eax neg eax loop1: movss xmm4, [esi+eax+0*12+8] movhps xmm4, [esi+eax+0*12+0] minps xmm0, xmm4 maxps xmm1, xmm4 add eax, 12 jl loop1 done1: shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) minps xmm0, xmm2 maxps xmm1, xmm3 mov esi, min movhps [esi], xmm0 movss [esi+8], xmm0 mov edi, max movhps [edi], xmm1 movss [edi+8], xmm1 } } /* ============ idSIMD_SSE::MinMax ============ */ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); __asm { movss xmm0, idMath::INFINITY xorps xmm1, xmm1 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) subps xmm1, xmm0 movaps xmm2, xmm0 movaps xmm3, xmm1 mov esi, src mov eax, count and eax, ~3 jz done4 imul eax, DRAWVERT_SIZE add esi, eax neg eax loop4: // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm4 maxps xmm1, xmm4 movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] minps xmm2, xmm5 maxps xmm3, xmm5 movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] movhps xmm6, 
[esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm6 maxps xmm1, xmm6 movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] minps xmm2, xmm7 maxps xmm3, xmm7 add eax, 4*DRAWVERT_SIZE jl loop4 done4: mov eax, count and eax, 3 jz done1 imul eax, DRAWVERT_SIZE add esi, eax neg eax loop1: movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm4 maxps xmm1, xmm4 add eax, DRAWVERT_SIZE jl loop1 done1: shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) minps xmm0, xmm2 maxps xmm1, xmm3 mov esi, min movhps [esi], xmm0 movss [esi+8], xmm0 mov edi, max movhps [edi], xmm1 movss [edi+8], xmm1 } } /* ============ idSIMD_SSE::MinMax ============ */ void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); __asm { movss xmm0, idMath::INFINITY xorps xmm1, xmm1 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) subps xmm1, xmm0 movaps xmm2, xmm0 movaps xmm3, xmm1 mov edi, indexes mov esi, src mov eax, count and eax, ~3 jz done4 shl eax, 2 add edi, eax neg eax loop4: // prefetchnta [edi+128] // prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] mov edx, [edi+eax+0] imul edx, DRAWVERT_SIZE movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm4 maxps xmm1, xmm4 mov edx, [edi+eax+4] imul edx, DRAWVERT_SIZE movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0] movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4] minps xmm2, xmm5 maxps xmm3, xmm5 mov edx, [edi+eax+8] imul edx, DRAWVERT_SIZE movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8] movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm6 maxps xmm1, xmm6 mov edx, [edi+eax+12] imul edx, DRAWVERT_SIZE movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0] movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4] minps xmm2, xmm7 maxps xmm3, xmm7 add eax, 4*4 jl loop4 done4: mov eax, count and eax, 3 jz done1 shl eax, 2 add edi, eax neg eax loop1: mov edx, [edi+eax+0] imul edx, DRAWVERT_SIZE; movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8] movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0] minps xmm0, xmm4 maxps xmm1, xmm4 add eax, 4 jl loop1 done1: shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ) minps xmm0, xmm2 maxps xmm1, xmm3 mov esi, min movhps [esi], xmm0 movss [esi+8], xmm0 mov edi, max movhps [edi], xmm1 movss [edi+8], xmm1 } } /* ============ idSIMD_SSE::Clamp ============ */ void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) { int i, pre, post; __asm { movss xmm0,min movss xmm1,max shufps xmm0,xmm0,0 shufps xmm1,xmm1,0 KFLOATINITDS( dst, src, count, pre, post ) and eax,15 jne lpNA jmp lpA align 16 lpA: movaps xmm2,[edx+ebx] movaps xmm3,[edx+ebx+16] maxps xmm2,xmm0 maxps xmm3,xmm0 prefetchnta [edx+ebx+64] minps xmm2,xmm1 minps xmm3,xmm1 movaps [edi+ebx],xmm2 movaps [edi+ebx+16],xmm3 add ebx,16*2 jl lpA jmp done align 16 lpNA: movups xmm2,[edx+ebx] movups xmm3,[edx+ebx+16] maxps xmm2,xmm0 maxps xmm3,xmm0 prefetchnta [edx+ebx+64] minps xmm2,xmm1 minps xmm3,xmm1 movaps [edi+ebx],xmm2 movaps [edi+ebx+16],xmm3 add ebx,16*2 jl lpNA done: } for ( i = 0; i < pre; i++ ) { if ( src[i] < min ) dst[i] = min; else if ( src[i] > max ) dst[i] = max; else dst[i] = src[i]; } for( i = count - post; i < 
count; i++ ) { if ( src[i] < min ) dst[i] = min; else if ( src[i] > max ) dst[i] = max; else dst[i] = src[i]; } } /* ============ idSIMD_SSE::ClampMin ============ */ void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) { int i, pre, post; __asm { movss xmm0,min shufps xmm0,xmm0,0 KFLOATINITDS( dst, src, count, pre, post ) and eax,15 jne lpNA jmp lpA align 16 lpA: movaps xmm2,[edx+ebx] movaps xmm3,[edx+ebx+16] maxps xmm2,xmm0 prefetchnta [edx+ebx+64] maxps xmm3,xmm0 movaps [edi+ebx],xmm2 movaps [edi+ebx+16],xmm3 add ebx,16*2 jl lpA jmp done align 16 lpNA: movups xmm2,[edx+ebx] movups xmm3,[edx+ebx+16] maxps xmm2,xmm0 prefetchnta [edx+ebx+64] maxps xmm3,xmm0 movaps [edi+ebx],xmm2 movaps [edi+ebx+16],xmm3 add ebx,16*2 jl lpNA done: } for( i = 0; i < pre; i++ ) { if ( src[i] < min ) dst[i] = min; else dst[i] = src[i]; } for( i = count - post; i < count; i++ ) { if ( src[i] < min ) dst[i] = min; else dst[i] = src[i]; } } /* ============ idSIMD_SSE::ClampMax ============ */ void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) { int i, pre, post; __asm { movss xmm1,max shufps xmm1,xmm1,0 KFLOATINITDS( dst, src, count, pre, post ) and eax,15 jne lpNA jmp lpA align 16 lpA: movaps xmm2,[edx+ebx] movaps xmm3,[edx+ebx+16] minps xmm2,xmm1 prefetchnta [edx+ebx+64] minps xmm3,xmm1 movaps [edi+ebx],xmm2 movaps [edi+ebx+16],xmm3 add ebx,16*2 jl lpA jmp done align 16 lpNA: movups xmm2,[edx+ebx] movups xmm3,[edx+ebx+16] minps xmm2,xmm1 prefetchnta [edx+ebx+64] minps xmm3,xmm1 movaps [edi+ebx],xmm2 movaps [edi+ebx+16],xmm3 add ebx,16*2 jl lpNA done: } for( i = 0; i < pre; i++ ) { if ( src[i] > max ) dst[i] = max; else dst[i] = src[i]; } for( i = count - post; i < count; i++ ) { if ( src[i] > max ) dst[i] = max; else dst[i] = src[i]; } } /* ============ idSIMD_SSE::Zero16 ============ */ void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) { __asm { mov edx, dst mov eax, count add eax, 3 shr eax, 2 jz doneZero16 shl eax, 4 add edx, eax neg eax xorps xmm0, xmm0 loopZero16: movaps [edx+eax], xmm0 add eax, 16 jl loopZero16 doneZero16: } } /* ============ idSIMD_SSE::Negate16 ============ */ void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) { __asm { mov edx, dst mov eax, count add eax, 3 shr eax, 2 jz doneNegate16 shl eax, 4 add edx, eax neg eax movss xmm0, SIMD_SP_signBitMask shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) loopNegate16: movaps xmm1, [edx+eax] xorps xmm1, xmm0 movaps [edx+eax], xmm1 add eax, 16 jl loopNegate16 doneNegate16: } } /* ============ idSIMD_SSE::Copy16 ============ */ void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) { __asm { mov ecx, src mov edx, dst mov eax, count add eax, 3 shr eax, 2 jz doneCopy16 shl eax, 4 add ecx, eax add edx, eax neg eax loopCopy16: movaps xmm0, [ecx+eax] movaps [edx+eax], xmm0 add eax, 16 jl loopCopy16 doneCopy16: } } /* ============ idSIMD_SSE::Add16 ============ */ void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) { __asm { mov ecx, src1 mov edx, src2 mov esi, dst mov eax, count add eax, 3 shr eax, 2 jz doneAdd16 shl eax, 4 add esi, eax add ecx, eax add edx, eax neg eax loopAdd16: movaps xmm0, [ecx+eax] addps xmm0, [edx+eax] movaps [esi+eax], xmm0 add eax, 16 jl loopAdd16 doneAdd16: } } /* ============ idSIMD_SSE::Sub16 ============ */ void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) { __asm { mov ecx, src1 mov edx, src2 
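		// Note: like the other *16 routines, this assumes operands that are
		// 16-byte aligned and padded to a multiple of four floats: the
		// "add eax, 3 / shr eax, 2" below rounds count up to whole 4-float
		// blocks, which is why plain movaps loads/stores are safe here.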
mov esi, dst mov eax, count add eax, 3 shr eax, 2 jz doneSub16 shl eax, 4 add esi, eax add ecx, eax add edx, eax neg eax loopSub16: movaps xmm0, [ecx+eax] subps xmm0, [edx+eax] movaps [esi+eax], xmm0 add eax, 16 jl loopSub16 doneSub16: } } /* ============ idSIMD_SSE::Mul16 ============ */ void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) { __asm { mov ecx, dst mov edx, src1 mov eax, count add eax, 3 shr eax, 2 jz doneMulScalar16 movss xmm1, constant shl eax, 4 add ecx, eax add edx, eax neg eax shufps xmm1, xmm1, 0x00 loopMulScalar16: movaps xmm0, [edx+eax] mulps xmm0, xmm1 movaps [ecx+eax], xmm0 add eax, 16 jl loopMulScalar16 doneMulScalar16: } } /* ============ idSIMD_SSE::AddAssign16 ============ */ void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) { __asm { mov ecx, dst mov edx, src mov eax, count add eax, 3 shr eax, 2 jz doneAddAssign16 shl eax, 4 add ecx, eax add edx, eax neg eax loopAddAssign16: movaps xmm0, [ecx+eax] addps xmm0, [edx+eax] movaps [ecx+eax], xmm0 add eax, 16 jl loopAddAssign16 doneAddAssign16: } } /* ============ idSIMD_SSE::SubAssign16 ============ */ void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) { __asm { mov ecx, dst mov edx, src mov eax, count add eax, 3 shr eax, 2 jz doneSubAssign16 shl eax, 4 add ecx, eax add edx, eax neg eax loopSubAssign16: movaps xmm0, [ecx+eax] subps xmm0, [edx+eax] movaps [ecx+eax], xmm0 add eax, 16 jl loopSubAssign16 doneSubAssign16: } } /* ============ idSIMD_SSE::MulAssign16 ============ */ void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) { __asm { mov ecx, dst mov eax, count add eax, 3 shr eax, 2 jz doneMulAssign16 movss xmm1, constant shl eax, 4 add ecx, eax neg eax shufps xmm1, xmm1, 0x00 loopMulAssign16: movaps xmm0, [ecx+eax] mulps xmm0, xmm1 movaps [ecx+eax], xmm0 add eax, 16 jl loopMulAssign16 doneMulAssign16: } } /* ============ idSIMD_SSE::MatX_MultiplyVecX optimizes the following matrix multiplications: NxN * Nx1 Nx6 * 6x1 6xN * Nx1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss [eax+offset], reg1 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps [eax+offset], reg1 #define STORE4( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 \ __asm movhps [eax+offset+8], reg1 #define STOREC = int numRows; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumColumns() ); assert( dst.GetSize() >= mat.GetNumRows() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); switch( mat.GetNumColumns() ) { case 1: { switch( numRows ) { case 1: { // 1x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] STORE1( 0, xmm0, xmm1 ) } return; } case 6: { // 6x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0]; mPtr++; } return; } } break; } case 2: { switch( numRows ) { case 2: { // 2x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm1, [esi+4] 
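			// v[0] and v[1] were just loaded into xmm0/xmm1; the scalar
			// mulss/addss sequence below computes (mPtr is row-major):
			//   dst[0] = m[0] * v[0] + m[1] * v[1]
			//   dst[1] = m[2] * v[0] + m[3] * v[1]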
movss xmm2, [edi] mulss xmm2, xmm0 movss xmm3, [edi+4] mulss xmm3, xmm1 addss xmm2, xmm3 STORE1( 0, xmm2, xmm4 ) mulss xmm0, [edi+8] mulss xmm1, [edi+8+4] addss xmm0, xmm1 STORE1( 4, xmm0, xmm4 ) } return; } case 6: { // 6x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, [esi] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movaps xmm0, [edi] mulps xmm0, xmm7 movaps xmm1, [edi+16] mulps xmm1, xmm7 movaps xmm2, xmm0 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) movaps xmm3, [edi+32] addps xmm0, xmm2 mulps xmm3, xmm7 STORE4( 0, xmm0, xmm4 ) shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm1, xmm3 addps xmm3, xmm1 STORE2LO( 16, xmm3, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; mPtr += 2; } return; } } break; } case 3: { switch( numRows ) { case 3: { // 3x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm4, [edi] mulss xmm4, xmm0 movss xmm1, [esi+4] movss xmm5, [edi+4] mulss xmm5, xmm1 addss xmm4, xmm5 movss xmm2, [esi+8] movss xmm6, [edi+8] mulss xmm6, xmm2 addss xmm4, xmm6 movss xmm3, [edi+12] mulss xmm3, xmm0 STORE1( 0, xmm4, xmm7 ); movss xmm5, [edi+12+4] mulss xmm5, xmm1 addss xmm3, xmm5 movss xmm6, [edi+12+8] mulss xmm6, xmm2 addss xmm3, xmm6 mulss xmm0, [edi+24] mulss xmm1, [edi+24+4] STORE1( 4, xmm3, xmm7 ); addss xmm0, xmm1 mulss xmm2, [edi+24+8] addss xmm0, xmm2 STORE1( 8, xmm0, xmm7 ); } return; } case 6: { // 6x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm5, [esi] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [esi+4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 movlps xmm1, [edi+4*4] shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 movlps xmm2, [edi+6*4] movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 mulps xmm0, xmm5 movlps xmm3, [edi+10*4] shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 movaps xmm3, xmm1 shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 mulps xmm1, xmm6 shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 mulps xmm3, xmm7 addps xmm0, xmm1 addps xmm0, xmm3 STORE4( 0, xmm0, xmm4 ) movss xmm1, [edi+12*4] mulss xmm1, xmm5 movss xmm2, [edi+13*4] mulss xmm2, xmm6 movss xmm3, [edi+14*4] mulss xmm3, xmm7 addss xmm1, xmm2 addss xmm1, xmm3 STORE1( 16, xmm1, xmm4 ) mulss xmm5, [edi+15*4] mulss xmm6, [edi+16*4] mulss xmm7, [edi+17*4] addss xmm5, xmm6 addss xmm5, xmm7 STORE1( 20, xmm5, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; mPtr += 3; } return; } } break; } case 4: { switch( numRows ) { case 4: { // 4x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi ] movlps xmm0, qword ptr [edi ] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) 
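			// xmm0 now carries the pairwise partial sums for rows 0 and 1,
			// xmm1 those for rows 2 and 3.  The even-lane shuffle above and
			// the odd-lane shuffle below redistribute the partials so that
			// the single addps that follows produces all four row dot
			// products at once.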
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) addps xmm0, xmm4 STORE4( 0, xmm0, xmm2 ) } return; } case 6: { // 6x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi+ 0] movlps xmm0, qword ptr [edi+ 0] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) addps xmm0, xmm4 movlps xmm1, qword ptr [edi+64] movhps xmm1, qword ptr [edi+80] STORE4( 0, xmm0, xmm4 ) mulps xmm1, xmm6 movlps xmm2, qword ptr [edi+72] movhps xmm2, qword ptr [edi+88] mulps xmm2, xmm7 addps xmm1, xmm2 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm3, xmm1 addps xmm1, xmm3 STORE2LO( 16, xmm1, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; mPtr += 4; } return; } } break; } case 5: { switch( numRows ) { case 5: { // 5x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1 movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11 movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3 movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13 movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3 movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13 movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15 shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 movss xmm7, [esi+0*4] shufps xmm7, xmm7, 0 mulps xmm0, xmm7 movss xmm5, [esi+1*4] shufps xmm5, xmm5, 0 mulps xmm1, xmm5 addps xmm0, xmm1 movss xmm6, [esi+2*4] shufps xmm6, xmm6, 0 mulps xmm2, xmm6 addps xmm0, xmm2 movss xmm1, [esi+3*4] shufps xmm1, xmm1, 0 mulps xmm3, xmm1 addps xmm0, xmm3 movss xmm2, [esi+4*4] shufps xmm2, xmm2, 0 mulps xmm4, xmm2 addps xmm0, xmm4 mulss xmm7, [edi+20*4] mulss xmm5, [edi+21*4] addps xmm7, xmm5 mulss xmm6, [edi+22*4] addps xmm7, xmm6 mulss xmm1, [edi+23*4] addps xmm7, xmm1 mulss xmm2, [edi+24*4] addps xmm7, xmm2 STORE4( 0, xmm0, xmm3 ) STORE1( 16, xmm7, xmm4 ) } return; } case 6: { // 6x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [esi] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) movlps xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movlps xmm0, [edi] movhps xmm3, [edi+8] movaps xmm1, [edi+16] movlps xmm2, [edi+32] shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 
2, 3, 7, 8 mulps xmm0, xmm6 mulps xmm3, xmm7 movlps xmm2, [edi+40] addps xmm0, xmm3 // xmm0 + xmm1 movhps xmm5, [edi+40+8] movlps xmm3, [edi+40+16] movhps xmm3, [edi+40+24] movlps xmm4, [edi+40+32] shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 mulps xmm2, xmm6 mulps xmm5, xmm7 addps xmm2, xmm5 // xmm2 + xmm3 movss xmm5, [esi+16] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm4, xmm0 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) addps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 STORE4( 0, xmm0, xmm2 ) movlps xmm4, [edi+80] movhps xmm3, [edi+80+8] movaps xmm1, [edi+80+16] movlps xmm2, [edi+80+32] shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 mulps xmm4, xmm6 mulps xmm3, xmm7 mulps xmm1, xmm5 addps xmm4, xmm3 // xmm4 + xmm1 shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) addps xmm4, xmm1 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) addps xmm4, xmm1 STORE2LO( 16, xmm4, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; mPtr += 5; } return; } } break; } case 6: { switch( numRows ) { case 1: { // 1x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] movss xmm1, [esi+4] mulss xmm1, [edi+4] movss xmm2, [esi+8] addss xmm0, xmm1 mulss xmm2, [edi+8] movss xmm3, [esi+12] addss xmm0, xmm2 mulss xmm3, [edi+12] movss xmm4, [esi+16] addss xmm0, xmm3 mulss xmm4, [edi+16] movss xmm5, [esi+20] addss xmm0, xmm4 mulss xmm5, [edi+20] movss xmm6, [esi+24] addss xmm0, xmm5 mulss xmm6, [edi+24] addss xmm0, xmm6 STORE1( 0, xmm0, xmm7 ) } return; } case 2: { // 2x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) } return; } case 3: { // 3x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) // row 2 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm1 STORE1( 8, xmm0, xmm3 ) } return; } case 4: { // 4x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, 
dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm4 ) } return; } case 5: { // 5x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm3 ) // row 5 movaps xmm0, [edi+96] movaps xmm1, [edi+96+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, 0x01 addss xmm0, xmm1 STORE1( 16, xmm0, xmm3 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, qword ptr [esi] movlps xmm6, qword ptr [esi+8] shufps xmm7, xmm7, 0x44 shufps xmm6, xmm6, 0x44 movlps xmm0, qword ptr [edi ] movhps xmm0, qword ptr [edi+ 24] mulps xmm0, xmm7 movlps xmm3, qword ptr [edi+ 8] movhps xmm3, qword ptr [edi+ 32] mulps xmm3, xmm6 movlps xmm1, qword ptr [edi+ 48] movhps xmm1, qword ptr [edi+ 72] mulps xmm1, xmm7 movlps xmm2, qword ptr [edi+ 96] movhps xmm2, qword ptr [edi+120] mulps xmm2, xmm7 movlps xmm4, qword ptr [edi+ 56] movhps xmm4, qword ptr [edi+ 80] movlps xmm5, qword ptr [edi+104] movhps xmm5, qword ptr [edi+128] mulps xmm4, xmm6 movlps xmm7, qword ptr [esi+16] addps xmm0, xmm3 shufps xmm7, xmm7, 0x44 mulps xmm5, xmm6 addps xmm1, xmm4 movlps xmm3, qword ptr [edi+ 16] movhps xmm3, qword ptr [edi+ 40] addps xmm2, xmm5 movlps xmm4, qword ptr [edi+ 64] movhps xmm4, qword ptr [edi+ 88] mulps xmm3, xmm7 movlps xmm5, qword ptr [edi+112] movhps xmm5, qword ptr [edi+136] addps xmm0, xmm3 mulps xmm4, xmm7 mulps xmm5, xmm7 addps xmm1, xmm4 addps xmm2, xmm5 movaps xmm6, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm6, xmm1, 0xDD movaps xmm7, xmm2 shufps xmm7, xmm2, 0x88 shufps xmm2, xmm2, 0xDD addps xmm0, xmm6 addps xmm2, xmm7 STORE4( 0, xmm0, xmm3 ) STORE2LO( 16, xmm2, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; mPtr += 6; } return; } } break; } 
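		// Generic fallback for column counts outside [1,6]: a plain scalar
		// dot product per row,
		//   dstPtr[i] = mPtr[0] * vPtr[0] + ... + mPtr[numColumns-1] * vPtr[numColumns-1]
		// with mPtr advancing by numColumns per row.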
default: { int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numColumns; j++ ) { sum += mPtr[j] * vPtr[j]; } dstPtr[i] STOREC sum; mPtr += numColumns; } break; } } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_MultiplyAddVecX optimizes the following matrix multiplications: NxN * Nx1 Nx6 * 6x1 6xN * Nx1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ __asm addss reg2, reg1 \ __asm movss [eax+offset], reg2 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movhps [eax+offset], reg2 #define STORE4( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm movhps reg2, [eax+offset+8] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 \ __asm movhps [eax+offset+8], reg2 #define STOREC += int numRows; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumColumns() ); assert( dst.GetSize() >= mat.GetNumRows() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); switch( mat.GetNumColumns() ) { case 1: { switch( numRows ) { case 1: { // 1x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] STORE1( 0, xmm0, xmm1 ) } return; } case 6: { // 6x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0]; mPtr++; } return; } } break; } case 2: { switch( numRows ) { case 2: { // 2x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm1, [esi+4] movss xmm2, [edi] mulss xmm2, xmm0 movss xmm3, [edi+4] mulss xmm3, xmm1 addss xmm2, xmm3 STORE1( 0, xmm2, xmm4 ) mulss xmm0, [edi+8] mulss xmm1, [edi+8+4] addss xmm0, xmm1 STORE1( 4, xmm0, xmm4 ) } return; } case 6: { // 6x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, [esi] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movaps xmm0, [edi] mulps xmm0, xmm7 movaps xmm1, [edi+16] mulps xmm1, xmm7 movaps xmm2, xmm0 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) movaps xmm3, [edi+32] addps xmm0, xmm2 mulps xmm3, xmm7 STORE4( 0, xmm0, xmm4 ) shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm1, xmm3 addps xmm3, xmm1 STORE2LO( 16, xmm3, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; mPtr += 2; } return; } } break; } case 3: { switch( numRows ) { case 3: { // 3x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm4, [edi] mulss xmm4, xmm0 movss xmm1, [esi+4] movss xmm5, [edi+4] mulss xmm5, xmm1 addss xmm4, xmm5 movss xmm2, [esi+8] movss xmm6, [edi+8] mulss xmm6, xmm2 addss xmm4, xmm6 movss xmm3, [edi+12] mulss xmm3, xmm0 STORE1( 0, xmm4, xmm7 ); movss xmm5, [edi+12+4] mulss xmm5, xmm1 addss xmm3, xmm5 movss xmm6, [edi+12+8] mulss xmm6, xmm2 addss xmm3, xmm6 mulss xmm0, [edi+24] 
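			// The third row ([edi+24] onward) reuses v[0..2], still live in
			// xmm0..xmm2.  Unlike MatX_MultiplyVecX, the STORE* macros in
			// this function read the old dst value back, add, and rewrite it,
			// which is what turns the same kernels into dst += M * v.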
mulss xmm1, [edi+24+4] STORE1( 4, xmm3, xmm7 ); addss xmm0, xmm1 mulss xmm2, [edi+24+8] addss xmm0, xmm2 STORE1( 8, xmm0, xmm7 ); } return; } case 6: { // 6x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm5, [esi] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [esi+4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 movlps xmm1, [edi+4*4] shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 movlps xmm2, [edi+6*4] movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 mulps xmm0, xmm5 movlps xmm3, [edi+10*4] shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 movaps xmm3, xmm1 shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 mulps xmm1, xmm6 shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 mulps xmm3, xmm7 addps xmm0, xmm1 addps xmm0, xmm3 STORE4( 0, xmm0, xmm4 ) movss xmm1, [edi+12*4] mulss xmm1, xmm5 movss xmm2, [edi+13*4] mulss xmm2, xmm6 movss xmm3, [edi+14*4] mulss xmm3, xmm7 addss xmm1, xmm2 addss xmm1, xmm3 STORE1( 16, xmm1, xmm4 ) mulss xmm5, [edi+15*4] mulss xmm6, [edi+16*4] mulss xmm7, [edi+17*4] addss xmm5, xmm6 addss xmm5, xmm7 STORE1( 20, xmm5, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; mPtr += 3; } return; } } break; } case 4: { switch( numRows ) { case 4: { // 4x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi ] movlps xmm0, qword ptr [edi ] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) addps xmm0, xmm4 STORE4( 0, xmm0, xmm2 ) } return; } case 6: { // 6x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi+ 0] movlps xmm0, qword ptr [edi+ 0] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) addps xmm0, xmm4 movlps xmm1, qword ptr [edi+64] movhps xmm1, qword ptr [edi+80] STORE4( 0, xmm0, xmm4 ) mulps xmm1, xmm6 movlps xmm2, qword ptr [edi+72] movhps xmm2, qword ptr [edi+88] mulps xmm2, xmm7 addps xmm1, xmm2 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm3, xmm1 addps xmm1, xmm3 STORE2LO( 16, xmm1, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; mPtr += 4; } return; } } break; } case 5: { switch( numRows ) { case 5: { // 5x5 * 5x1 __asm { 
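				// 5x5 * 5x1: the loads and shufps below effectively transpose
				// the first four rows of the matrix into xmm0..xmm4 (one
				// column per register, e.g. xmm0 = elements 0, 5, 10, 15),
				// multiply each column by the broadcast v[j] and accumulate,
				// yielding dst[0..3] in one vector; the fifth row (elements
				// 20..24) is finished separately with scalar ops into xmm7.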
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					movss xmm0, [edi+5*4]		// xmm0 = 5, X, X, X
					movhps xmm0, [edi+0*4]		// xmm0 = 5, X, 0, 1
					movss xmm5, [edi+15*4]		// xmm5 = 15, X, X, X
					movhps xmm5, [edi+10*4]		// xmm5 = 15, X, 10, 11
					movaps xmm1, xmm0		// xmm1 = 5, X, 0, 1
					shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 = 0, 5, 10, 15
					movlps xmm1, [edi+6*4]		// xmm1 = 6, 7, 0, 1
					movlps xmm5, [edi+16*4]		// xmm5 = 16, 17, 10, 11
					movaps xmm2, xmm1		// xmm2 = 6, 7, 0, 1
					shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 = 1, 6, 11, 16
					movhps xmm2, [edi+2*4]		// xmm2 = 6, 7, 2, 3
					movhps xmm5, [edi+12*4]		// xmm5 = 16, 17, 12, 13
					movaps xmm3, xmm2		// xmm3 = 6, 7, 2, 3
					shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 = 2, 7, 12, 17
					movlps xmm3, [edi+8*4]		// xmm3 = 8, 9, 2, 3
					movlps xmm5, [edi+18*4]		// xmm5 = 18, 19, 12, 13
					movss xmm4, [edi+4*4]		// xmm4 = 4, X, X, X
					movlhps xmm4, xmm3		// xmm4 = 4, X, 8, 9
					shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 = 3, 8, 13, 18
					movhps xmm5, [edi+14*4]		// xmm5 = 18, 19, 14, 15
					shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 = 4, 9, 14, 19
					movss xmm7, [esi+0*4]
					shufps xmm7, xmm7, 0
					mulps xmm0, xmm7
					movss xmm5, [esi+1*4]
					shufps xmm5, xmm5, 0
					mulps xmm1, xmm5
					addps xmm0, xmm1
					movss xmm6, [esi+2*4]
					shufps xmm6, xmm6, 0
					mulps xmm2, xmm6
					addps xmm0, xmm2
					movss xmm1, [esi+3*4]
					shufps xmm1, xmm1, 0
					mulps xmm3, xmm1
					addps xmm0, xmm3
					movss xmm2, [esi+4*4]
					shufps xmm2, xmm2, 0
					mulps xmm4, xmm2
					addps xmm0, xmm4
					mulss xmm7, [edi+20*4]
					mulss xmm5, [edi+21*4]
					addps xmm7, xmm5
					mulss xmm6, [edi+22*4]
					addps xmm7, xmm6
					mulss xmm1, [edi+23*4]
					addps xmm7, xmm1
					mulss xmm2, [edi+24*4]
					addps xmm7, xmm2
					STORE4( 0, xmm0, xmm3 )
					STORE1( 16, xmm7, xmm4 )
				}
				return;
			}
			case 6: {		// 6x5 * 5x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					movlps xmm6, [esi]
					shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
					movlps xmm7, [esi+8]
					shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
					movlps xmm0, [edi]
					movhps xmm3, [edi+8]
					movaps xmm1, [edi+16]
					movlps xmm2, [edi+32]
					shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm0 = 0, 1, 5, 6
					shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 4, 7, 8, 9
					shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 2, 3, 7, 8
					mulps xmm0, xmm6
					mulps xmm3, xmm7
					movlps xmm2, [edi+40]
					addps xmm0, xmm3		// xmm0 + xmm1
					movhps xmm5, [edi+40+8]
					movlps xmm3, [edi+40+16]
					movhps xmm3, [edi+40+24]
					movlps xmm4, [edi+40+32]
					shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm2 = 10, 11, 15, 16
					shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm3 = 14, 17, 18, 19
					shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm5 = 12, 13, 17, 18
					mulps xmm2, xmm6
					mulps xmm5, xmm7
					addps xmm2, xmm5		// xmm2 + xmm3
					movss xmm5, [esi+16]
					shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
					movaps xmm4, xmm0
					shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
					shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
					shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
					addps xmm0, xmm4
					mulps xmm1, xmm5
					addps xmm0, xmm1
					STORE4( 0, xmm0, xmm2 )
					movlps xmm4, [edi+80]
					movhps xmm3, [edi+80+8]
					movaps xmm1, [edi+80+16]
					movlps xmm2, [edi+80+32]
					shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 )	// xmm4 = 20, 21, 25, 26
					shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 )	// xmm1 = 24, 27, 28, 29
					shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 )	// xmm3 = 22, 23, 27, 28
					mulps xmm4, xmm6
					mulps xmm3, xmm7
					mulps xmm1, xmm5
					addps xmm4, xmm3		// xmm4 + xmm1
					shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
					shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
					addps xmm4, xmm1
					shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
					addps xmm4, xmm1
					STORE2LO( 16, xmm4, xmm2 )
				}
				return;
			}
			default:
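			// generic fallback for 5-column matrices: one 5-term dot product
			// per row, advancing mPtr a full row (5 floats) per iteration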
{ for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; mPtr += 5; } return; } } break; } case 6: { switch( numRows ) { case 1: { // 1x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] movss xmm1, [esi+4] mulss xmm1, [edi+4] movss xmm2, [esi+8] addss xmm0, xmm1 mulss xmm2, [edi+8] movss xmm3, [esi+12] addss xmm0, xmm2 mulss xmm3, [edi+12] movss xmm4, [esi+16] addss xmm0, xmm3 mulss xmm4, [edi+16] movss xmm5, [esi+20] addss xmm0, xmm4 mulss xmm5, [edi+20] movss xmm6, [esi+24] addss xmm0, xmm5 mulss xmm6, [edi+24] addss xmm0, xmm6 STORE1( 0, xmm0, xmm7 ) } return; } case 2: { // 2x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) } return; } case 3: { // 3x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) // row 2 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm1 STORE1( 8, xmm0, xmm3 ) } return; } case 4: { // 4x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm4 ) } return; } case 5: { // 5x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, 
[edi+48+16]
					movaps xmm2, [edi+48+32]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					movhlps xmm3, xmm0
					movlhps xmm3, xmm2
					addps xmm1, xmm3
					shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
					addps xmm1, xmm0
					// last 4 additions for the first 4 rows and store result
					movaps xmm0, xmm7
					shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
					shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
					addps xmm0, xmm7
					STORE4( 0, xmm0, xmm3 )
					// row 4
					movaps xmm0, [edi+96]
					movaps xmm1, [edi+96+16]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					addps xmm0, xmm1
					movhlps xmm1, xmm0
					addps xmm0, xmm1
					movaps xmm1, xmm0
					shufps xmm1, xmm1, 0x01
					addss xmm0, xmm1
					STORE1( 16, xmm0, xmm3 )
				}
				return;
			}
			case 6: {		// 6x6 * 6x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					movlps xmm7, qword ptr [esi]
					movlps xmm6, qword ptr [esi+8]
					shufps xmm7, xmm7, 0x44
					shufps xmm6, xmm6, 0x44
					movlps xmm0, qword ptr [edi]
					movhps xmm0, qword ptr [edi+24]
					mulps xmm0, xmm7
					movlps xmm3, qword ptr [edi+8]
					movhps xmm3, qword ptr [edi+32]
					mulps xmm3, xmm6
					movlps xmm1, qword ptr [edi+48]
					movhps xmm1, qword ptr [edi+72]
					mulps xmm1, xmm7
					movlps xmm2, qword ptr [edi+96]
					movhps xmm2, qword ptr [edi+120]
					mulps xmm2, xmm7
					movlps xmm4, qword ptr [edi+56]
					movhps xmm4, qword ptr [edi+80]
					movlps xmm5, qword ptr [edi+104]
					movhps xmm5, qword ptr [edi+128]
					mulps xmm4, xmm6
					movlps xmm7, qword ptr [esi+16]
					addps xmm0, xmm3
					shufps xmm7, xmm7, 0x44
					mulps xmm5, xmm6
					addps xmm1, xmm4
					movlps xmm3, qword ptr [edi+16]
					movhps xmm3, qword ptr [edi+40]
					addps xmm2, xmm5
					movlps xmm4, qword ptr [edi+64]
					movhps xmm4, qword ptr [edi+88]
					mulps xmm3, xmm7
					movlps xmm5, qword ptr [edi+112]
					movhps xmm5, qword ptr [edi+136]
					addps xmm0, xmm3
					mulps xmm4, xmm7
					mulps xmm5, xmm7
					addps xmm1, xmm4
					addps xmm2, xmm5
					movaps xmm6, xmm0
					shufps xmm0, xmm1, 0x88
					shufps xmm6, xmm1, 0xDD
					movaps xmm7, xmm2
					shufps xmm7, xmm2, 0x88
					shufps xmm2, xmm2, 0xDD
					addps xmm0, xmm6
					addps xmm2, xmm7
					STORE4( 0, xmm0, xmm3 )
					STORE2LO( 16, xmm2, xmm4 )
				}
				return;
			}
			default: {
				for ( int i = 0; i < numRows; i++ ) {
					dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
					mPtr += 6;
				}
				return;
			}
		}
		break;
	}
	default: {
		int numColumns = mat.GetNumColumns();
		for ( int i = 0; i < numRows; i++ ) {
			float sum = mPtr[0] * vPtr[0];
			for ( int j = 1; j < numColumns; j++ ) {
				sum += mPtr[j] * vPtr[j];
			}
			dstPtr[i] STOREC sum;
			mPtr += numColumns;
		}
		break;
	}
}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}

/*
============
idSIMD_SSE::MatX_MultiplySubVecX

	optimizes the following matrix multiplications:

	NxN * Nx1
	Nx6 * 6x1
	6xN * Nx1

	with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
	__asm movss reg2, [eax+offset] \
	__asm subss reg2, reg1 \
	__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm subps reg2, reg1 \
	__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
	__asm movhps reg2, [eax+offset] \
	__asm subps reg2, reg1 \
	__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm movhps reg2, [eax+offset+8] \
	__asm subps reg2, reg1 \
	__asm movlps [eax+offset], reg2 \
	__asm movhps [eax+offset+8], reg2
#define STOREC -=

	int numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
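	// MatX_MultiplySubVecX reuses the kernels of MatX_MultiplyAddVecX above;
	// only the STORE* macros differ.  They read the current dst value through
	// a spare xmm register, apply subps/subss instead of addps/addss, and
	// write the result back, while STOREC splices "-=" into the generic C
	// fallback loops.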
vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); switch( mat.GetNumColumns() ) { case 1: { switch( numRows ) { case 1: { // 1x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] STORE1( 0, xmm0, xmm1 ) } return; } case 6: { // 6x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0]; mPtr++; } return; } } break; } case 2: { switch( numRows ) { case 2: { // 2x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm1, [esi+4] movss xmm2, [edi] mulss xmm2, xmm0 movss xmm3, [edi+4] mulss xmm3, xmm1 addss xmm2, xmm3 STORE1( 0, xmm2, xmm4 ) mulss xmm0, [edi+8] mulss xmm1, [edi+8+4] addss xmm0, xmm1 STORE1( 4, xmm0, xmm4 ) } return; } case 6: { // 6x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, [esi] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movaps xmm0, [edi] mulps xmm0, xmm7 movaps xmm1, [edi+16] mulps xmm1, xmm7 movaps xmm2, xmm0 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 ) movaps xmm3, [edi+32] addps xmm0, xmm2 mulps xmm3, xmm7 STORE4( 0, xmm0, xmm4 ) shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm1, xmm3 addps xmm3, xmm1 STORE2LO( 16, xmm3, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; mPtr += 2; } return; } } break; } case 3: { switch( numRows ) { case 3: { // 3x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm4, [edi] mulss xmm4, xmm0 movss xmm1, [esi+4] movss xmm5, [edi+4] mulss xmm5, xmm1 addss xmm4, xmm5 movss xmm2, [esi+8] movss xmm6, [edi+8] mulss xmm6, xmm2 addss xmm4, xmm6 movss xmm3, [edi+12] mulss xmm3, xmm0 STORE1( 0, xmm4, xmm7 ); movss xmm5, [edi+12+4] mulss xmm5, xmm1 addss xmm3, xmm5 movss xmm6, [edi+12+8] mulss xmm6, xmm2 addss xmm3, xmm6 mulss xmm0, [edi+24] mulss xmm1, [edi+24+4] STORE1( 4, xmm3, xmm7 ); addss xmm0, xmm1 mulss xmm2, [edi+24+8] addss xmm0, xmm2 STORE1( 8, xmm0, xmm7 ); } return; } case 6: { // 6x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm5, [esi] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [esi+4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 movlps xmm1, [edi+4*4] shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 movlps xmm2, [edi+6*4] movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 mulps xmm0, xmm5 movlps xmm3, [edi+10*4] shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 movaps xmm3, xmm1 shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 mulps xmm1, xmm6 shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 mulps xmm3, xmm7 addps xmm0, xmm1 addps xmm0, xmm3 STORE4( 0, xmm0, xmm4 ) movss xmm1, [edi+12*4] mulss xmm1, xmm5 movss xmm2, [edi+13*4] mulss xmm2, xmm6 movss xmm3, [edi+14*4] mulss xmm3, xmm7 addss xmm1, xmm2 addss xmm1, xmm3 STORE1( 16, xmm1, xmm4 ) mulss xmm5, [edi+15*4] mulss xmm6, [edi+16*4] mulss xmm7, [edi+17*4] addss xmm5, xmm6 addss xmm5, xmm7 STORE1( 20, xmm5, xmm4 ) } return; } default: { for ( int i = 0; i < 
numRows; i++ ) {
					dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
					mPtr += 3;
				}
				return;
			}
		}
		break;
	}
	case 4: {
		switch( numRows ) {
			case 4: {		// 4x4 * 4x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					movlps xmm6, qword ptr [esi]
					movlps xmm0, qword ptr [edi]
					shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
					movhps xmm0, qword ptr [edi+16]
					mulps xmm0, xmm6
					movlps xmm7, qword ptr [esi+8]
					movlps xmm2, qword ptr [edi+8]
					shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
					movhps xmm2, qword ptr [edi+24]
					mulps xmm2, xmm7
					movlps xmm1, qword ptr [edi+32]
					movhps xmm1, qword ptr [edi+48]
					mulps xmm1, xmm6
					movlps xmm3, qword ptr [edi+40]
					addps xmm0, xmm2
					movhps xmm3, qword ptr [edi+56]
					mulps xmm3, xmm7
					movaps xmm4, xmm0
					addps xmm1, xmm3
					shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
					shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
					addps xmm0, xmm4
					STORE4( 0, xmm0, xmm2 )
				}
				return;
			}
			case 6: {		// 6x4 * 4x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					movlps xmm6, qword ptr [esi+0]
					movlps xmm0, qword ptr [edi+0]
					shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
					movhps xmm0, qword ptr [edi+16]
					mulps xmm0, xmm6
					movlps xmm7, qword ptr [esi+8]
					movlps xmm2, qword ptr [edi+8]
					shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
					movhps xmm2, qword ptr [edi+24]
					mulps xmm2, xmm7
					movlps xmm1, qword ptr [edi+32]
					movhps xmm1, qword ptr [edi+48]
					mulps xmm1, xmm6
					movlps xmm3, qword ptr [edi+40]
					addps xmm0, xmm2
					movhps xmm3, qword ptr [edi+56]
					mulps xmm3, xmm7
					movaps xmm4, xmm0
					addps xmm1, xmm3
					shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
					shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
					addps xmm0, xmm4
					movlps xmm1, qword ptr [edi+64]
					movhps xmm1, qword ptr [edi+80]
					STORE4( 0, xmm0, xmm4 )
					mulps xmm1, xmm6
					movlps xmm2, qword ptr [edi+72]
					movhps xmm2, qword ptr [edi+88]
					mulps xmm2, xmm7
					addps xmm1, xmm2
					shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
					movhlps xmm3, xmm1
					addps xmm1, xmm3
					STORE2LO( 16, xmm1, xmm4 )
				}
				return;
			}
			default: {
				for ( int i = 0; i < numRows; i++ ) {
					dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
					mPtr += 4;
				}
				return;
			}
		}
		break;
	}
	case 5: {
		switch( numRows ) {
			case 5: {		// 5x5 * 5x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					movss xmm0, [edi+5*4]		// xmm0 = 5, X, X, X
					movhps xmm0, [edi+0*4]		// xmm0 = 5, X, 0, 1
					movss xmm5, [edi+15*4]		// xmm5 = 15, X, X, X
					movhps xmm5, [edi+10*4]		// xmm5 = 15, X, 10, 11
					movaps xmm1, xmm0		// xmm1 = 5, X, 0, 1
					shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 )	// xmm0 = 0, 5, 10, 15
					movlps xmm1, [edi+6*4]		// xmm1 = 6, 7, 0, 1
					movlps xmm5, [edi+16*4]		// xmm5 = 16, 17, 10, 11
					movaps xmm2, xmm1		// xmm2 = 6, 7, 0, 1
					shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm1 = 1, 6, 11, 16
					movhps xmm2, [edi+2*4]		// xmm2 = 6, 7, 2, 3
					movhps xmm5, [edi+12*4]		// xmm5 = 16, 17, 12, 13
					movaps xmm3, xmm2		// xmm3 = 6, 7, 2, 3
					shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 )	// xmm2 = 2, 7, 12, 17
					movlps xmm3, [edi+8*4]		// xmm3 = 8, 9, 2, 3
					movlps xmm5, [edi+18*4]		// xmm5 = 18, 19, 12, 13
					movss xmm4, [edi+4*4]		// xmm4 = 4, X, X, X
					movlhps xmm4, xmm3		// xmm4 = 4, X, 8, 9
					shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 )	// xmm3 = 3, 8, 13, 18
					movhps xmm5, [edi+14*4]		// xmm5 = 18, 19, 14, 15
					shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 )	// xmm4 = 4, 9, 14, 19
					movss xmm7, [esi+0*4]
					shufps xmm7, xmm7, 0
					mulps xmm0, xmm7
					movss xmm5, [esi+1*4]
					shufps xmm5, xmm5, 0
					mulps xmm1, xmm5
					addps xmm0, xmm1
					movss xmm6, [esi+2*4]
					shufps xmm6, xmm6, 0
					mulps xmm2, xmm6
					addps xmm0, xmm2
					movss xmm1, [esi+3*4]
					shufps xmm1, xmm1, 0
					mulps xmm3, xmm1
					addps xmm0,
xmm3 movss xmm2, [esi+4*4] shufps xmm2, xmm2, 0 mulps xmm4, xmm2 addps xmm0, xmm4 mulss xmm7, [edi+20*4] mulss xmm5, [edi+21*4] addps xmm7, xmm5 mulss xmm6, [edi+22*4] addps xmm7, xmm6 mulss xmm1, [edi+23*4] addps xmm7, xmm1 mulss xmm2, [edi+24*4] addps xmm7, xmm2 STORE4( 0, xmm0, xmm3 ) STORE1( 16, xmm7, xmm4 ) } return; } case 6: { // 6x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [esi] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 ) movlps xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 ) movlps xmm0, [edi] movhps xmm3, [edi+8] movaps xmm1, [edi+16] movlps xmm2, [edi+32] shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 mulps xmm0, xmm6 mulps xmm3, xmm7 movlps xmm2, [edi+40] addps xmm0, xmm3 // xmm0 + xmm1 movhps xmm5, [edi+40+8] movlps xmm3, [edi+40+16] movhps xmm3, [edi+40+24] movlps xmm4, [edi+40+32] shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 mulps xmm2, xmm6 mulps xmm5, xmm7 addps xmm2, xmm5 // xmm2 + xmm3 movss xmm5, [esi+16] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm4, xmm0 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ) shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 ) addps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 STORE4( 0, xmm0, xmm2 ) movlps xmm4, [edi+80] movhps xmm3, [edi+80+8] movaps xmm1, [edi+80+16] movlps xmm2, [edi+80+32] shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 mulps xmm4, xmm6 mulps xmm3, xmm7 mulps xmm1, xmm5 addps xmm4, xmm3 // xmm4 + xmm1 shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 ) shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 ) addps xmm4, xmm1 shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 ) addps xmm4, xmm1 STORE2LO( 16, xmm4, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; mPtr += 5; } return; } } break; } case 6: { switch( numRows ) { case 1: { // 1x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] movss xmm1, [esi+4] mulss xmm1, [edi+4] movss xmm2, [esi+8] addss xmm0, xmm1 mulss xmm2, [edi+8] movss xmm3, [esi+12] addss xmm0, xmm2 mulss xmm3, [edi+12] movss xmm4, [esi+16] addss xmm0, xmm3 mulss xmm4, [edi+16] movss xmm5, [esi+20] addss xmm0, xmm4 mulss xmm5, [edi+20] movss xmm6, [esi+24] addss xmm0, xmm5 mulss xmm6, [edi+24] addss xmm0, xmm6 STORE1( 0, xmm0, xmm7 ) } return; } case 2: { // 2x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) } return; } case 3: { // 3x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load 
idVecX
					movlps xmm4, [esi]
					movhps xmm4, [esi+8]
					movlps xmm5, [esi+16]
					movlhps xmm5, xmm4
					movhlps xmm6, xmm4
					movlhps xmm6, xmm5
					// row 0 and 1
					movaps xmm0, [edi]
					movaps xmm1, [edi+16]
					movaps xmm2, [edi+32]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					movhlps xmm3, xmm0
					movlhps xmm3, xmm2
					addps xmm1, xmm3
					shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
					addps xmm1, xmm0
					shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
					movhlps xmm0, xmm1
					addps xmm0, xmm1
					STORE2LO( 0, xmm0, xmm3 )
					// row 2
					movaps xmm0, [edi+48]
					movaps xmm1, [edi+48+16]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					addps xmm0, xmm1
					movhlps xmm1, xmm0
					addps xmm0, xmm1
					movaps xmm1, xmm0
					shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
					addss xmm0, xmm1
					STORE1( 8, xmm0, xmm3 )
				}
				return;
			}
			case 4: {		// 4x6 * 6x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					// load idVecX
					movlps xmm4, [esi]
					movhps xmm4, [esi+8]
					movlps xmm5, [esi+16]
					movlhps xmm5, xmm4
					movhlps xmm6, xmm4
					movlhps xmm6, xmm5
					// row 0 and 1
					movaps xmm0, [edi]
					movaps xmm1, [edi+16]
					movaps xmm2, [edi+32]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					movhlps xmm7, xmm0
					movlhps xmm7, xmm2
					addps xmm7, xmm1
					shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
					addps xmm7, xmm0
					// row 2 and 3
					movaps xmm0, [edi+48]
					movaps xmm1, [edi+48+16]
					movaps xmm2, [edi+48+32]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					movhlps xmm3, xmm0
					movlhps xmm3, xmm2
					addps xmm1, xmm3
					shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
					addps xmm1, xmm0
					// last 4 additions for the first 4 rows and store result
					movaps xmm0, xmm7
					shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
					shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
					addps xmm0, xmm7
					STORE4( 0, xmm0, xmm4 )
				}
				return;
			}
			case 5: {		// 5x6 * 6x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					// load idVecX
					movlps xmm4, [esi]
					movhps xmm4, [esi+8]
					movlps xmm5, [esi+16]
					movlhps xmm5, xmm4
					movhlps xmm6, xmm4
					movlhps xmm6, xmm5
					// row 0 and 1
					movaps xmm0, [edi]
					movaps xmm1, [edi+16]
					movaps xmm2, [edi+32]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					movhlps xmm7, xmm0
					movlhps xmm7, xmm2
					addps xmm7, xmm1
					shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
					addps xmm7, xmm0
					// row 2 and 3
					movaps xmm0, [edi+48]
					movaps xmm1, [edi+48+16]
					movaps xmm2, [edi+48+32]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					mulps xmm2, xmm6
					movhlps xmm3, xmm0
					movlhps xmm3, xmm2
					addps xmm1, xmm3
					shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
					addps xmm1, xmm0
					// last 4 additions for the first 4 rows and store result
					movaps xmm0, xmm7
					shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
					shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
					addps xmm0, xmm7
					STORE4( 0, xmm0, xmm3 )
					// row 4
					movaps xmm0, [edi+96]
					movaps xmm1, [edi+96+16]
					mulps xmm0, xmm4
					mulps xmm1, xmm5
					addps xmm0, xmm1
					movhlps xmm1, xmm0
					addps xmm0, xmm1
					movaps xmm1, xmm0
					shufps xmm1, xmm1, 0x01
					addss xmm0, xmm1
					STORE1( 16, xmm0, xmm3 )
				}
				return;
			}
			case 6: {		// 6x6 * 6x1
				__asm {
					mov esi, vPtr
					mov edi, mPtr
					mov eax, dstPtr
					movlps xmm7, qword ptr [esi]
					movlps xmm6, qword ptr [esi+8]
					shufps xmm7, xmm7, 0x44
					shufps xmm6, xmm6, 0x44
					movlps xmm0, qword ptr [edi]
					movhps xmm0, qword ptr [edi+24]
					mulps xmm0, xmm7
					movlps xmm3, qword ptr [edi+8]
					movhps xmm3, qword ptr [edi+32]
					mulps xmm3, xmm6
					movlps xmm1, qword ptr [edi+48]
					movhps xmm1, qword ptr [edi+72]
					mulps xmm1, xmm7
					movlps xmm2, qword ptr [edi+96]
					movhps xmm2, qword ptr [edi+120]
					mulps xmm2, xmm7
					movlps xmm4, qword ptr [edi+56]
					movhps xmm4, qword ptr [edi+80]
					movlps xmm5, qword ptr [edi+104]
					movhps xmm5, qword ptr [edi+128]
					mulps xmm4, xmm6
					movlps xmm7, qword ptr [esi+16]
					addps xmm0, xmm3
					shufps xmm7, xmm7,
0x44 mulps xmm5, xmm6 addps xmm1, xmm4 movlps xmm3, qword ptr [edi+ 16] movhps xmm3, qword ptr [edi+ 40] addps xmm2, xmm5 movlps xmm4, qword ptr [edi+ 64] movhps xmm4, qword ptr [edi+ 88] mulps xmm3, xmm7 movlps xmm5, qword ptr [edi+112] movhps xmm5, qword ptr [edi+136] addps xmm0, xmm3 mulps xmm4, xmm7 mulps xmm5, xmm7 addps xmm1, xmm4 addps xmm2, xmm5 movaps xmm6, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm6, xmm1, 0xDD movaps xmm7, xmm2 shufps xmm7, xmm2, 0x88 shufps xmm2, xmm2, 0xDD addps xmm0, xmm6 addps xmm2, xmm7 STORE4( 0, xmm0, xmm3 ) STORE2LO( 16, xmm2, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; mPtr += 6; } return; } } break; } default: { int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numColumns; j++ ) { sum += mPtr[j] * vPtr[j]; } dstPtr[i] STOREC sum; mPtr += numColumns; } break; } } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_TransposeMultiplyVecX optimizes the following matrix multiplications: Nx6 * Nx1 6xN * 6x1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss [eax+offset], reg1 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps [eax+offset], reg1 #define STORE4( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 \ __asm movhps [eax+offset+8], reg1 #define STOREC = int numColumns; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumRows() ); assert( dst.GetSize() >= mat.GetNumColumns() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numColumns = mat.GetNumColumns(); switch( mat.GetNumRows() ) { case 1: switch( numColumns ) { case 6: { // 1x6 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm3 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0]; mPtr++; } return; } } break; case 2: switch( numColumns ) { case 6: { // 2x6 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movaps xmm1, xmm0 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) movaps xmm2, [edi] mulps xmm2, xmm0 movlps xmm3, [edi+24] movhps xmm3, [edi+32] mulps xmm3, xmm1 addps xmm2, xmm3 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) movlps xmm4, [edi+16] movhps xmm4, [edi+40] mulps xmm4, xmm0 movhlps xmm3, xmm4 addps xmm3, xmm4 STORE4( 0, xmm2, xmm5 ) STORE2LO( 16, xmm3, xmm6 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; mPtr++; } return; } } break; case 3: switch( numColumns ) { case 6: { // 3x6 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movss xmm1, [esi+2*4] movlps xmm3, [edi+(0*6+0)*4] movhps xmm3, [edi+(0*6+2)*4] movaps xmm4, xmm0 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, 
xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(2*6+4)*4] mulps xmm5, xmm1 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; mPtr++; } return; } } break; case 4: switch( numColumns ) { case 6: { // 4x6 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3]; mPtr++; } return; } } break; case 5: switch( numColumns ) { case 6: { // 5x6 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movss xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm4, xmm2 mulps xmm4, [edi+(4*6+0)*4] addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; mPtr++; } return; } } break; case 6: switch( numColumns ) { case 1: { // 6x1 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movhps xmm0, [esi+8] movlps xmm1, [esi+16] mulps xmm0, [edi] mulps xmm1, 
[edi+16] shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) addps xmm0, xmm1 movhlps xmm2, xmm0 addss xmm2, xmm0 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm2, xmm0 STORE1( 0, xmm2, xmm3 ) } return; } case 2: { // 6x2 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm6, [edi+0*4] mulps xmm6, xmm0 movlps xmm1, [esi+2*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm7, [edi+4*4] mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm2, [esi+4*4] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm7, [edi+8*4] mulps xmm7, xmm2 addps xmm6, xmm7 movhlps xmm3, xmm6 addps xmm3, xmm6 STORE2LO( 0, xmm3, xmm7 ) } return; } case 3: { // 6x3 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+(0*3+2)*4] movhps xmm0, [edi+(0*3+0)*4] shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm6, [esi+0*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm1, [edi+(1*3+0)*4] movhps xmm1, [edi+(1*3+1)*4] movss xmm7, [esi+1*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm2, [edi+(2*3+2)*4] movhps xmm2, [edi+(2*3+0)*4] shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm7, [esi+2*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movss xmm3, [edi+(3*3+0)*4] movhps xmm3, [edi+(3*3+1)*4] movss xmm7, [esi+3*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movss xmm4, [edi+(4*3+2)*4] movhps xmm4, [edi+(4*3+0)*4] shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm7, [esi+4*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movss xmm5, [edi+(5*3+0)*4] movhps xmm5, [edi+(5*3+1)*4] movss xmm7, [esi+5*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE1( 0, xmm6, xmm7 ) STORE2HI( 4, xmm6, xmm7 ) } return; } case 4: { // 6x4 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm3, [edi+(0*4+0)*4] movhps xmm3, [edi+(0*4+2)*4] movss xmm4, [esi+0*4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*4+0)*4] movhps xmm5, [edi+(1*4+2)*4] movss xmm6, [esi+1*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*4+0)*4] movhps xmm4, [edi+(2*4+2)*4] movss xmm6, [esi+2*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*4+0)*4] movhps xmm5, [edi+(3*4+2)*4] movss xmm6, [esi+3*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(4*4+0)*4] movhps xmm4, [edi+(4*4+2)*4] movss xmm6, [esi+4*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(5*4+0)*4] movhps xmm5, [edi+(5*4+2)*4] movss xmm6, [esi+5*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) } return; } case 5: { // 6x5 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [edi+(0*5+0)*4] movhps xmm6, [edi+(0*5+2)*4] movss xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movlps xmm7, [edi+(1*5+0)*4] movhps xmm7, [edi+(1*5+2)*4] movss xmm1, [esi+1*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm7, [edi+(2*5+0)*4] movhps xmm7, [edi+(2*5+2)*4] movss xmm2, [esi+2*4] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movlps xmm7, [edi+(3*5+0)*4] movhps xmm7, 
[edi+(3*5+2)*4] movss xmm3, [esi+3*4] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movlps xmm7, [edi+(4*5+0)*4] movhps xmm7, [edi+(4*5+2)*4] movss xmm4, [esi+4*4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movlps xmm7, [edi+(5*5+0)*4] movhps xmm7, [edi+(5*5+2)*4] movss xmm5, [esi+5*4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE4( 0, xmm6, xmm7 ) movss xmm6, [edi+(0*5+4)*4] mulss xmm6, xmm0 movss xmm7, [edi+(1*5+4)*4] mulss xmm7, xmm1 addss xmm6, xmm7 movss xmm7, [edi+(2*5+4)*4] mulss xmm7, xmm2 addss xmm6, xmm7 movss xmm7, [edi+(3*5+4)*4] mulss xmm7, xmm3 addss xmm6, xmm7 movss xmm7, [edi+(4*5+4)*4] mulss xmm7, xmm4 addss xmm6, xmm7 movss xmm7, [edi+(5*5+4)*4] mulss xmm7, xmm5 addss xmm6, xmm7 STORE1( 16, xmm6, xmm7 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movlps xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(4*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) movlps xmm5, [edi+(5*6+0)*4] movhps xmm5, [edi+(5*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] movhps xmm5, [edi+(5*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; mPtr++; } return; } } break; default: int numRows = mat.GetNumRows(); for ( int i = 0; i < numColumns; i++ ) { mPtr = mat.ToFloatPtr() + i; float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numRows; j++ ) { mPtr += numColumns; sum += mPtr[0] * vPtr[j]; } dstPtr[i] STOREC sum; } break; } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_TransposeMultiplyAddVecX optimizes the following matrix multiplications: Nx6 * Nx1 6xN * 6x1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ __asm addss reg2, reg1 \ __asm movss [eax+offset], reg2 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movhps [eax+offset], reg2 #define 
STORE4( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm movhps reg2, [eax+offset+8] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 \ __asm movhps [eax+offset+8], reg2 #define STOREC += int numColumns; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumRows() ); assert( dst.GetSize() >= mat.GetNumColumns() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numColumns = mat.GetNumColumns(); switch( mat.GetNumRows() ) { case 1: switch( numColumns ) { case 6: { // 1x6 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm3 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0]; mPtr++; } return; } } break; case 2: switch( numColumns ) { case 6: { // 2x6 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movaps xmm1, xmm0 shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) movaps xmm2, [edi] mulps xmm2, xmm0 movlps xmm3, [edi+24] movhps xmm3, [edi+32] mulps xmm3, xmm1 addps xmm2, xmm3 shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) movlps xmm4, [edi+16] movhps xmm4, [edi+40] mulps xmm4, xmm0 movhlps xmm3, xmm4 addps xmm3, xmm4 STORE4( 0, xmm2, xmm5 ) STORE2LO( 16, xmm3, xmm6 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; mPtr++; } return; } } break; case 3: switch( numColumns ) { case 6: { // 3x6 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movss xmm1, [esi+2*4] movlps xmm3, [edi+(0*6+0)*4] movhps xmm3, [edi+(0*6+2)*4] movaps xmm4, xmm0 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(2*6+4)*4] mulps xmm5, xmm1 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; mPtr++; } return; } } break; case 4: switch( numColumns ) { case 6: { // 4x6 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 
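					// columns 4 and 5: xmm0/xmm1 now hold ( v0, v0, v1, v1 ) and
					// ( v2, v2, v3, v3 ), so the two remaining matrix columns are
					// accumulated for all four rows and folded with movhlps below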
movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3]; mPtr++; } return; } } break; case 5: switch( numColumns ) { case 6: { // 5x6 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movss xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm4, xmm2 mulps xmm4, [edi+(4*6+0)*4] addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; mPtr++; } return; } } break; case 6: switch( numColumns ) { case 1: { // 6x1 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movhps xmm0, [esi+8] movlps xmm1, [esi+16] mulps xmm0, [edi] mulps xmm1, [edi+16] shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) addps xmm0, xmm1 movhlps xmm2, xmm0 addss xmm2, xmm0 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm2, xmm0 STORE1( 0, xmm2, xmm3 ) } return; } case 2: { // 6x2 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm6, [edi+0*4] mulps xmm6, xmm0 movlps xmm1, [esi+2*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm7, [edi+4*4] mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm2, [esi+4*4] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm7, [edi+8*4] mulps xmm7, xmm2 addps xmm6, xmm7 movhlps xmm3, xmm6 addps xmm3, xmm6 STORE2LO( 0, xmm3, xmm7 ) } return; } case 3: { // 6x3 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+(0*3+2)*4] movhps xmm0, [edi+(0*3+0)*4] shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm6, [esi+0*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm1, [edi+(1*3+0)*4] movhps xmm1, [edi+(1*3+1)*4] movss xmm7, [esi+1*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm2, [edi+(2*3+2)*4] movhps xmm2, [edi+(2*3+0)*4] shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm7, [esi+2*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movss xmm3, [edi+(3*3+0)*4] movhps xmm3, [edi+(3*3+1)*4] movss xmm7, [esi+3*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movss xmm4, 
[edi+(4*3+2)*4] movhps xmm4, [edi+(4*3+0)*4] shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm7, [esi+4*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movss xmm5, [edi+(5*3+0)*4] movhps xmm5, [edi+(5*3+1)*4] movss xmm7, [esi+5*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE1( 0, xmm6, xmm7 ) STORE2HI( 4, xmm6, xmm7 ) } return; } case 4: { // 6x4 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm3, [edi+(0*4+0)*4] movhps xmm3, [edi+(0*4+2)*4] movss xmm4, [esi+0*4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*4+0)*4] movhps xmm5, [edi+(1*4+2)*4] movss xmm6, [esi+1*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*4+0)*4] movhps xmm4, [edi+(2*4+2)*4] movss xmm6, [esi+2*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*4+0)*4] movhps xmm5, [edi+(3*4+2)*4] movss xmm6, [esi+3*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(4*4+0)*4] movhps xmm4, [edi+(4*4+2)*4] movss xmm6, [esi+4*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(5*4+0)*4] movhps xmm5, [edi+(5*4+2)*4] movss xmm6, [esi+5*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) } return; } case 5: { // 6x5 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [edi+(0*5+0)*4] movhps xmm6, [edi+(0*5+2)*4] movss xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movlps xmm7, [edi+(1*5+0)*4] movhps xmm7, [edi+(1*5+2)*4] movss xmm1, [esi+1*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm7, [edi+(2*5+0)*4] movhps xmm7, [edi+(2*5+2)*4] movss xmm2, [esi+2*4] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movlps xmm7, [edi+(3*5+0)*4] movhps xmm7, [edi+(3*5+2)*4] movss xmm3, [esi+3*4] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movlps xmm7, [edi+(4*5+0)*4] movhps xmm7, [edi+(4*5+2)*4] movss xmm4, [esi+4*4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movlps xmm7, [edi+(5*5+0)*4] movhps xmm7, [edi+(5*5+2)*4] movss xmm5, [esi+5*4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE4( 0, xmm6, xmm7 ) movss xmm6, [edi+(0*5+4)*4] mulss xmm6, xmm0 movss xmm7, [edi+(1*5+4)*4] mulss xmm7, xmm1 addss xmm6, xmm7 movss xmm7, [edi+(2*5+4)*4] mulss xmm7, xmm2 addss xmm6, xmm7 movss xmm7, [edi+(3*5+4)*4] mulss xmm7, xmm3 addss xmm6, xmm7 movss xmm7, [edi+(4*5+4)*4] mulss xmm7, xmm4 addss xmm6, xmm7 movss xmm7, [edi+(5*5+4)*4] mulss xmm7, xmm5 addss xmm6, xmm7 STORE1( 16, xmm6, xmm7 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movlps xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, 
xmm2
					shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
					mulps xmm6, [edi+(4*6+0)*4]
					addps xmm3, xmm6
					movaps xmm6, xmm2
					shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
					movlps xmm5, [edi+(5*6+0)*4]
					movhps xmm5, [edi+(5*6+2)*4]
					mulps xmm5, xmm6
					addps xmm3, xmm5
					STORE4( 0, xmm3, xmm7 )
					shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
					shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
					shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
					movlps xmm3, [edi+(0*6+4)*4]
					movhps xmm3, [edi+(1*6+4)*4]
					mulps xmm3, xmm0
					movlps xmm4, [edi+(2*6+4)*4]
					movhps xmm4, [edi+(3*6+4)*4]
					mulps xmm4, xmm1
					addps xmm3, xmm4
					movlps xmm5, [edi+(4*6+4)*4]
					movhps xmm5, [edi+(5*6+4)*4]
					mulps xmm5, xmm2
					addps xmm3, xmm5
					movhlps xmm4, xmm3
					addps xmm3, xmm4
					STORE2LO( 16, xmm3, xmm7 )
				}
				return;
			}
			default: {
				for ( int i = 0; i < numColumns; i++ ) {
					dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
					mPtr++;
				}
				return;
			}
		}
		break;
	default:
		int numRows = mat.GetNumRows();
		for ( int i = 0; i < numColumns; i++ ) {
			mPtr = mat.ToFloatPtr() + i;
			float sum = mPtr[0] * vPtr[0];
			for ( int j = 1; j < numRows; j++ ) {
				mPtr += numColumns;
				sum += mPtr[0] * vPtr[j];
			}
			dstPtr[i] STOREC sum;
		}
		break;
	}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}

/*
============
idSIMD_SSE::MatX_TransposeMultiplySubVecX

	optimizes the following matrix multiplications:

	Nx6 * Nx1
	6xN * 6x1

	with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
	__asm movss reg2, [eax+offset] \
	__asm subss reg2, reg1 \
	__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm subps reg2, reg1 \
	__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
	__asm movhps reg2, [eax+offset] \
	__asm subps reg2, reg1 \
	__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
	__asm movlps reg2, [eax+offset] \
	__asm movhps reg2, [eax+offset+8] \
	__asm subps reg2, reg1 \
	__asm movlps [eax+offset], reg2 \
	__asm movhps [eax+offset+8], reg2
#define STOREC -=

	int numColumns;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numColumns = mat.GetNumColumns();
	switch( mat.GetNumRows() ) {
		case 1:
			switch( numColumns ) {
				case 6: {		// 1x6 * 1x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movss xmm0, [esi]
						shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
						movaps xmm1, xmm0
						mulps xmm0, [edi]
						mulps xmm1, [edi+16]
						STORE4( 0, xmm0, xmm2 )
						STORE2LO( 16, xmm1, xmm3 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
						dstPtr[i] STOREC *(mPtr) * vPtr[0];
						mPtr++;
					}
					return;
				}
			}
			break;
		case 2:
			switch( numColumns ) {
				case 6: {		// 2x6 * 2x1
					__asm {
						mov esi, vPtr
						mov edi, mPtr
						mov eax, dstPtr
						movlps xmm0, [esi]
						movaps xmm1, xmm0
						shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
						shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
						movaps xmm2, [edi]
						mulps xmm2, xmm0
						movlps xmm3, [edi+24]
						movhps xmm3, [edi+32]
						mulps xmm3, xmm1
						addps xmm2, xmm3
						shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
						movlps xmm4, [edi+16]
						movhps xmm4, [edi+40]
						mulps xmm4, xmm0
						movhlps xmm3, xmm4
						addps xmm3, xmm4
						STORE4( 0, xmm2, xmm5 )
						STORE2LO( 16, xmm3, xmm6 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
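						// walk the matrix one column at a time: the element of
						// row j in column i lives at mPtr + j*numColumns, and
						// mPtr++ steps to the next column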
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; mPtr++; } return; } } break; case 3: switch( numColumns ) { case 6: { // 3x6 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movss xmm1, [esi+2*4] movlps xmm3, [edi+(0*6+0)*4] movhps xmm3, [edi+(0*6+2)*4] movaps xmm4, xmm0 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(2*6+4)*4] mulps xmm5, xmm1 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; mPtr++; } return; } } break; case 4: switch( numColumns ) { case 6: { // 4x6 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3]; mPtr++; } return; } } break; case 5: switch( numColumns ) { case 6: { // 5x6 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movss xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm4, xmm2 mulps xmm4, [edi+(4*6+0)*4] addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] 
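					// fold in row 4's contribution to columns 4 and 5 via the
					// splatted vPtr[4] still held in xmm2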
mulps xmm5, xmm2 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; mPtr++; } return; } } break; case 6: switch( numColumns ) { case 1: { // 6x1 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movhps xmm0, [esi+8] movlps xmm1, [esi+16] mulps xmm0, [edi] mulps xmm1, [edi+16] shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 ) addps xmm0, xmm1 movhlps xmm2, xmm0 addss xmm2, xmm0 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm2, xmm0 STORE1( 0, xmm2, xmm3 ) } return; } case 2: { // 6x2 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm6, [edi+0*4] mulps xmm6, xmm0 movlps xmm1, [esi+2*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm7, [edi+4*4] mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm2, [esi+4*4] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm7, [edi+8*4] mulps xmm7, xmm2 addps xmm6, xmm7 movhlps xmm3, xmm6 addps xmm3, xmm6 STORE2LO( 0, xmm3, xmm7 ) } return; } case 3: { // 6x3 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+(0*3+2)*4] movhps xmm0, [edi+(0*3+0)*4] shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm6, [esi+0*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm1, [edi+(1*3+0)*4] movhps xmm1, [edi+(1*3+1)*4] movss xmm7, [esi+1*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm2, [edi+(2*3+2)*4] movhps xmm2, [edi+(2*3+0)*4] shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm7, [esi+2*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movss xmm3, [edi+(3*3+0)*4] movhps xmm3, [edi+(3*3+1)*4] movss xmm7, [esi+3*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movss xmm4, [edi+(4*3+2)*4] movhps xmm4, [edi+(4*3+0)*4] shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 ) movss xmm7, [esi+4*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movss xmm5, [edi+(5*3+0)*4] movhps xmm5, [edi+(5*3+1)*4] movss xmm7, [esi+5*4] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE1( 0, xmm6, xmm7 ) STORE2HI( 4, xmm6, xmm7 ) } return; } case 4: { // 6x4 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm3, [edi+(0*4+0)*4] movhps xmm3, [edi+(0*4+2)*4] movss xmm4, [esi+0*4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*4+0)*4] movhps xmm5, [edi+(1*4+2)*4] movss xmm6, [esi+1*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*4+0)*4] movhps xmm4, [edi+(2*4+2)*4] movss xmm6, [esi+2*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*4+0)*4] movhps xmm5, [edi+(3*4+2)*4] movss xmm6, [esi+3*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(4*4+0)*4] movhps xmm4, [edi+(4*4+2)*4] movss xmm6, [esi+4*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(5*4+0)*4] movhps xmm5, [edi+(5*4+2)*4] movss xmm6, [esi+5*4] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) } return; } case 5: { // 6x5 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, 
dstPtr movlps xmm6, [edi+(0*5+0)*4] movhps xmm6, [edi+(0*5+2)*4] movss xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movlps xmm7, [edi+(1*5+0)*4] movhps xmm7, [edi+(1*5+2)*4] movss xmm1, [esi+1*4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm7, [edi+(2*5+0)*4] movhps xmm7, [edi+(2*5+2)*4] movss xmm2, [esi+2*4] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movlps xmm7, [edi+(3*5+0)*4] movhps xmm7, [edi+(3*5+2)*4] movss xmm3, [esi+3*4] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movlps xmm7, [edi+(4*5+0)*4] movhps xmm7, [edi+(4*5+2)*4] movss xmm4, [esi+4*4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movlps xmm7, [edi+(5*5+0)*4] movhps xmm7, [edi+(5*5+2)*4] movss xmm5, [esi+5*4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE4( 0, xmm6, xmm7 ) movss xmm6, [edi+(0*5+4)*4] mulss xmm6, xmm0 movss xmm7, [edi+(1*5+4)*4] mulss xmm7, xmm1 addss xmm6, xmm7 movss xmm7, [edi+(2*5+4)*4] mulss xmm7, xmm2 addss xmm6, xmm7 movss xmm7, [edi+(3*5+4)*4] mulss xmm7, xmm3 addss xmm6, xmm7 movss xmm7, [edi+(4*5+4)*4] mulss xmm7, xmm4 addss xmm6, xmm7 movss xmm7, [edi+(5*5+4)*4] mulss xmm7, xmm5 addss xmm6, xmm7 STORE1( 16, xmm6, xmm7 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movlps xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, [edi+(4*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) movlps xmm5, [edi+(5*6+0)*4] movhps xmm5, [edi+(5*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] movhps xmm5, [edi+(5*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; mPtr++; } return; } } break; default: int numRows = mat.GetNumRows(); for ( int i = 0; i < numColumns; i++ ) { mPtr = mat.ToFloatPtr() + i; float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numRows; j++ ) { mPtr += numColumns; sum += mPtr[0] * vPtr[j]; } dstPtr[i] STOREC sum; } break; } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_MultiplyMatX optimizes the following matrix multiplications: NxN * Nx6 6xN * Nx6 Nx6 * 6xN 6x6 * 6xN with N in the range [1-6]. 
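Each case computes the same result as the generic row-major fallback,
dst[i][j] = the sum over n of m1[i][n] * m2[n][j].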
The hot cache clock cycle counts are generally better for the SIMD version than the FPU version. At times up to 40% fewer clock cycles on a P3. In practice, however, the results are poor, probably due to memory access. ============ */ void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) { int i, j, k, l, n; float *dstPtr; const float *m1Ptr, *m2Ptr; double sum; assert( m1.GetNumColumns() == m2.GetNumRows() ); dstPtr = dst.ToFloatPtr(); m1Ptr = m1.ToFloatPtr(); m2Ptr = m2.ToFloatPtr(); k = m1.GetNumRows(); l = m2.GetNumColumns(); n = m1.GetNumColumns(); switch( n ) { case 1: { if ( !(l^6) ) { switch( k ) { case 1: { // 1x1 * 1x6, no precision loss compared to FPU version __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movss xmm0, [edi] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm1, [esi] mulps xmm1, xmm0 movaps [eax], xmm1 movlps xmm2, [esi+16] mulps xmm2, xmm0 movlps [eax+16], xmm2 } return; } case 6: { // 6x1 * 1x6, no precision loss compared to FPU version __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr xorps xmm1, xmm1 movaps xmm0, [edi] movlps xmm1, [edi+16] movlhps xmm1, xmm0 movhlps xmm2, xmm0 movlhps xmm2, xmm1 // row 0 and 1 movaps xmm3, [esi] movaps xmm4, xmm3 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm5, xmm3 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) movaps xmm6, xmm3 shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 movaps [eax], xmm4 movaps [eax+16], xmm5 movaps [eax+32], xmm6 // row 2 and 3 movaps xmm4, xmm3 shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) movaps xmm5, xmm3 shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 ) mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm3, xmm2 movaps [eax+48], xmm4 movaps [eax+64], xmm5 movaps [eax+80], xmm3 // row 4 and 5 movlps xmm3, [esi+16] movaps xmm4, xmm3 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm5, xmm3 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 ) mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm3, xmm2 movaps [eax+96], xmm4 movaps [eax+112], xmm5 movaps [eax+128], xmm3 } return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0]; m2Ptr++; } m1Ptr++; } break; } case 2: { if ( !(l^6) ) { switch( k ) { case 2: { // 2x2 * 2x6 #define MUL_Nx2_2x6_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movlps xmm1, [esi+16] \ __asm movhps xmm1, [esi+40] \ __asm movlps xmm2, [esi+24] \ __asm movhps xmm2, [esi+32] #define MUL_Nx2_2x6_ROW2( row ) \ __asm movaps xmm3, [edi+row*16] \ __asm movaps xmm5, xmm0 \ __asm movaps xmm4, xmm3 \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm4 \ __asm movaps xmm4, xmm3 \ __asm movaps xmm6, xmm2 \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm5, xmm6 \ __asm movaps [eax+row*48], xmm5 \ __asm movaps xmm4, xmm3 \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm movaps xmm7, xmm1 \ __asm mulps xmm7, xmm4 \ __asm movaps xmm4, xmm3 \ __asm movaps xmm5, xmm0 \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \ __asm mulps xmm5, xmm4 \ __asm movaps xmm4, xmm3 \ __asm movaps xmm6, xmm2 \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm5, xmm6 \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \ __asm movaps xmm6, xmm1 \ __asm mulps xmm6, xmm3 \ __asm movaps xmm4,
xmm7 \ __asm movlhps xmm7, xmm6 \ __asm movhlps xmm6, xmm4 \ __asm addps xmm6, xmm7 \ __asm movlps [eax+row*48+16], xmm6 \ __asm movlps [eax+row*48+24], xmm5 \ __asm movhps [eax+row*48+32], xmm5 \ __asm movhps [eax+row*48+40], xmm6 MUL_Nx2_2x6_INIT MUL_Nx2_2x6_ROW2( 0 ) return; } case 6: { // 6x2 * 2x6 MUL_Nx2_2x6_INIT MUL_Nx2_2x6_ROW2( 0 ) MUL_Nx2_2x6_ROW2( 1 ) MUL_Nx2_2x6_ROW2( 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l]; m2Ptr++; } m1Ptr += 2; } break; } case 3: { if ( !(l^6) ) { switch( k ) { case 3: { // 3x3 * 3x6 __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movaps xmm5, xmmword ptr [esi] movlps xmm6, qword ptr [esi+24] movhps xmm6, qword ptr [esi+32] movaps xmm7, xmmword ptr [esi+48] movss xmm0, dword ptr [edi] shufps xmm0, xmm0, 0 mulps xmm0, xmm5 movss xmm1, dword ptr [edi+4] shufps xmm1, xmm1, 0 mulps xmm1, xmm6 movss xmm2, dword ptr [edi+8] shufps xmm2, xmm2, 0 mulps xmm2, xmm7 addps xmm0, xmm1 addps xmm0, xmm2 movaps xmmword ptr [eax], xmm0 movss xmm3, dword ptr [edi+12] shufps xmm3, xmm3, 0 mulps xmm3, xmm5 movss xmm4, dword ptr [edi+16] shufps xmm4, xmm4, 0 mulps xmm4, xmm6 movss xmm0, dword ptr [edi+20] shufps xmm0, xmm0, 0 mulps xmm0, xmm7 addps xmm3, xmm4 addps xmm0, xmm3 movlps qword ptr [eax+24], xmm0 movhps qword ptr [eax+32], xmm0 movss xmm1, dword ptr [edi+24] shufps xmm1, xmm1, 0 mulps xmm1, xmm5 movss xmm2, dword ptr [edi+28] shufps xmm2, xmm2, 0 mulps xmm2, xmm6 movss xmm3, dword ptr [edi+32] shufps xmm3, xmm3, 0 mulps xmm3, xmm7 addps xmm1, xmm2 addps xmm1, xmm3 movaps xmmword ptr [eax+48], xmm1 movlps xmm5, qword ptr [esi+16] movlps xmm6, qword ptr [esi+40] movlps xmm7, qword ptr [esi+64] shufps xmm5, xmm5, 0x44 shufps xmm6, xmm6, 0x44 shufps xmm7, xmm7, 0x44 movaps xmm3, xmmword ptr [edi] movlps xmm4, qword ptr [edi+16] movaps xmm0, xmm3 shufps xmm0, xmm0, 0xF0 mulps xmm0, xmm5 movaps xmm1, xmm3 shufps xmm1, xmm4, 0x05 mulps xmm1, xmm6 shufps xmm3, xmm4, 0x5A mulps xmm3, xmm7 addps xmm1, xmm0 addps xmm1, xmm3 movlps qword ptr [eax+16], xmm1 movhps qword ptr [eax+40], xmm1 movss xmm0, dword ptr [edi+24] shufps xmm0, xmm0, 0 mulps xmm0, xmm5 movss xmm2, dword ptr [edi+28] shufps xmm2, xmm2, 0 mulps xmm2, xmm6 movss xmm4, dword ptr [edi+32] shufps xmm4, xmm4, 0 mulps xmm4, xmm7 addps xmm0, xmm2 addps xmm0, xmm4 movlps qword ptr [eax+64], xmm0 } return; } case 6: { // 6x3 * 3x6 #define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] #define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \ __asm movss xmm3, [edi+(row*3+0)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm3, xmm0 \ __asm movss xmm4, [edi+(row*3+1)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm4, xmm1 \ __asm addps xmm3, xmm4 \ __asm movss xmm5, [edi+(row*3+2)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm5 \ __asm movlps [eax+(row*6+0)*4], xmm3 \ __asm movhps [eax+(row*6+2)*4], xmm3 #define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm movlps xmm2, [esi+16*4] \ __asm shufps xmm0, xmm0, 0x44 \ __asm shufps xmm1, xmm1, 0x44 \ __asm shufps xmm2, xmm2, 0x44 \ __asm movlps xmm3, [edi+0*4] \ __asm 
movhps xmm3, [edi+2*4] \ __asm movaps xmm4, xmm3 \ __asm movaps xmm5, xmm3 \ __asm shufps xmm3, xmm3, 0xF0 \ __asm mulps xmm3, xmm0 \ __asm movlps xmm6, [edi+4*4] \ __asm movhps xmm6, [edi+6*4] \ __asm shufps xmm4, xmm6, 0x05 \ __asm mulps xmm4, xmm1 \ __asm addps xmm3, xmm4 \ __asm shufps xmm5, xmm6, 0x5A \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm5 \ __asm movlps [eax+4*4], xmm3 \ __asm movhps [eax+10*4], xmm3 \ __asm movaps xmm5, xmm6 \ __asm movlps xmm3, [edi+8*4] \ __asm movhps xmm3, [edi+10*4] \ __asm movaps xmm4, xmm3 \ __asm shufps xmm5, xmm3, 0x5A \ __asm mulps xmm5, xmm0 \ __asm shufps xmm6, xmm3, 0xAF \ __asm mulps xmm6, xmm1 \ __asm addps xmm5, xmm6 \ __asm shufps xmm4, xmm4, 0xF0 \ __asm mulps xmm4, xmm2 \ __asm addps xmm4, xmm5 \ __asm movlps [eax+16*4], xmm4 \ __asm movhps [eax+22*4], xmm4 \ __asm movlps xmm6, [edi+12*4] \ __asm movhps xmm6, [edi+14*4] \ __asm movaps xmm5, xmm6 \ __asm movaps xmm4, xmm6 \ __asm shufps xmm6, xmm6, 0xF0 \ __asm mulps xmm6, xmm0 \ __asm movlps xmm3, [edi+16*4] \ __asm shufps xmm5, xmm3, 0x05 \ __asm mulps xmm5, xmm1 \ __asm addps xmm5, xmm6 \ __asm shufps xmm4, xmm3, 0x5A \ __asm mulps xmm4, xmm2 \ __asm addps xmm4, xmm5 \ __asm movlps [eax+28*4], xmm4 \ __asm movhps [eax+34*4], xmm4 MUL_Nx3_3x6_FIRST4COLUMNS_INIT MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 ) MUL_Nx3_3x6_LAST2COLUMNS_ROW6 return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l]; m2Ptr++; } m1Ptr += 3; } break; } case 4: { if ( !(l^6) ) { switch( k ) { case 4: { // 4x4 * 4x6 #define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] \ __asm movlps xmm3, [esi+18*4] \ __asm movhps xmm3, [esi+20*4] #define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \ __asm movss xmm4, [edi+row*16+0*4] \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm4, xmm0 \ __asm movss xmm5, [edi+row*16+1*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm4, xmm5 \ __asm movss xmm6, [edi+row*16+2*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm4, xmm6 \ __asm movss xmm7, [edi+row*16+3*4] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm3 \ __asm addps xmm4, xmm7 \ __asm movlps [eax+row*24+0], xmm4 \ __asm movhps [eax+row*24+8], xmm4 #define MUL_Nx4_4x6_LAST2COLUMNS_INIT \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm movlps xmm2, [esi+16*4] \ __asm movlps xmm3, [esi+22*4] \ __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) #define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \ __asm movlps xmm7, [edi+row*32+ 0*4] \ __asm movhps xmm7, [edi+row*32+ 4*4] \ __asm movaps xmm6, xmm7 \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \ __asm mulps xmm6, xmm0 \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \ __asm mulps xmm7, xmm1 \ __asm addps xmm6, xmm7 \ __asm movlps xmm4, [edi+row*32+ 
2*4] \ __asm movhps xmm4, [edi+row*32+ 6*4] \ __asm movaps xmm5, xmm4 \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \ __asm mulps xmm4, xmm3 \ __asm addps xmm6, xmm4 \ __asm movlps [eax+row*48+ 4*4], xmm6 \ __asm movhps [eax+row*48+10*4], xmm6 MUL_Nx4_4x6_FIRST4COLUMNS_INIT MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx4_4x6_LAST2COLUMNS_INIT MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) return; } case 6: { // 6x4 * 4x6 MUL_Nx4_4x6_FIRST4COLUMNS_INIT MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 ) MUL_Nx4_4x6_LAST2COLUMNS_INIT MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l]; m2Ptr++; } m1Ptr += 4; } break; } case 5: { if ( !(l^6) ) { switch( k ) { case 5: { // 5x5 * 5x6 #define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] \ __asm movlps xmm3, [esi+18*4] \ __asm movhps xmm3, [esi+20*4] \ __asm movlps xmm4, [esi+24*4] \ __asm movhps xmm4, [esi+26*4] #define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \ __asm movss xmm6, [edi+row*20+0*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm0 \ __asm movss xmm5, [edi+row*20+1*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+row*20+2*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+row*20+3*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm3 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+row*20+4*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm movlps [eax+row*24+0], xmm6 \ __asm movhps [eax+row*24+8], xmm6 #define MUL_Nx5_5x6_LAST2COLUMNS_INIT \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm movlps xmm2, [esi+16*4] \ __asm movlps xmm3, [esi+22*4] \ __asm movlps xmm4, [esi+28*4] \ __asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) #define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \ __asm movlps xmm7, [edi+row*40+ 0*4] \ __asm movhps xmm7, [edi+row*40+ 6*4] \ __asm movaps xmm6, xmm7 \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \ __asm mulps xmm6, xmm0 \ __asm movaps xmm5, xmm7 \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm6, xmm5 \ __asm movlps xmm7, [edi+row*40+ 2*4] \ __asm movhps xmm7, [edi+row*40+ 8*4] \ __asm movaps xmm5, xmm7 \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \ __asm mulps xmm5, xmm2 \ 
__asm addps xmm6, xmm5 \ __asm movaps xmm5, xmm7 \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \ __asm mulps xmm5, xmm3 \ __asm addps xmm6, xmm5 \ __asm movlps xmm5, [edi+row*40+ 4*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm movlps [eax+row*48+ 4*4], xmm6 \ __asm movhps [eax+row*48+10*4], xmm6 #define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \ __asm movlps xmm6, [edi+20*4+0*4] \ __asm unpcklps xmm6, xmm6 \ __asm mulps xmm6, xmm0 \ __asm movlps xmm5, [edi+20*4+2*4] \ __asm unpcklps xmm5, xmm5 \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+20*4+4*4] \ __asm unpcklps xmm5, xmm5 \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm movhlps xmm7, xmm6 \ __asm addps xmm6, xmm7 \ __asm movlps [eax+row*24+4*4], xmm6 MUL_Nx5_5x6_FIRST4COLUMNS_INIT MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx5_5x6_LAST2COLUMNS_INIT MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 ) return; } case 6: { // 6x5 * 5x6 MUL_Nx5_5x6_FIRST4COLUMNS_INIT MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 ) MUL_Nx5_5x6_LAST2COLUMNS_INIT MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l]; m2Ptr++; } m1Ptr += 5; } break; } case 6: { switch( k ) { case 1: { if ( !(l^1) ) { // 1x6 * 6x1 dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] + m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5]; return; } break; } case 2: { if ( !(l^2) ) { // 2x6 * 6x2 #define MUL_Nx6_6x2_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movaps xmm1, [esi+16] \ __asm movaps xmm2, [esi+32] #define MUL_Nx6_6x2_ROW2( row ) \ __asm movaps xmm7, [edi+row*48+0*4] \ __asm movaps xmm6, xmm7 \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm7, xmm0 \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movaps xmm6, [edi+row*48+4*4] \ __asm movaps xmm5, xmm6 \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \ __asm mulps xmm5, xmm0 \ __asm movaps xmm6, [edi+row*48+24+2*4] \ __asm movaps xmm4, xmm6 \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm5, xmm6 \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \ __asm mulps xmm4, xmm2 \ __asm addps xmm5, xmm4 \ __asm movaps xmm4, xmm5 \ __asm movhlps xmm5, xmm7 \ __asm movlhps xmm7, xmm4 \ __asm addps xmm7, xmm5 \ __asm movaps [eax+row*16], xmm7 MUL_Nx6_6x2_INIT MUL_Nx6_6x2_ROW2( 0 ) return; } break; } case 3: { if ( !(l^3) ) { // 3x6 * 6x3 #define MUL_Nx6_6x3_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movss xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 1*4] \ __asm movss xmm1, [esi+ 3*4] \ __asm movhps xmm1, [esi+ 4*4] \ __asm movss xmm2, [esi+ 
6*4] \ __asm movhps xmm2, [esi+ 7*4] \ __asm movss xmm3, [esi+ 9*4] \ __asm movhps xmm3, [esi+10*4] \ __asm movss xmm4, [esi+12*4] \ __asm movhps xmm4, [esi+13*4] \ __asm movss xmm5, [esi+15*4] \ __asm movhps xmm5, [esi+16*4] #define MUL_Nx6_6x3_ROW( row ) \ __asm movss xmm7, [edi+row*24+0] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+row*24+4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+8] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+12] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+16] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+20] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movss [eax+row*12+0], xmm7 \ __asm movhps [eax+row*12+4], xmm7 MUL_Nx6_6x3_INIT MUL_Nx6_6x3_ROW( 0 ) MUL_Nx6_6x3_ROW( 1 ) MUL_Nx6_6x3_ROW( 2 ) return; } break; } case 4: { if ( !(l^4) ) { // 4x6 * 6x4 #define MUL_Nx6_6x4_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movaps xmm1, [esi+16] \ __asm movaps xmm2, [esi+32] \ __asm movaps xmm3, [esi+48] \ __asm movaps xmm4, [esi+64] \ __asm movaps xmm5, [esi+80] #define MUL_Nx6_6x4_ROW( row ) \ __asm movss xmm7, [edi+row*24+0] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+row*24+4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+8] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+12] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+16] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+20] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movaps [eax+row*16], xmm7 MUL_Nx6_6x4_INIT MUL_Nx6_6x4_ROW( 0 ) MUL_Nx6_6x4_ROW( 1 ) MUL_Nx6_6x4_ROW( 2 ) MUL_Nx6_6x4_ROW( 3 ) return; } break; } case 5: { if ( !(l^5) ) { // 5x6 * 6x5 #define MUL_Nx6_6x5_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movlps xmm1, [esi+20] \ __asm movhps xmm1, [esi+28] \ __asm movlps xmm2, [esi+40] \ __asm movhps xmm2, [esi+48] \ __asm movlps xmm3, [esi+60] \ __asm movhps xmm3, [esi+68] \ __asm movaps xmm4, [esi+80] \ __asm movlps xmm5, [esi+100] \ __asm movhps xmm5, [esi+108] #define MUL_Nx6_6x5_ROW( row ) \ __asm movss xmm7, [edi+row*24+0] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm fld dword ptr [edi+(row*6+0)*4] \ __asm fmul dword ptr [esi+(4+0*5)*4] \ __asm movss xmm6, [edi+row*24+4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+1)*4] \ __asm fmul dword ptr [esi+(4+1*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, [edi+row*24+8] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm 
addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+2)*4] \ __asm fmul dword ptr [esi+(4+2*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, [edi+row*24+12] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+3)*4] \ __asm fmul dword ptr [esi+(4+3*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, [edi+row*24+16] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+4)*4] \ __asm fmul dword ptr [esi+(4+4*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, [edi+row*24+20] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+5)*4] \ __asm fmul dword ptr [esi+(4+5*5)*4] \ __asm faddp st(1),st \ __asm fstp dword ptr [eax+(row*5+4)*4] \ __asm movlps [eax+row*20], xmm7 \ __asm movhps [eax+row*20+8], xmm7 MUL_Nx6_6x5_INIT MUL_Nx6_6x5_ROW( 0 ) MUL_Nx6_6x5_ROW( 1 ) MUL_Nx6_6x5_ROW( 2 ) MUL_Nx6_6x5_ROW( 3 ) MUL_Nx6_6x5_ROW( 4 ) return; } break; } case 6: { switch( l ) { case 1: { // 6x6 * 6x1 __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movlps xmm7, qword ptr [esi] movlps xmm6, qword ptr [esi+8] shufps xmm7, xmm7, 0x44 shufps xmm6, xmm6, 0x44 movlps xmm0, qword ptr [edi ] movhps xmm0, qword ptr [edi+ 24] mulps xmm0, xmm7 movlps xmm3, qword ptr [edi+ 8] movhps xmm3, qword ptr [edi+ 32] mulps xmm3, xmm6 movlps xmm1, qword ptr [edi+ 48] movhps xmm1, qword ptr [edi+ 72] mulps xmm1, xmm7 movlps xmm2, qword ptr [edi+ 96] movhps xmm2, qword ptr [edi+120] mulps xmm2, xmm7 movlps xmm4, qword ptr [edi+ 56] movhps xmm4, qword ptr [edi+ 80] movlps xmm5, qword ptr [edi+104] movhps xmm5, qword ptr [edi+128] mulps xmm4, xmm6 movlps xmm7, qword ptr [esi+16] addps xmm0, xmm3 shufps xmm7, xmm7, 0x44 mulps xmm5, xmm6 addps xmm1, xmm4 movlps xmm3, qword ptr [edi+ 16] movhps xmm3, qword ptr [edi+ 40] addps xmm2, xmm5 movlps xmm4, qword ptr [edi+ 64] movhps xmm4, qword ptr [edi+ 88] mulps xmm3, xmm7 movlps xmm5, qword ptr [edi+112] movhps xmm5, qword ptr [edi+136] addps xmm0, xmm3 mulps xmm4, xmm7 mulps xmm5, xmm7 addps xmm1, xmm4 addps xmm2, xmm5 movaps xmm6, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm6, xmm1, 0xDD movaps xmm7, xmm2 shufps xmm7, xmm2, 0x88 shufps xmm2, xmm2, 0xDD addps xmm0, xmm6 addps xmm2, xmm7 movlps [eax], xmm0 movhps [eax+8], xmm0 movlps [eax+16], xmm2 } return; } case 2: { // 6x6 * 6x2 MUL_Nx6_6x2_INIT MUL_Nx6_6x2_ROW2( 0 ) MUL_Nx6_6x2_ROW2( 1 ) MUL_Nx6_6x2_ROW2( 2 ) return; } case 3: { // 6x6 * 6x3 MUL_Nx6_6x3_INIT MUL_Nx6_6x3_ROW( 0 ) MUL_Nx6_6x3_ROW( 1 ) MUL_Nx6_6x3_ROW( 2 ) MUL_Nx6_6x3_ROW( 3 ) MUL_Nx6_6x3_ROW( 4 ) MUL_Nx6_6x3_ROW( 5 ) return; } case 4: { // 6x6 * 6x4 MUL_Nx6_6x4_INIT MUL_Nx6_6x4_ROW( 0 ) MUL_Nx6_6x4_ROW( 1 ) MUL_Nx6_6x4_ROW( 2 ) MUL_Nx6_6x4_ROW( 3 ) MUL_Nx6_6x4_ROW( 4 ) MUL_Nx6_6x4_ROW( 5 ) return; } case 5: { // 6x6 * 6x5 MUL_Nx6_6x5_INIT MUL_Nx6_6x5_ROW( 0 ) MUL_Nx6_6x5_ROW( 1 ) MUL_Nx6_6x5_ROW( 2 ) MUL_Nx6_6x5_ROW( 3 ) MUL_Nx6_6x5_ROW( 4 ) MUL_Nx6_6x5_ROW( 5 ) return; } case 6: { // 6x6 * 6x6 __asm { mov ecx, dword ptr m2Ptr movlps xmm3, qword ptr [ecx+72] mov edx, dword ptr m1Ptr // Loading first 4 columns (upper 4 rows) of m2Ptr. movaps xmm0, xmmword ptr [ecx] movlps xmm1, qword ptr [ecx+24] movhps xmm1, qword ptr [ecx+32] movaps xmm2, xmmword ptr [ecx+48] movhps xmm3, qword ptr [ecx+80] // Calculating first 4 elements in the first row of the destination matrix. 
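// the pattern below repeats throughout this block: broadcast one scalar of
// the current m1 row with shufps, multiply it against a four-wide row of m2
// and accumulate, giving dst[0][c] = m1[0][0]*m2[0][c] + ... + m1[0][3]*m2[3][c]
// for c = 0..3; the m2 row 4 and row 5 terms are folded in by the second pass below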
movss xmm4, dword ptr [edx] movss xmm5, dword ptr [edx+4] mov eax, dword ptr dstPtr shufps xmm4, xmm4, 0 movss xmm6, dword ptr [edx+8] shufps xmm5, xmm5, 0 movss xmm7, dword ptr [edx+12] mulps xmm4, xmm0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm5, xmm1 mulps xmm6, xmm2 addps xmm5, xmm4 mulps xmm7, xmm3 addps xmm6, xmm5 addps xmm7, xmm6 movaps xmmword ptr [eax], xmm7 // Calculating first 4 elements in the second row of the destination matrix. movss xmm4, dword ptr [edx+24] shufps xmm4, xmm4, 0 mulps xmm4, xmm0 movss xmm5, dword ptr [edx+28] shufps xmm5, xmm5, 0 mulps xmm5, xmm1 movss xmm6, dword ptr [edx+32] shufps xmm6, xmm6, 0 movss xmm7, dword ptr [edx+36] shufps xmm7, xmm7, 0 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm7, xmm6 addps xmm5, xmm4 addps xmm7, xmm5 // Calculating first 4 elements in the third row of the destination matrix. movss xmm4, dword ptr [edx+48] movss xmm5, dword ptr [edx+52] movlps qword ptr [eax+24], xmm7 ; save 2nd movhps qword ptr [eax+32], xmm7 ; row movss xmm6, dword ptr [edx+56] movss xmm7, dword ptr [edx+60] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm5, xmm4 addps xmm7, xmm6 addps xmm7, xmm5 movaps xmmword ptr [eax+48], xmm7 // Calculating first 4 elements in the fourth row of the destination matrix. movss xmm4, dword ptr [edx+72] movss xmm5, dword ptr [edx+76] movss xmm6, dword ptr [edx+80] movss xmm7, dword ptr [edx+84] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm4, xmm5 addps xmm6, xmm4 addps xmm7, xmm6 movlps qword ptr [eax+72], xmm7 movhps qword ptr [eax+80], xmm7 // Calculating first 4 elements in the fifth row of the destination matrix. movss xmm4, dword ptr [edx+96] movss xmm5, dword ptr [edx+100] movss xmm6, dword ptr [edx+104] movss xmm7, dword ptr [edx+108] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm5, xmm4 addps xmm7, xmm6 addps xmm7, xmm5 movaps xmmword ptr [eax+96], xmm7 // Calculating first 4 elements in the sixth row of the destination matrix. movss xmm4, dword ptr [edx+120] movss xmm5, dword ptr [edx+124] movss xmm6, dword ptr [edx+128] movss xmm7, dword ptr [edx+132] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm4, xmm5 addps xmm6, xmm4 addps xmm7, xmm6 movhps qword ptr [eax+128], xmm7 movlps qword ptr [eax+120], xmm7 // Loading first 4 columns (lower 2 rows) of m2Ptr. movlps xmm0, qword ptr [ecx+96] movhps xmm0, qword ptr [ecx+104] movlps xmm1, qword ptr [ecx+120] movhps xmm1, qword ptr [ecx+128] // Calculating first 4 elements in the first row of the destination matrix. movss xmm2, dword ptr [edx+16] shufps xmm2, xmm2, 0 movss xmm4, dword ptr [edx+40] movss xmm3, dword ptr [edx+20] movss xmm5, dword ptr [edx+44] movaps xmm6, xmmword ptr [eax] movlps xmm7, qword ptr [eax+24] shufps xmm3, xmm3, 0 shufps xmm5, xmm5, 0 movhps xmm7, qword ptr [eax+32] shufps xmm4, xmm4, 0 mulps xmm5, xmm1 mulps xmm2, xmm0 mulps xmm3, xmm1 mulps xmm4, xmm0 addps xmm6, xmm2 addps xmm7, xmm4 addps xmm7, xmm5 addps xmm6, xmm3 movlps qword ptr [eax+24], xmm7 movaps xmmword ptr [eax], xmm6 movhps qword ptr [eax+32], xmm7 // Calculating first 4 elements in the third row of the destination matrix. 
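// like the first-row update above, this reloads the stored partial sums and
// folds in the m1[i][4] and m1[i][5] terms from the last two rows of m2,
// handling two destination rows at a time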
movss xmm2, dword ptr [edx+64] movss xmm4, dword ptr [edx+88] movss xmm5, dword ptr [edx+92] movss xmm3, dword ptr [edx+68] movaps xmm6, xmmword ptr [eax+48] movlps xmm7, qword ptr [eax+72] movhps xmm7, qword ptr [eax+80] shufps xmm2, xmm2, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm3, xmm3, 0 mulps xmm2, xmm0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm3, xmm1 addps xmm6, xmm2 addps xmm6, xmm3 addps xmm7, xmm4 addps xmm7, xmm5 movlps qword ptr [eax+72], xmm7 movaps xmmword ptr [eax+48], xmm6 movhps qword ptr [eax+80], xmm7 // Calculating first 4 elements in the fifth row of the destination matrix. movss xmm2, dword ptr [edx+112] movss xmm3, dword ptr [edx+116] movaps xmm6, xmmword ptr [eax+96] shufps xmm2, xmm2, 0 shufps xmm3, xmm3, 0 mulps xmm2, xmm0 mulps xmm3, xmm1 addps xmm6, xmm2 addps xmm6, xmm3 movaps xmmword ptr [eax+96], xmm6 // Calculating first 4 elements in the sixth row of the destination matrix. movss xmm4, dword ptr [edx+136] movss xmm5, dword ptr [edx+140] movhps xmm7, qword ptr [eax+128] movlps xmm7, qword ptr [eax+120] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 addps xmm7, xmm4 addps xmm7, xmm5 // Calculating last 2 columns of the destination matrix. movlps xmm0, qword ptr [ecx+16] movhps xmm0, qword ptr [ecx+40] movhps qword ptr [eax+128], xmm7 movlps qword ptr [eax+120], xmm7 movlps xmm2, qword ptr [ecx+64] movhps xmm2, qword ptr [ecx+88] movaps xmm3, xmm2 shufps xmm3, xmm3, 4Eh movlps xmm4, qword ptr [ecx+112] movhps xmm4, qword ptr [ecx+136] movaps xmm5, xmm4 shufps xmm5, xmm5, 4Eh movlps xmm6, qword ptr [edx] movhps xmm6, qword ptr [edx+24] movaps xmm7, xmm6 shufps xmm7, xmm7, 0F0h mulps xmm7, xmm0 shufps xmm6, xmm6, 0A5h movaps xmm1, xmm0 shufps xmm1, xmm1, 4Eh mulps xmm1, xmm6 addps xmm7, xmm1 movlps xmm6, qword ptr [edx+8] movhps xmm6, qword ptr [edx+32] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm2 mulps xmm6, xmm3 addps xmm7, xmm1 addps xmm7, xmm6 movhps xmm6, qword ptr [edx+40] movlps xmm6, qword ptr [edx+16] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm4 mulps xmm6, xmm5 addps xmm7, xmm1 addps xmm7, xmm6 movlps qword ptr [eax+16], xmm7 movhps qword ptr [eax+40], xmm7 movlps xmm6, qword ptr [edx+48] movhps xmm6, qword ptr [edx+72] movaps xmm7, xmm6 shufps xmm7, xmm7, 0F0h mulps xmm7, xmm0 shufps xmm6, xmm6, 0A5h movaps xmm1, xmm0 shufps xmm1, xmm1, 4Eh mulps xmm1, xmm6 addps xmm7, xmm1 movhps xmm6, qword ptr [edx+80] movlps xmm6, qword ptr [edx+56] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm2 mulps xmm6, xmm3 addps xmm7, xmm1 addps xmm7, xmm6 movlps xmm6, qword ptr [edx+64] movhps xmm6, qword ptr [edx+88] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm4 mulps xmm6, xmm5 addps xmm7, xmm1 addps xmm7, xmm6 movlps qword ptr [eax+64], xmm7 movhps qword ptr [eax+88], xmm7 movlps xmm6, qword ptr [edx+96] movhps xmm6, qword ptr [edx+120] movaps xmm7, xmm6 shufps xmm7, xmm7, 0F0h mulps xmm7, xmm0 shufps xmm6, xmm6, 0A5h movaps xmm1, xmm0 shufps xmm1, xmm1, 4Eh mulps xmm1, xmm6 addps xmm7, xmm1 movlps xmm6, qword ptr [edx+104] movhps xmm6, qword ptr [edx+128] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm2 mulps xmm6, xmm3 addps xmm7, xmm1 addps xmm7, xmm6 movlps xmm6, qword ptr [edx+112] movhps xmm6, qword ptr [edx+136] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm4 mulps xmm6, xmm5 addps xmm7, xmm1 addps xmm7, 
xmm6 movlps qword ptr [eax+112], xmm7 movhps qword ptr [eax+136], xmm7 } return; } } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l]; m2Ptr++; } m1Ptr += 6; } break; } default: { for ( i = 0; i < k; i++ ) { for ( j = 0; j < l; j++ ) { m2Ptr = m2.ToFloatPtr() + j; sum = m1Ptr[0] * m2Ptr[0]; for ( n = 1; n < m1.GetNumColumns(); n++ ) { m2Ptr += l; sum += m1Ptr[n] * m2Ptr[0]; } *dstPtr++ = sum; } m1Ptr += m1.GetNumColumns(); } break; } } } /* ============ idSIMD_SSE::MatX_TransposeMultiplyMatX optimizes the following transpose matrix multiplications: Nx6 * NxN 6xN * 6x6 with N in the range [1-6]. ============ */ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) { int i, j, k, l, n; float *dstPtr; const float *m1Ptr, *m2Ptr; double sum; assert( m1.GetNumRows() == m2.GetNumRows() ); m1Ptr = m1.ToFloatPtr(); m2Ptr = m2.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); k = m1.GetNumColumns(); l = m2.GetNumColumns(); switch( m1.GetNumRows() ) { case 1: if ( !((k^6)|(l^1)) ) { // 1x6 * 1x1 __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] movaps [eax], xmm0 movlps [eax+16], xmm1 } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0]; m2Ptr++; } m1Ptr++; } break; case 2: if ( !((k^6)|(l^2)) ) { // 2x6 * 2x2 #define MUL_2xN_2x2_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi] \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm movlps xmm1, [esi+8] \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) #define MUL_2xN_2x2_ROW2( N, row ) \ __asm movlps xmm6, [edi+(row+0*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm movlps xmm7, [edi+(row+1*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm0 \ __asm mulps xmm7, xmm1 \ __asm addps xmm6, xmm7 \ __asm movaps [eax+(row*2)*4], xmm6 MUL_2xN_2x2_INIT MUL_2xN_2x2_ROW2( 6, 0 ) MUL_2xN_2x2_ROW2( 6, 2 ) MUL_2xN_2x2_ROW2( 6, 4 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l]; m2Ptr++; } m1Ptr++; } break; case 3: if ( !((k^6)|(l^3)) ) { // 3x6 * 3x3 #define MUL_3xN_3x3_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movss xmm0, [esi+(0*3+0)*4] \ __asm movhps xmm0, [esi+(0*3+1)*4] \ __asm movss xmm1, [esi+(1*3+0)*4] \ __asm movhps xmm1, [esi+(1*3+1)*4] \ __asm movss xmm2, [esi+(2*3+0)*4] \ __asm movhps xmm2, [esi+(2*3+1)*4] #define MUL_3xN_3x3_INIT_ROW4 \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 ) #define MUL_3xN_3x3_ROW4( N, row ) \ __asm movlps xmm3, [edi+(row+0*N+0)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \ __asm movlps xmm4, [edi+(row+1*N+0)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 ) \ __asm movlps xmm5, [edi+(row+2*N+0)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movaps [eax+(row*3+0)*4], xmm3 \ __asm shufps xmm0, xmm0, 
R_SHUFFLEPS( 1, 2, 3, 1 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \ __asm movlps xmm3, [edi+(row+0*N+1)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm movlps xmm4, [edi+(row+1*N+1)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm movlps xmm5, [edi+(row+2*N+1)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movaps [eax+(row*3+4)*4], xmm3 \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \ __asm movlps xmm3, [edi+(row+0*N+2)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 ) \ __asm movlps xmm4, [edi+(row+1*N+2)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 ) \ __asm movlps xmm5, [edi+(row+2*N+2)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movaps [eax+(row*3+8)*4], xmm3 #define MUL_3xN_3x3_INIT_ROW4_ROW4 \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) #define MUL_3xN_3x3_INIT_ROW4_ROW \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 ) #define MUL_3xN_3x3_ROW( N, row ) \ __asm movss xmm3, [edi+(row+0*N)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm movss xmm4, [edi+(row+1*N)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm movss xmm5, [edi+(row+2*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movss [eax+(row*3+0)*4], xmm3 \ __asm movhps [eax+(row*3+1)*4], xmm3 MUL_3xN_3x3_INIT MUL_3xN_3x3_INIT_ROW4 MUL_3xN_3x3_ROW4( 6, 0 ) MUL_3xN_3x3_INIT_ROW4_ROW MUL_3xN_3x3_ROW( 6, 4 ) MUL_3xN_3x3_ROW( 6, 5 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l]; m2Ptr++; } m1Ptr++; } break; case 4: if ( !((k^6)|(l^4)) ) { // 4x6 * 4x4 #define MUL_4xN_4x4_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movaps xmm1, [esi+16] \ __asm movaps xmm2, [esi+32] \ __asm movaps xmm3, [esi+48] #define MUL_4xN_4x4_ROW( N, row ) \ __asm movss xmm7, [edi+(row+0*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+(row+1*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+2*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+3*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movaps [eax+row*16], xmm7 MUL_4xN_4x4_INIT MUL_4xN_4x4_ROW( 6, 0 ) MUL_4xN_4x4_ROW( 6, 1 ) MUL_4xN_4x4_ROW( 6, 2 ) MUL_4xN_4x4_ROW( 6, 3 ) MUL_4xN_4x4_ROW( 6, 4 ) MUL_4xN_4x4_ROW( 6, 5 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] 
* m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l]; m2Ptr++; } m1Ptr++; } break; case 5: if ( !((k^6)|(l^5)) ) { // 5x6 * 5x5 #define MUL_5xN_5x5_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 5*4] \ __asm movhps xmm1, [esi+ 7*4] \ __asm movlps xmm2, [esi+10*4] \ __asm movhps xmm2, [esi+12*4] \ __asm movlps xmm3, [esi+15*4] \ __asm movhps xmm3, [esi+17*4] \ __asm movlps xmm4, [esi+20*4] \ __asm movhps xmm4, [esi+22*4] #define MUL_5xN_5x5_ROW( N, row ) \ __asm movss xmm6, [edi+(row+0*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm0 \ __asm fld dword ptr [edi+(row+0*N)*4] \ __asm fmul dword ptr [esi+ 4*4] \ __asm movss xmm5, [edi+(row+1*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+1*N)*4] \ __asm fmul dword ptr [esi+ 9*4] \ __asm faddp st(1),st \ __asm movss xmm5, [edi+(row+2*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+2*N)*4] \ __asm fmul dword ptr [esi+14*4] \ __asm faddp st(1),st \ __asm movss xmm5, [edi+(row+3*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm3 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+3*N)*4] \ __asm fmul dword ptr [esi+19*4] \ __asm faddp st(1),st \ __asm movss xmm5, [edi+(row+4*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+4*N)*4] \ __asm fmul dword ptr [esi+24*4] \ __asm faddp st(1),st \ __asm fstp dword ptr [eax+(row*5+4)*4] \ __asm movlps [eax+(row*5+0)*4], xmm6 \ __asm movhps [eax+(row*5+2)*4], xmm6 MUL_5xN_5x5_INIT MUL_5xN_5x5_ROW( 6, 0 ) MUL_5xN_5x5_ROW( 6, 1 ) MUL_5xN_5x5_ROW( 6, 2 ) MUL_5xN_5x5_ROW( 6, 3 ) MUL_5xN_5x5_ROW( 6, 4 ) MUL_5xN_5x5_ROW( 6, 5 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l]; m2Ptr++; } m1Ptr++; } break; case 6: if ( !(l^6) ) { switch( k ) { case 1: { // 6x1 * 6x6 #define MUL_6xN_6x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] \ __asm movlps xmm3, [esi+18*4] \ __asm movhps xmm3, [esi+20*4] \ __asm movlps xmm4, [esi+24*4] \ __asm movhps xmm4, [esi+26*4] \ __asm movlps xmm5, [esi+30*4] \ __asm movhps xmm5, [esi+32*4] #define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \ __asm movss xmm7, [edi+(row+0*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+(row+1*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+2*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+3*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+4*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps 
xmm7, xmm6 \ __asm movss xmm6, [edi+(row+5*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movlps [eax+(row*6+0)*4], xmm7 \ __asm movhps [eax+(row*6+2)*4], xmm7 #define MUL_6xN_6x6_LAST2COLUMNS_INIT \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm movlps xmm2, [esi+16*4] \ __asm movlps xmm3, [esi+22*4] \ __asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm movlps xmm4, [esi+28*4] \ __asm movlps xmm5, [esi+34*4] \ __asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \ __asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 ) #define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \ __asm movlps xmm7, [edi+(row*2+0*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm7, xmm0 \ __asm movlps xmm6, [edi+(row*2+1*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+2*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+3*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+4*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+5*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movlps [eax+(row*12+ 4)*4], xmm7 \ __asm movhps [eax+(row*12+10)*4], xmm7 #define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \ __asm movss xmm7, [edi+(1*N-1)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+(2*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(3*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(4*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(5*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(6*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movlps [eax+(row*6+4)*4], xmm7 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 ) return; } case 2: { // 6x2 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 ) return; } case 3: { // 6x3 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 ) MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 ) return; } case 4: { // 6x4 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 ) 
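// each MUL_6xN_6x6_LAST2COLUMNS_ROW2 invocation completes columns 4 and 5
// for two destination rows at a time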
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 ) return; } case 5: { // 6x5 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 ) MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 ) MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 ) return; } case 6: { // 6x6 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 ) MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 ) MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l]; m2Ptr++; } m1Ptr++; } break; default: for ( i = 0; i < k; i++ ) { for ( j = 0; j < l; j++ ) { m1Ptr = m1.ToFloatPtr() + i; m2Ptr = m2.ToFloatPtr() + j; sum = m1Ptr[0] * m2Ptr[0]; for ( n = 1; n < m1.GetNumRows(); n++ ) { m1Ptr += k; m2Ptr += l; sum += m1Ptr[0] * m2Ptr[0]; } *dstPtr++ = sum; } } break; } } /* ============ idSIMD_SSE::MatX_LowerTriangularSolve solves x in Lx = b for the n * n sub-matrix of L if skip > 0 the first skip elements of x are assumed to be valid already L has to be a lower triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) { int nc; const float *lptr; if ( skip >= n ) { return; } lptr = L.ToFloatPtr(); nc = L.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { #define NSKIP( n, s ) ((n<<3)|(s&7)) switch( NSKIP( n, skip ) ) { case NSKIP( 1, 0 ): x[0] = b[0]; return; case NSKIP( 2, 0 ): x[0] = b[0]; case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; return; case NSKIP( 3, 0 ): x[0] = b[0]; case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; return; case NSKIP( 4, 0 ): x[0] = b[0]; case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; return; case NSKIP( 5, 0 ): x[0] = b[0]; case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; return; case NSKIP( 6, 0 ): x[0] = b[0]; case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; 
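// the NSKIP cases fall through on purpose: entering the switch at
// NSKIP( n, skip ) runs the forward substitution for rows skip..n-1,
// each computing x[i] = b[i] - L[i][0]*x[0] - ... - L[i][i-1]*x[i-1]
// with the implicit unit diagonal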
return; case NSKIP( 7, 0 ): x[0] = b[0]; case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5]; return; } return; } // process first 4 rows switch( skip ) { case 0: x[0] = b[0]; case 1: x[1] = b[1] - lptr[1*nc+0] * x[0]; case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; skip = 4; } lptr = L[skip]; // this code assumes n > 4 __asm { push ebx mov eax, skip // eax = i shl eax, 2 // eax = i*4 mov edx, n // edx = n shl edx, 2 // edx = n*4 mov esi, x // esi = x mov edi, lptr // edi = lptr add esi, eax add edi, eax mov ebx, b // ebx = b // check for aligned memory mov ecx, nc shl ecx, 2 or ecx, esi or ecx, edi and ecx, 15 jnz loopurow // aligned looprow: mov ecx, eax neg ecx movaps xmm0, [esi+ecx] mulps xmm0, [edi+ecx] add ecx, 12*4 jg donedot8 dot8: movaps xmm1, [esi+ecx-(8*4)] mulps xmm1, [edi+ecx-(8*4)] addps xmm0, xmm1 movaps xmm3, [esi+ecx-(4*4)] mulps xmm3, [edi+ecx-(4*4)] addps xmm0, xmm3 add ecx, 8*4 jle dot8 donedot8: sub ecx, 4*4 jg donedot4 //dot4: movaps xmm1, [esi+ecx-(4*4)] mulps xmm1, [edi+ecx-(4*4)] addps xmm0, xmm1 add ecx, 4*4 donedot4: movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm1 sub ecx, 4*4 jz dot0 add ecx, 4 jz dot1 add ecx, 4 jz dot2 //dot3: movss xmm1, [esi-(3*4)] mulss xmm1, [edi-(3*4)] addss xmm0, xmm1 dot2: movss xmm3, [esi-(2*4)] mulss xmm3, [edi-(2*4)] addss xmm0, xmm3 dot1: movss xmm5, [esi-(1*4)] mulss xmm5, [edi-(1*4)] addss xmm0, xmm5 dot0: movss xmm1, [ebx+eax] subss xmm1, xmm0 movss [esi], xmm1 add eax, 4 cmp eax, edx jge done add esi, 4 mov ecx, nc shl ecx, 2 add edi, ecx add edi, 4 jmp looprow // unaligned loopurow: mov ecx, eax neg ecx movups xmm0, [esi+ecx] movups xmm1, [edi+ecx] mulps xmm0, xmm1 add ecx, 12*4 jg doneudot8 udot8: movups xmm1, [esi+ecx-(8*4)] movups xmm2, [edi+ecx-(8*4)] mulps xmm1, xmm2 addps xmm0, xmm1 movups xmm3, [esi+ecx-(4*4)] movups xmm4, [edi+ecx-(4*4)] mulps xmm3, xmm4 addps xmm0, xmm3 add ecx, 8*4 jle udot8 doneudot8: sub ecx, 4*4 jg doneudot4 //udot4: movups xmm1, [esi+ecx-(4*4)] movups xmm2, [edi+ecx-(4*4)] mulps xmm1, xmm2 addps xmm0, xmm1 add ecx, 4*4 doneudot4: movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm1 sub ecx, 4*4 jz udot0 add ecx, 4 jz udot1 add ecx, 4 jz udot2 //udot3: movss xmm1, [esi-(3*4)] movss xmm2, [edi-(3*4)] mulss xmm1, xmm2 addss xmm0, xmm1 udot2: movss xmm3, [esi-(2*4)] movss xmm4, [edi-(2*4)] mulss xmm3, xmm4 addss xmm0, xmm3 udot1: movss xmm5, [esi-(1*4)] movss xmm6, [edi-(1*4)] mulss xmm5, xmm6 addss xmm0, xmm5 udot0: movss xmm1, [ebx+eax] subss xmm1, xmm0 movss [esi], xmm1 add eax, 4 cmp eax, edx jge done add esi, 4 mov ecx, nc shl ecx, 2 add edi, ecx add edi, 4 jmp loopurow done: pop ebx } } /* ============ idSIMD_SSE::MatX_LowerTriangularSolveTranspose solves x in L'x = b 
for the n * n sub-matrix of L L has to be a lower triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) { int nc; const float *lptr; lptr = L.ToFloatPtr(); nc = L.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { switch( n ) { case 0: return; case 1: x[0] = b[0]; return; case 2: x[1] = b[1]; x[0] = b[0] - lptr[1*nc+0] * x[1]; return; case 3: x[2] = b[2]; x[1] = b[1] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 4: x[3] = b[3]; x[2] = b[2] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 5: x[4] = b[4]; x[3] = b[3] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 6: x[5] = b[5]; x[4] = b[4] - lptr[5*nc+4] * x[5]; x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 7: x[6] = b[6]; x[5] = b[5] - lptr[6*nc+5] * x[6]; x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5]; x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; } return; } #if 1 int i, j, m; float *xptr; double s0; // if the number of columns is not a multiple of 2 we're screwed for alignment. 
	// however, if the number of columns is a multiple of 2 but the number of
	// rows to be processed is not a multiple of 2 we can still run 8 byte aligned
	m = n;
	if ( m & 1 ) {
		m--;
		x[m] = b[m];
		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m					// eax = i
			mov			esi, xptr				// esi = xptr
			mov			edi, lptr				// edi = lptr
			mov			ebx, b					// ebx = b
			mov			edx, nc					// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows_1:
			movlps		xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			movhps		xmm0, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			xor			ecx, ecx
			sub			eax, m
			neg			eax
			jz			done4x4_1
		process4x4_1:	// process 4x4 blocks
			movlps		xmm2, [edi+0]
			movhps		xmm2, [edi+8]
			add			edi, edx
			movss		xmm1, [esi+4*ecx+0]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps		xmm3, [edi+0]
			movhps		xmm3, [edi+8]
			add			edi, edx
			mulps		xmm1, xmm2
			subps		xmm0, xmm1
			movss		xmm1, [esi+4*ecx+4]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps		xmm4, [edi+0]
			movhps		xmm4, [edi+8]
			add			edi, edx
			mulps		xmm1, xmm3
			subps		xmm0, xmm1
			movss		xmm1, [esi+4*ecx+8]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps		xmm5, [edi+0]
			movhps		xmm5, [edi+8]
			add			edi, edx
			mulps		xmm1, xmm4
			subps		xmm0, xmm1
			movss		xmm1, [esi+4*ecx+12]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			add			ecx, 4
			cmp			ecx, eax
			mulps		xmm1, xmm5
			subps		xmm0, xmm1
			jl			process4x4_1
		done4x4_1:		// process left over of the 4 rows
			movlps		xmm2, [edi+0]
			movhps		xmm2, [edi+8]
			movss		xmm1, [esi+4*ecx]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			mulps		xmm1, xmm2
			subps		xmm0, xmm1
			imul		ecx, edx
			sub			edi, ecx
			neg			eax
			add			eax, m
			sub			eax, 4
			movaps		xmm1, xmm0
			shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
			movaps		xmm2, xmm0
			shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
			movaps		xmm3, xmm0
			shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
			sub			edi, edx
			movss		[esi-4], xmm3			// xptr[-1] = s3
			movss		xmm4, xmm3
			movss		xmm5, xmm3
			mulss		xmm3, [edi+8]			// lptr[-1*nc+2] * s3
			mulss		xmm4, [edi+4]			// lptr[-1*nc+1] * s3
			mulss		xmm5, [edi+0]			// lptr[-1*nc+0] * s3
			subss		xmm2, xmm3
			movss		[esi-8], xmm2			// xptr[-2] = s2
			movss		xmm6, xmm2
			sub			edi, edx
			subss		xmm0, xmm5
			subss		xmm1, xmm4
			mulss		xmm2, [edi+4]			// lptr[-2*nc+1] * s2
			mulss		xmm6, [edi+0]			// lptr[-2*nc+0] * s2
			subss		xmm1, xmm2
			movss		[esi-12], xmm1			// xptr[-3] = s1
			subss		xmm0, xmm6
			sub			edi, edx
			cmp			eax, 4
			mulss		xmm1, [edi+0]			// lptr[-3*nc+0] * s1
			subss		xmm0, xmm1
			movss		[esi-16], xmm0			// xptr[-4] = s0
			jl			done4rows_1
			sub			edi, edx
			sub			edi, 16
			sub			esi, 16
			jmp			process4rows_1
		done4rows_1:
			pop			ebx
		}
	} else {
		lptr = L.ToFloatPtr() + m * nc + m - 4;
		xptr = x + m;
		__asm {
			push		ebx
			mov			eax, m					// eax = i
			mov			esi, xptr				// esi = xptr
			mov			edi, lptr				// edi = lptr
			mov			ebx, b					// ebx = b
			mov			edx, nc					// edx = nc*sizeof(float)
			shl			edx, 2
		process4rows:
			movlps		xmm0, [ebx+eax*4-16]	// load b[i-4], b[i-3]
			movhps		xmm0, [ebx+eax*4-8]		// load b[i-2], b[i-1]
			sub			eax, m
			jz			done4x4
			neg			eax
			xor			ecx, ecx
		process4x4:		// process 4x4 blocks
			movlps		xmm2, [edi+0]
			movhps		xmm2, [edi+8]
			add			edi, edx
			movss		xmm1, [esi+4*ecx+0]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps		xmm3, [edi+0]
			movhps		xmm3, [edi+8]
			add			edi, edx
			mulps		xmm1, xmm2
			subps		xmm0, xmm1
			movss		xmm1, [esi+4*ecx+4]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps		xmm4, [edi+0]
			movhps		xmm4, [edi+8]
			add			edi, edx
			mulps		xmm1, xmm3
			subps		xmm0, xmm1
			movss		xmm1, [esi+4*ecx+8]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			movlps		xmm5, [edi+0]
			movhps		xmm5, [edi+8]
			add			edi, edx
			mulps		xmm1, xmm4
			subps		xmm0, xmm1
			movss		xmm1, [esi+4*ecx+12]
			shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
			add			ecx, 4
			cmp			ecx, eax
			mulps		xmm1, xmm5
			subps		xmm0, xmm1
			jl			process4x4
			imul		ecx, edx
			sub			edi, ecx
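			// rewind edi: the 4x4 loop above stepped one row of L (nc*4 bytes, kept
			// in edx) down for every column of x it folded into the four running
			// sums in xmm0, so ecx = columns * nc * 4 undoes the whole walk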
neg eax done4x4: // process left over of the 4 rows add eax, m sub eax, 4 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 ) movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 ) movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 ) sub edi, edx movss [esi-4], xmm3 // xptr[-1] = s3 movss xmm4, xmm3 movss xmm5, xmm3 mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3 mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3 mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3 subss xmm2, xmm3 movss [esi-8], xmm2 // xptr[-2] = s2 movss xmm6, xmm2 sub edi, edx subss xmm0, xmm5 subss xmm1, xmm4 mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2 mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2 subss xmm1, xmm2 movss [esi-12], xmm1 // xptr[-3] = s1 subss xmm0, xmm6 sub edi, edx cmp eax, 4 mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1 subss xmm0, xmm1 movss [esi-16], xmm0 // xptr[-4] = s0 jl done4rows sub edi, edx sub edi, 16 sub esi, 16 jmp process4rows done4rows: pop ebx } } // process left over rows for ( i = (m&3)-1; i >= 0; i-- ) { s0 = b[i]; lptr = L[0] + i; for ( j = i + 1; j < n; j++ ) { s0 -= lptr[j*nc] * x[j]; } x[i] = s0; } #else int i, j, m; double s0, s1, s2, s3, t; const float *lptr2; float *xptr, *xptr2; m = n; if ( m & 1 ) { m--; x[m] = b[m]; lptr = L.ToFloatPtr() + m * nc + m - 4; xptr = x + m; // process 4 rows at a time for ( i = m; i >= 4; i -= 4 ) { s0 = b[i-4]; s1 = b[i-3]; s2 = b[i-2]; s3 = b[i-1]; // process 4x4 blocks xptr2 = xptr; // x + i; lptr2 = lptr; // ptr = L[i] + i - 4; for ( j = 0; j < m-i; j += 4 ) { t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; } t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; // process left over of the 4 rows lptr -= nc; s0 -= lptr[0] * s3; s1 -= lptr[1] * s3; s2 -= lptr[2] * s3; lptr -= nc; s0 -= lptr[0] * s2; s1 -= lptr[1] * s2; lptr -= nc; s0 -= lptr[0] * s1; lptr -= nc; // store result xptr[-4] = s0; xptr[-3] = s1; xptr[-2] = s2; xptr[-1] = s3; // update pointers for next four rows lptr -= 4; xptr -= 4; } } else { lptr = L.ToFloatPtr() + m * nc + m - 4; xptr = x + m; // process 4 rows at a time for ( i = m; i >= 4; i -= 4 ) { s0 = b[i-4]; s1 = b[i-3]; s2 = b[i-2]; s3 = b[i-1]; // process 4x4 blocks xptr2 = xptr; // x + i; lptr2 = lptr; // ptr = L[i] + i - 4; for ( j = 0; j < m-i; j += 4 ) { t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; } // process left over of the 4 rows lptr -= nc; s0 -= lptr[0] * s3; s1 -= lptr[1] * s3; s2 -= lptr[2] * s3; lptr -= nc; s0 -= lptr[0] * s2; s1 -= lptr[1] * s2; lptr -= nc; s0 -= lptr[0] * s1; lptr -= nc; // store result xptr[-4] = s0; xptr[-3] = s1; xptr[-2] = s2; xptr[-1] = s3; // update pointers for next four rows lptr -= 4; xptr -= 4; } } // process left over rows 
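	// i leaves the blocked loop as the count of leading rows (0..3) that were never
	// touched; lptr[j*nc] below walks down column i of L, i.e. along row i of L',
	// so every x[j] it reads has already been solved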
for ( i--; i >= 0; i-- ) { s0 = b[i]; lptr = L[0] + i; for ( j = i + 1; j < m; j++ ) { s0 -= lptr[j*nc] * x[j]; } x[i] = s0; } #endif } /* ============ idSIMD_SSE::MatX_LDLTFactor in-place factorization LDL' of the n * n sub-matrix of mat the reciprocal of the diagonal elements are stored in invDiag currently assumes the number of columns of mat is a multiple of 4 ============ */ bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) { #if 1 int j, nc; float *v, *diag, *invDiagPtr, *mptr; double s0, s1, s2, sum, d; v = (float *) _alloca16( n * sizeof( float ) ); diag = (float *) _alloca16( n * sizeof( float ) ); invDiagPtr = invDiag.ToFloatPtr(); nc = mat.GetNumColumns(); assert( ( nc & 3 ) == 0 ); if ( n <= 0 ) { return true; } mptr = mat[0]; sum = mptr[0]; if ( sum == 0.0f ) { return false; } diag[0] = sum; invDiagPtr[0] = d = 1.0f / sum; if ( n <= 1 ) { return true; } mptr = mat[0]; for ( j = 1; j < n; j++ ) { mptr[j*nc+0] = ( mptr[j*nc+0] ) * d; } mptr = mat[1]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; sum = mptr[1] - s0; if ( sum == 0.0f ) { return false; } mat[1][1] = sum; diag[1] = sum; invDiagPtr[1] = d = 1.0f / sum; if ( n <= 2 ) { return true; } mptr = mat[0]; for ( j = 2; j < n; j++ ) { mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d; } mptr = mat[2]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; sum = mptr[2] - s0 - s1; if ( sum == 0.0f ) { return false; } mat[2][2] = sum; diag[2] = sum; invDiagPtr[2] = d = 1.0f / sum; if ( n <= 3 ) { return true; } mptr = mat[0]; for ( j = 3; j < n; j++ ) { mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d; } mptr = mat[3]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; sum = mptr[3] - s0 - s1 - s2; if ( sum == 0.0f ) { return false; } mat[3][3] = sum; diag[3] = sum; invDiagPtr[3] = d = 1.0f / sum; if ( n <= 4 ) { return true; } mptr = mat[0]; for ( j = 4; j < n; j++ ) { mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; } int ncf = nc * sizeof( float ); mptr = mat[0]; __asm { xorps xmm2, xmm2 xorps xmm3, xmm3 xorps xmm4, xmm4 push ebx mov ebx, 4 loopRow: cmp ebx, n jge done mov ecx, ebx // esi = i shl ecx, 2 // esi = i * 4 mov edx, diag // edx = diag add edx, ecx // edx = &diag[i] mov edi, ebx // edi = i imul edi, ncf // edi = i * nc * sizeof( float ) add edi, mptr // edi = mat[i] add edi, ecx // edi = &mat[i][i] mov esi, v // ecx = v add esi, ecx // ecx = &v[i] mov eax, invDiagPtr // eax = invDiagPtr add eax, ecx // eax = &invDiagPtr[i] neg ecx movaps xmm0, [edx+ecx] mulps xmm0, [edi+ecx] movaps [esi+ecx], xmm0 mulps xmm0, [edi+ecx] add ecx, 12*4 jg doneDot8 dot8: movaps xmm1, [edx+ecx-(8*4)] mulps xmm1, [edi+ecx-(8*4)] movaps [esi+ecx-(8*4)], xmm1 mulps xmm1, [edi+ecx-(8*4)] addps xmm0, xmm1 movaps xmm2, [edx+ecx-(4*4)] mulps xmm2, [edi+ecx-(4*4)] movaps [esi+ecx-(4*4)], xmm2 mulps xmm2, [edi+ecx-(4*4)] addps xmm0, xmm2 add ecx, 8*4 jle dot8 doneDot8: sub ecx, 4*4 jg doneDot4 movaps xmm1, [edx+ecx-(4*4)] mulps xmm1, [edi+ecx-(4*4)] movaps [esi+ecx-(4*4)], xmm1 mulps xmm1, [edi+ecx-(4*4)] addps xmm0, xmm1 add ecx, 4*4 doneDot4: sub ecx, 2*4 jg doneDot2 movlps xmm3, [edx+ecx-(2*4)] movlps xmm4, [edi+ecx-(2*4)] mulps xmm3, xmm4 movlps [esi+ecx-(2*4)], xmm3 mulps xmm3, xmm4 addps xmm0, xmm3 add ecx, 2*4 doneDot2: sub ecx, 1*4 jg doneDot1 movss xmm3, [edx+ecx-(1*4)] movss xmm4, 
[edi+ecx-(1*4)] mulss xmm3, xmm4 movss [esi+ecx-(1*4)], xmm3 mulss xmm3, xmm4 addss xmm0, xmm3 doneDot1: movhlps xmm2, xmm0 addps xmm0, xmm2 movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm2 movss xmm1, [edi] subss xmm1, xmm0 movss [edi], xmm1 // mptr[i] = sum; movss [edx], xmm1 // diag[i] = sum; // if ( sum == 0.0f ) return false; movaps xmm2, xmm1 cmpeqss xmm2, SIMD_SP_zero andps xmm2, SIMD_SP_tiny orps xmm1, xmm2 rcpss xmm7, xmm1 mulss xmm1, xmm7 mulss xmm1, xmm7 addss xmm7, xmm7 subss xmm7, xmm1 movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum; mov edx, n // edx = n sub edx, ebx // edx = n - i dec edx // edx = n - i - 1 jle doneSubRow // if ( i + 1 >= n ) return true; mov eax, ebx // eax = i shl eax, 2 // eax = i * 4 neg eax loopSubRow: add edi, ncf mov ecx, eax movaps xmm0, [esi+ecx] mulps xmm0, [edi+ecx] add ecx, 12*4 jg doneSubDot8 subDot8: movaps xmm1, [esi+ecx-(8*4)] mulps xmm1, [edi+ecx-(8*4)] addps xmm0, xmm1 movaps xmm2, [esi+ecx-(4*4)] mulps xmm2, [edi+ecx-(4*4)] addps xmm0, xmm2 add ecx, 8*4 jle subDot8 doneSubDot8: sub ecx, 4*4 jg doneSubDot4 movaps xmm1, [esi+ecx-(4*4)] mulps xmm1, [edi+ecx-(4*4)] addps xmm0, xmm1 add ecx, 4*4 doneSubDot4: sub ecx, 2*4 jg doneSubDot2 movlps xmm3, [esi+ecx-(2*4)] movlps xmm4, [edi+ecx-(2*4)] mulps xmm3, xmm4 addps xmm0, xmm3 add ecx, 2*4 doneSubDot2: sub ecx, 1*4 jg doneSubDot1 movss xmm3, [esi+ecx-(1*4)] movss xmm4, [edi+ecx-(1*4)] mulss xmm3, xmm4 addss xmm0, xmm3 doneSubDot1: movhlps xmm2, xmm0 addps xmm0, xmm2 movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 ) addss xmm0, xmm2 movss xmm1, [edi] subss xmm1, xmm0 mulss xmm1, xmm7 movss [edi], xmm1 dec edx jg loopSubRow doneSubRow: inc ebx jmp loopRow done: pop ebx } return true; #else int i, j, k, nc; float *v, *diag, *mptr; double s0, s1, s2, s3, sum, d; v = (float *) _alloca16( n * sizeof( float ) ); diag = (float *) _alloca16( n * sizeof( float ) ); nc = mat.GetNumColumns(); if ( n <= 0 ) { return true; } mptr = mat[0]; sum = mptr[0]; if ( sum == 0.0f ) { return false; } diag[0] = sum; invDiag[0] = d = 1.0f / sum; if ( n <= 1 ) { return true; } mptr = mat[0]; for ( j = 1; j < n; j++ ) { mptr[j*nc+0] = ( mptr[j*nc+0] ) * d; } mptr = mat[1]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; sum = mptr[1] - s0; if ( sum == 0.0f ) { return false; } mat[1][1] = sum; diag[1] = sum; invDiag[1] = d = 1.0f / sum; if ( n <= 2 ) { return true; } mptr = mat[0]; for ( j = 2; j < n; j++ ) { mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d; } mptr = mat[2]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; sum = mptr[2] - s0 - s1; if ( sum == 0.0f ) { return false; } mat[2][2] = sum; diag[2] = sum; invDiag[2] = d = 1.0f / sum; if ( n <= 3 ) { return true; } mptr = mat[0]; for ( j = 3; j < n; j++ ) { mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d; } mptr = mat[3]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; sum = mptr[3] - s0 - s1 - s2; if ( sum == 0.0f ) { return false; } mat[3][3] = sum; diag[3] = sum; invDiag[3] = d = 1.0f / sum; if ( n <= 4 ) { return true; } mptr = mat[0]; for ( j = 4; j < n; j++ ) { mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; } for ( i = 4; i < n; i++ ) { mptr = mat[i]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; v[2] = diag[2] * mptr[2]; s2 = v[2] * 
mptr[2]; v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3]; for ( k = 4; k < i-3; k += 4 ) { v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0]; v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1]; v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2]; v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3]; } switch( i - k ) { case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2]; case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1]; case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0]; } sum = s3; sum += s2; sum += s1; sum += s0; sum = mptr[i] - sum; if ( sum == 0.0f ) { return false; } mat[i][i] = sum; diag[i] = sum; invDiag[i] = d = 1.0f / sum; if ( i + 1 >= n ) { return true; } mptr = mat[i+1]; for ( j = i+1; j < n; j++ ) { s0 = mptr[0] * v[0]; s1 = mptr[1] * v[1]; s2 = mptr[2] * v[2]; s3 = mptr[3] * v[3]; for ( k = 4; k < i-7; k += 8 ) { s0 += mptr[k+0] * v[k+0]; s1 += mptr[k+1] * v[k+1]; s2 += mptr[k+2] * v[k+2]; s3 += mptr[k+3] * v[k+3]; s0 += mptr[k+4] * v[k+4]; s1 += mptr[k+5] * v[k+5]; s2 += mptr[k+6] * v[k+6]; s3 += mptr[k+7] * v[k+7]; } switch( i - k ) { case 7: s0 += mptr[k+6] * v[k+6]; case 6: s1 += mptr[k+5] * v[k+5]; case 5: s2 += mptr[k+4] * v[k+4]; case 4: s3 += mptr[k+3] * v[k+3]; case 3: s0 += mptr[k+2] * v[k+2]; case 2: s1 += mptr[k+1] * v[k+1]; case 1: s2 += mptr[k+0] * v[k+0]; } sum = s3; sum += s2; sum += s1; sum += s0; mptr[i] = ( mptr[i] - sum ) * d; mptr += nc; } } return true; #endif } /* ============ idSIMD_SSE::BlendJoints ============ */ #define REFINE_BLENDJOINTS_RECIPROCAL void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) { int i; if ( lerp <= 0.0f ) { return; } else if ( lerp >= 1.0f ) { for ( i = 0; i < numJoints; i++ ) { int j = index[i]; joints[j] = blendJoints[j]; } return; } for ( i = 0; i <= numJoints - 4; i += 4 ) { ALIGN16( float jointVert0[4] ); ALIGN16( float jointVert1[4] ); ALIGN16( float jointVert2[4] ); ALIGN16( float blendVert0[4] ); ALIGN16( float blendVert1[4] ); ALIGN16( float blendVert2[4] ); ALIGN16( float jointQuat0[4] ); ALIGN16( float jointQuat1[4] ); ALIGN16( float jointQuat2[4] ); ALIGN16( float jointQuat3[4] ); ALIGN16( float blendQuat0[4] ); ALIGN16( float blendQuat1[4] ); ALIGN16( float blendQuat2[4] ); ALIGN16( float blendQuat3[4] ); for ( int j = 0; j < 4; j++ ) { int n = index[i+j]; jointVert0[j] = joints[n].t[0]; jointVert1[j] = joints[n].t[1]; jointVert2[j] = joints[n].t[2]; blendVert0[j] = blendJoints[n].t[0]; blendVert1[j] = blendJoints[n].t[1]; blendVert2[j] = blendJoints[n].t[2]; jointQuat0[j] = joints[n].q[0]; jointQuat1[j] = joints[n].q[1]; jointQuat2[j] = joints[n].q[2]; jointQuat3[j] = joints[n].q[3]; blendQuat0[j] = blendJoints[n].q[0]; blendQuat1[j] = blendJoints[n].q[1]; blendQuat2[j] = blendJoints[n].q[2]; blendQuat3[j] = blendJoints[n].q[3]; } #if 1 __asm { // lerp translation movss xmm7, lerp shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) movaps xmm0, blendVert0 subps xmm0, jointVert0 mulps xmm0, xmm7 addps xmm0, jointVert0 movaps jointVert0, xmm0 movaps xmm1, blendVert1 subps xmm1, jointVert1 mulps xmm1, xmm7 addps xmm1, jointVert1 movaps jointVert1, xmm1 movaps xmm2, blendVert2 subps xmm2, jointVert2 mulps xmm2, xmm7 addps xmm2, jointVert2 movaps jointVert2, xmm2 // lerp quaternions movaps xmm0, jointQuat0 mulps xmm0, blendQuat0 movaps xmm1, jointQuat1 mulps xmm1, blendQuat1 addps xmm0, xmm1 movaps xmm2, jointQuat2 mulps xmm2, blendQuat2 addps xmm0, xmm2 movaps xmm3, 
jointQuat3 mulps xmm3, blendQuat3 addps xmm0, xmm3 // xmm0 = cosom movaps xmm1, xmm0 movaps xmm2, xmm0 andps xmm1, SIMD_SP_signBitMask // xmm1 = signBit xorps xmm0, xmm1 mulps xmm2, xmm2 xorps xmm4, xmm4 movaps xmm3, SIMD_SP_one subps xmm3, xmm2 // xmm3 = scale0 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 #ifdef REFINE_BLENDJOINTS_RECIPROCAL movaps xmm2, xmm3 rsqrtps xmm4, xmm2 mulps xmm2, xmm4 mulps xmm2, xmm4 subps xmm2, SIMD_SP_rsqrt_c0 mulps xmm4, SIMD_SP_rsqrt_c1 mulps xmm2, xmm4 #else rsqrtps xmm2, xmm3 // xmm2 = sinom #endif mulps xmm3, xmm2 // xmm3 = sqrt( scale0 ) // omega0 = atan2( xmm3, xmm0 ) movaps xmm4, xmm0 minps xmm0, xmm3 maxps xmm3, xmm4 cmpeqps xmm4, xmm0 #ifdef REFINE_BLENDJOINTS_RECIPROCAL rcpps xmm5, xmm3 mulps xmm3, xmm5 mulps xmm3, xmm5 addps xmm5, xmm5 subps xmm5, xmm3 // xmm5 = 1 / y or 1 / x mulps xmm0, xmm5 // xmm0 = x / y or y / x #else rcpps xmm3, xmm3 // xmm3 = 1 / y or 1 / x mulps xmm0, xmm3 // xmm0 = x / y or y / x #endif movaps xmm3, xmm4 andps xmm3, SIMD_SP_signBitMask xorps xmm0, xmm3 // xmm0 = -x / y or y / x andps xmm4, SIMD_SP_halfPI // xmm4 = HALF_PI or 0.0f movaps xmm3, xmm0 mulps xmm3, xmm3 // xmm3 = s movaps xmm5, SIMD_SP_atan_c0 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c1 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c2 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c3 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c4 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c5 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c6 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c7 mulps xmm5, xmm3 addps xmm5, SIMD_SP_one mulps xmm5, xmm0 addps xmm5, xmm4 // xmm5 = omega0 movaps xmm6, xmm7 // xmm6 = lerp mulps xmm6, xmm5 // xmm6 = omega1 subps xmm5, xmm6 // xmm5 = omega0 // scale0 = sin( xmm5 ) * xmm2 // scale1 = sin( xmm6 ) * xmm2 movaps xmm3, xmm5 movaps xmm7, xmm6 mulps xmm3, xmm3 mulps xmm7, xmm7 movaps xmm4, SIMD_SP_sin_c0 movaps xmm0, SIMD_SP_sin_c0 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c1 addps xmm0, SIMD_SP_sin_c1 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c2 addps xmm0, SIMD_SP_sin_c2 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c3 addps xmm0, SIMD_SP_sin_c3 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c4 addps xmm0, SIMD_SP_sin_c4 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_one addps xmm0, SIMD_SP_one mulps xmm5, xmm4 mulps xmm6, xmm0 mulps xmm5, xmm2 // xmm5 = scale0 mulps xmm6, xmm2 // xmm6 = scale1 xorps xmm6, xmm1 movaps xmm0, jointQuat0 mulps xmm0, xmm5 movaps xmm1, blendQuat0 mulps xmm1, xmm6 addps xmm0, xmm1 movaps jointQuat0, xmm0 movaps xmm1, jointQuat1 mulps xmm1, xmm5 movaps xmm2, blendQuat1 mulps xmm2, xmm6 addps xmm1, xmm2 movaps jointQuat1, xmm1 movaps xmm2, jointQuat2 mulps xmm2, xmm5 movaps xmm3, blendQuat2 mulps xmm3, xmm6 addps xmm2, xmm3 movaps jointQuat2, xmm2 movaps xmm3, jointQuat3 mulps xmm3, xmm5 movaps xmm4, blendQuat3 mulps xmm4, xmm6 addps xmm3, xmm4 movaps jointQuat3, xmm3 } #else jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] ); jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] ); jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] ); jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] ); jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] ); jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] ); jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] ); jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] ); 
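	// lerp the z components of the four gathered joint translations the same way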
jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] ); jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] ); jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] ); jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] ); ALIGN16( float cosom[4] ); ALIGN16( float sinom[4] ); ALIGN16( float omega0[4] ); ALIGN16( float omega1[4] ); ALIGN16( float scale0[4] ); ALIGN16( float scale1[4] ); ALIGN16( unsigned int signBit[4] ); cosom[0] = jointQuat0[0] * blendQuat0[0]; cosom[1] = jointQuat0[1] * blendQuat0[1]; cosom[2] = jointQuat0[2] * blendQuat0[2]; cosom[3] = jointQuat0[3] * blendQuat0[3]; cosom[0] += jointQuat1[0] * blendQuat1[0]; cosom[1] += jointQuat1[1] * blendQuat1[1]; cosom[2] += jointQuat1[2] * blendQuat1[2]; cosom[3] += jointQuat1[3] * blendQuat1[3]; cosom[0] += jointQuat2[0] * blendQuat2[0]; cosom[1] += jointQuat2[1] * blendQuat2[1]; cosom[2] += jointQuat2[2] * blendQuat2[2]; cosom[3] += jointQuat2[3] * blendQuat2[3]; cosom[0] += jointQuat3[0] * blendQuat3[0]; cosom[1] += jointQuat3[1] * blendQuat3[1]; cosom[2] += jointQuat3[2] * blendQuat3[2]; cosom[3] += jointQuat3[3] * blendQuat3[3]; signBit[0] = (*(unsigned int *)&cosom[0]) & ( 1 << 31 ); signBit[1] = (*(unsigned int *)&cosom[1]) & ( 1 << 31 ); signBit[2] = (*(unsigned int *)&cosom[2]) & ( 1 << 31 ); signBit[3] = (*(unsigned int *)&cosom[3]) & ( 1 << 31 ); (*(unsigned int *)&cosom[0]) ^= signBit[0]; (*(unsigned int *)&cosom[1]) ^= signBit[1]; (*(unsigned int *)&cosom[2]) ^= signBit[2]; (*(unsigned int *)&cosom[3]) ^= signBit[3]; scale0[0] = 1.0f - cosom[0] * cosom[0]; scale0[1] = 1.0f - cosom[1] * cosom[1]; scale0[2] = 1.0f - cosom[2] * cosom[2]; scale0[3] = 1.0f - cosom[3] * cosom[3]; scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0]; scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1]; scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2]; scale0[3] = ( scale0[3] <= 0.0f ) ? 
SIMD_SP_tiny[3] : scale0[3]; sinom[0] = idMath::RSqrt( scale0[0] ); sinom[1] = idMath::RSqrt( scale0[1] ); sinom[2] = idMath::RSqrt( scale0[2] ); sinom[3] = idMath::RSqrt( scale0[3] ); scale0[0] *= sinom[0]; scale0[1] *= sinom[1]; scale0[2] *= sinom[2]; scale0[3] *= sinom[3]; omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] ); omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] ); omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] ); omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] ); omega1[0] = lerp * omega0[0]; omega1[1] = lerp * omega0[1]; omega1[2] = lerp * omega0[2]; omega1[3] = lerp * omega0[3]; omega0[0] -= omega1[0]; omega0[1] -= omega1[1]; omega0[2] -= omega1[2]; omega0[3] -= omega1[3]; scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0]; scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1]; scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2]; scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3]; scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0]; scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1]; scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2]; scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3]; (*(unsigned int *)&scale1[0]) ^= signBit[0]; (*(unsigned int *)&scale1[1]) ^= signBit[1]; (*(unsigned int *)&scale1[2]) ^= signBit[2]; (*(unsigned int *)&scale1[3]) ^= signBit[3]; jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0]; jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1]; jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2]; jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3]; jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0]; jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1]; jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2]; jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3]; jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0]; jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1]; jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2]; jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3]; jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0]; jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1]; jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2]; jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3]; #endif for ( int j = 0; j < 4; j++ ) { int n = index[i+j]; joints[n].t[0] = jointVert0[j]; joints[n].t[1] = jointVert1[j]; joints[n].t[2] = jointVert2[j]; joints[n].q[0] = jointQuat0[j]; joints[n].q[1] = jointQuat1[j]; joints[n].q[2] = jointQuat2[j]; joints[n].q[3] = jointQuat3[j]; } } for ( ; i < numJoints; i++ ) { int n = index[i]; idVec3 &jointVert = joints[n].t; const idVec3 &blendVert = blendJoints[n].t; jointVert[0] += lerp * ( blendVert[0] - jointVert[0] ); jointVert[1] += lerp * ( blendVert[1] - jointVert[1] ); jointVert[2] += lerp * ( blendVert[2] - jointVert[2] ); idQuat &jointQuat = joints[n].q; const idQuat &blendQuat = blendJoints[n].q; float cosom; float sinom; float omega; float scale0; float scale1; unsigned int signBit; cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w; signBit = (*(unsigned int *)&cosom) & ( 1 << 31 ); (*(unsigned int *)&cosom) ^= signBit; scale0 = 1.0f - cosom * cosom; scale0 = ( scale0 <= 0.0f ) ? 
SIMD_SP_tiny[0] : scale0; sinom = idMath::InvSqrt( scale0 ); omega = idMath::ATan16( scale0 * sinom, cosom ); scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom; scale1 = idMath::Sin16( lerp * omega ) * sinom; (*(unsigned int *)&scale1) ^= signBit; jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x; jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y; jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z; jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w; } } /* ============ idSIMD_SSE::ConvertJointQuatsToJointMats ============ */ void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) { assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE ); assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) ); for ( int i = 0; i < numJoints; i++ ) { const float *q = jointQuats[i].q.ToFloatPtr(); float *m = jointMats[i].ToFloatPtr(); m[0*4+3] = q[4]; m[1*4+3] = q[5]; m[2*4+3] = q[6]; float x2 = q[0] + q[0]; float y2 = q[1] + q[1]; float z2 = q[2] + q[2]; { float xx = q[0] * x2; float yy = q[1] * y2; float zz = q[2] * z2; m[0*4+0] = 1.0f - yy - zz; m[1*4+1] = 1.0f - xx - zz; m[2*4+2] = 1.0f - xx - yy; } { float yz = q[1] * z2; float wx = q[3] * x2; m[2*4+1] = yz - wx; m[1*4+2] = yz + wx; } { float xy = q[0] * y2; float wz = q[3] * z2; m[1*4+0] = xy - wz; m[0*4+1] = xy + wz; } { float xz = q[0] * z2; float wy = q[3] * y2; m[0*4+2] = xz - wy; m[2*4+0] = xz + wy; } } } /* ============ idSIMD_SSE::ConvertJointMatsToJointQuats ============ */ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) { assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE ); assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) ); #if 1 ALIGN16( byte shuffle[16] ); __asm { mov eax, numJoints mov esi, jointMats mov edi, jointQuats and eax, ~3 jz done4 imul eax, JOINTMAT_SIZE add esi, eax neg eax loopMat4: movss xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4] movss xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4] movss xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4] shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 ) movss xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4] movss xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4] movss xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4] movss xmm5, xmm0 movss xmm6, xmm1 movss xmm7, xmm2 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 ) movss xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4] movss xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4] movss xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4] movss xmm5, xmm0 movss xmm6, xmm1 movss xmm7, xmm2 shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 ) movss xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4] movss xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4] movss xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4] movss xmm5, xmm0 movss xmm6, xmm1 movss xmm7, xmm2 // ------------------- movaps xmm0, xmm5 addps xmm0, xmm6 addps xmm0, xmm7 cmpnltps xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f movaps xmm1, xmm5 movaps xmm2, xmm5 cmpnltps xmm1, xmm6 cmpnltps xmm2, xmm7 andps xmm2, xmm1 
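		// branchless selection of the quaternion reconstruction case: the compare
		// masks built here (xmm0 = trace > 0, xmm2 = m00 largest, and the
		// m11 > m22 mask computed next in xmm4) decide per joint which of the four
		// scalar cases applies (largest of trace, m00, m11, m22) and are folded
		// into a byte shuffle plus the signs s0/s1/s2 used further down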
// xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] movaps xmm4, xmm6 cmpnltps xmm4, xmm7 // xmm3 = m[1 * 4 + 1] > m[2 * 4 + 2] movaps xmm1, xmm0 andnps xmm1, xmm2 orps xmm2, xmm0 movaps xmm3, xmm2 andnps xmm2, xmm4 orps xmm3, xmm2 xorps xmm3, SIMD_SP_not andps xmm0, SIMD_DW_mat2quatShuffle0 movaps xmm4, xmm1 andps xmm4, SIMD_DW_mat2quatShuffle1 orps xmm0, xmm4 movaps xmm4, xmm2 andps xmm4, SIMD_DW_mat2quatShuffle2 orps xmm0, xmm4 movaps xmm4, xmm3 andps xmm4, SIMD_DW_mat2quatShuffle3 orps xmm4, xmm0 movaps shuffle, xmm4 movaps xmm0, xmm2 orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0 orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2 orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1 andps xmm0, SIMD_SP_signBitMask andps xmm1, SIMD_SP_signBitMask andps xmm2, SIMD_SP_signBitMask xorps xmm5, xmm0 xorps xmm6, xmm1 xorps xmm7, xmm2 addps xmm5, xmm6 addps xmm7, SIMD_SP_one addps xmm5, xmm7 // xmm5 = t movaps xmm7, xmm5 // xmm7 = t rsqrtps xmm6, xmm5 mulps xmm5, xmm6 mulps xmm5, xmm6 subps xmm5, SIMD_SP_rsqrt_c0 mulps xmm6, SIMD_SP_mat2quat_rsqrt_c1 mulps xmm6, xmm5 // xmm5 = s mulps xmm7, xmm6 // xmm7 = s * t xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s // ------------------- add edi, 4*JOINTQUAT_SIZE movzx ecx, byte ptr shuffle[0*4+0] // ecx = k0 movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[0*4+1] // edx = k1 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[0*4+2] // ecx = k2 movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[0*4+3] // edx = k3 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4] mov [edi-4*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4] mov [edi-4*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4] mov [edi-4*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movzx ecx, byte ptr shuffle[1*4+0] // ecx = k0 movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[1*4+1] // edx = k1 movss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[1*4+2] // ecx = k2 movss xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[1*4+3] // edx = k3 movss xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * 
m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4] mov [edi-3*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4] mov [edi-3*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4] mov [edi-3*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movzx ecx, byte ptr shuffle[2*4+0] // ecx = k0 movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[2*4+1] // edx = k1 movss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[2*4+2] // ecx = k2 movss xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[2*4+3] // edx = k3 movss xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4] mov [edi-2*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4] mov [edi-2*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4] mov [edi-2*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movzx ecx, byte ptr shuffle[3*4+0] // ecx = k0 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[3*4+1] // edx = k1 movss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[3*4+2] // ecx = k2 movss xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[3*4+3] // edx = k3 movss xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4] mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4] mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4] mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; add eax, 4*JOINTMAT_SIZE jl loopMat4 done4: mov eax, numJoints and eax, 3 jz done1 imul eax, JOINTMAT_SIZE add esi, eax neg eax loopMat1: movss xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4] movss xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4] movss xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4] // ------------------- movaps xmm0, xmm5 addss xmm0, 
xmm6 addss xmm0, xmm7 cmpnltss xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f movaps xmm1, xmm5 movaps xmm2, xmm5 cmpnltss xmm1, xmm6 cmpnltss xmm2, xmm7 andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] movaps xmm4, xmm6 cmpnltss xmm4, xmm7 // xmm3 = m[1 * 4 + 1] > m[2 * 4 + 2] movaps xmm1, xmm0 andnps xmm1, xmm2 orps xmm2, xmm0 movaps xmm3, xmm2 andnps xmm2, xmm4 orps xmm3, xmm2 xorps xmm3, SIMD_SP_not andps xmm0, SIMD_DW_mat2quatShuffle0 movaps xmm4, xmm1 andps xmm4, SIMD_DW_mat2quatShuffle1 orps xmm0, xmm4 movaps xmm4, xmm2 andps xmm4, SIMD_DW_mat2quatShuffle2 orps xmm0, xmm4 movaps xmm4, xmm3 andps xmm4, SIMD_DW_mat2quatShuffle3 orps xmm4, xmm0 movss shuffle, xmm4 movaps xmm0, xmm2 orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0 orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2 orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1 andps xmm0, SIMD_SP_signBitMask andps xmm1, SIMD_SP_signBitMask andps xmm2, SIMD_SP_signBitMask xorps xmm5, xmm0 xorps xmm6, xmm1 xorps xmm7, xmm2 addss xmm5, xmm6 addss xmm7, SIMD_SP_one addss xmm5, xmm7 // xmm5 = t movss xmm7, xmm5 // xmm7 = t rsqrtss xmm6, xmm5 mulss xmm5, xmm6 mulss xmm5, xmm6 subss xmm5, SIMD_SP_rsqrt_c0 mulss xmm6, SIMD_SP_mat2quat_rsqrt_c1 mulss xmm6, xmm5 // xmm5 = s mulss xmm7, xmm6 // xmm7 = s * t xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s // ------------------- movzx ecx, byte ptr shuffle[0] // ecx = k0 add edi, JOINTQUAT_SIZE movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[1] // edx = k1 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[2] // ecx = k2 movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[3] // edx = k3 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4] mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4] mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4] mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; add eax, JOINTMAT_SIZE jl loopMat1 done1: } #elif 0 for ( int i = 0; i < numJoints; i++ ) { float s0, s1, s2; int k0, k1, k2, k3; float *q = jointQuats[i].q.ToFloatPtr(); const float *m = jointMats[i].ToFloatPtr(); if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) { k0 = 3; k1 = 2; k2 = 1; k3 = 0; s0 = 1.0f; s1 = 1.0f; s2 = 1.0f; } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) { k0 = 0; k1 = 1; k2 = 2; k3 = 3; s0 = 1.0f; s1 = -1.0f; s2 = -1.0f; } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) { k0 = 1; k1 = 0; k2 = 3; k3 = 2; s0 = -1.0f; s1 = 1.0f; s2 = -1.0f; } else { k0 = 2; k1 = 3; k2 = 0; k3 = 1; s0 = -1.0f; s1 = -1.0f; s2 = 1.0f; } float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[k0] = s * t; q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 
4 + 1] ) * s; q[4] = m[0 * 4 + 3]; q[5] = m[1 * 4 + 3]; q[6] = m[2 * 4 + 3]; } #elif 1 for ( int i = 0; i < numJoints; i++ ) { float *q = jointQuats[i].q.ToFloatPtr(); const float *m = jointMats[i].ToFloatPtr(); if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) { float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[3] = s * t; q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s; q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s; q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s; } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) { float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[0] = s * t; q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s; q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s; q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s; } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) { float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[1] = s * t; q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s; q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s; q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s; } else { float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[2] = s * t; q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s; q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s; q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s; } q[4] = m[0 * 4 + 3]; q[5] = m[1 * 4 + 3]; q[6] = m[2 * 4 + 3]; } #endif } /* ============ idSIMD_SSE::TransformJoints ============ */ void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) { #if 1 assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); __asm { mov ecx, firstJoint mov eax, lastJoint sub eax, ecx jl done imul ecx, 4 mov edi, parents add edi, ecx imul ecx, 12 mov esi, jointMats imul eax, 4 add edi, eax neg eax loopJoint: movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0 mov edx, [edi+eax] movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1 imul edx, JOINTMAT_SIZE movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2 movss xmm4, [esi+edx+ 0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm0 movss xmm5, [esi+edx+ 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm1 addps xmm4, xmm5 movss xmm6, [esi+edx+ 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm2 addps xmm4, xmm6 movss xmm5, [esi+edx+16] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm0 movss xmm7, [esi+edx+12] shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) addps xmm4, xmm7 movaps [esi+ecx+ 0], xmm4 movss xmm6, [esi+edx+20] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm1 addps xmm5, xmm6 movss xmm7, [esi+edx+24] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm5, xmm7 movss xmm6, [esi+edx+32] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm3, [esi+edx+28] shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 ) addps xmm5, xmm3 movaps [esi+ecx+16], xmm5 movss xmm7, [esi+edx+36] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm3, [esi+edx+40] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm2 addps xmm6, xmm3 movss xmm7, [esi+edx+44] shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) addps xmm6, xmm7 movaps [esi+ecx+32], xmm6 add ecx, JOINTMAT_SIZE add eax, 4 jle loopJoint done: } #else int i; for( i = firstJoint; i <= lastJoint; i++ ) { assert( parents[i] < i ); jointMats[i] *= jointMats[parents[i]]; } #endif } /* ============ idSIMD_SSE::UntransformJoints 
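
undoes TransformJoints by transforming each joint back into the space of its parent;
runs from lastJoint down to firstJoint so each parent still holds its accumulated
transform when its children are processed; each step effectively computes
jointMats[i] /= jointMats[parents[i]] (see the scalar fallback below)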
============ */ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) { #if 1 assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); __asm { mov edx, firstJoint mov eax, lastJoint mov ecx, eax sub eax, edx jl done mov esi, jointMats imul ecx, JOINTMAT_SIZE imul edx, 4 mov edi, parents add edi, edx imul eax, 4 loopJoint: movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0 mov edx, [edi+eax] movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1 imul edx, JOINTMAT_SIZE movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2 movss xmm6, [esi+edx+12] shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 ) subps xmm0, xmm6 movss xmm7, [esi+edx+28] shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 ) subps xmm1, xmm7 movss xmm3, [esi+edx+44] shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 ) subps xmm2, xmm3 movss xmm4, [esi+edx+ 0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm4, xmm0 movss xmm5, [esi+edx+16] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm1 addps xmm4, xmm5 movss xmm6, [esi+edx+32] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm2 addps xmm4, xmm6 movaps [esi+ecx+ 0], xmm4 movss xmm5, [esi+edx+ 4] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm5, xmm0 movss xmm6, [esi+edx+20] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm1 addps xmm5, xmm6 movss xmm7, [esi+edx+36] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm5, xmm7 movaps [esi+ecx+16], xmm5 movss xmm6, [esi+edx+ 8] shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm7, [esi+edx+24] shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm3, [esi+edx+40] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm3, xmm2 addps xmm6, xmm3 movaps [esi+ecx+32], xmm6 sub ecx, JOINTMAT_SIZE sub eax, 4 jge loopJoint done: } #else int i; for( i = lastJoint; i >= firstJoint; i-- ) { assert( parents[i] < i ); jointMats[i] /= jointMats[parents[i]]; } #endif } /* ============ idSIMD_SSE::TransformVerts ============ */ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) { #if 1 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE ); assert( sizeof( idJointMat ) == JOINTMAT_SIZE ); __asm { mov eax, numVerts test eax, eax jz done imul eax, DRAWVERT_SIZE mov ecx, verts mov edx, index mov esi, weights mov edi, joints add ecx, eax neg eax loopVert: mov ebx, [edx] movaps xmm2, [esi] add edx, 8 movaps xmm0, xmm2 add esi, JOINTWEIGHT_SIZE movaps xmm1, xmm2 mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0 mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1 mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2 cmp dword ptr [edx-4], 0 jne doneWeight loopWeight: mov ebx, [edx] movaps xmm5, [esi] add edx, 8 movaps xmm3, xmm5 add esi, JOINTWEIGHT_SIZE movaps xmm4, xmm5 mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0 mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1 mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2 cmp dword ptr [edx-4], 0 addps xmm0, xmm3 addps xmm1, xmm4 addps xmm2, xmm5 je loopWeight doneWeight: add eax, DRAWVERT_SIZE movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4 unpckhps xmm0, xmm1 // xmm1 = m2, m5, t0, t1 addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2 movlhps xmm2, xmm6 
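			// (the surrounding unpck/movlhps/movhlps shuffles are a horizontal add:
			// they regroup lanes so that x, y and z of the skinned position each
			// end up as the sum of a matrix-row dot product and its translation term)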
// xmm2 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+0], xmm6 movaps xmm5, xmm6 // xmm5 = m6+m8, m7+t2 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8 addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+8], xmm5 jl loopVert done: } #else int i, j; const byte *jointsPtr = (byte *)joints; for( j = i = 0; i < numVerts; i++ ) { idVec3 v; v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j]; while( index[j*2+1] == 0 ) { j++; v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j]; } j++; verts[i].xyz = v; } #endif } /* ============ idSIMD_SSE::TracePointCull ============ */ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) { #if 1 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); __asm { push ebx mov eax, numVerts test eax, eax jz done mov edi, planes movlps xmm1, [edi] // xmm1 = 0, 1, X, X movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5 movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7 movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13 movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15 movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12 shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13 movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7 shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14 shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15 movss xmm7, radius shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) xor edx, edx mov esi, verts mov edi, cullBits imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert: movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4] mulps xmm4, xmm0 shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulps xmm5, xmm1 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) addps xmm4, xmm5 mulps xmm6, xmm2 addps xmm4, xmm3 addps xmm4, xmm6 movaps xmm5, xmm4 xorps xmm5, SIMD_SP_signBitMask cmpltps xmm4, xmm7 movmskps ecx, xmm4 cmpltps xmm5, xmm7 movmskps ebx, xmm5 shl cx, 4 or cl, bl inc edi or dl, cl add eax, DRAWVERT_SIZE mov byte ptr [edi-1], cl jl loopVert done: mov esi, totalOr mov byte ptr [esi], dl pop ebx } #else int i; byte tOr; tOr = 0; for ( i = 0; i < numVerts; i++ ) { byte bits; float d0, d1, d2, d3, t; const idVec3 &v = verts[i].xyz; d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3]; d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3]; d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3]; d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3]; t = d0 + radius; bits = FLOATSIGNBITSET( t ) << 0; t = d1 + radius; bits |= FLOATSIGNBITSET( t ) << 1; t = d2 + radius; bits |= FLOATSIGNBITSET( t ) << 2; t = d3 + radius; bits |= FLOATSIGNBITSET( t ) << 3; t = d0 - radius; bits |= FLOATSIGNBITSET( t ) << 4; t = d1 - radius; bits |= FLOATSIGNBITSET( t ) << 5; t = d2 - radius; bits |= FLOATSIGNBITSET( t ) << 6; t = d3 - radius; bits |= FLOATSIGNBITSET( t ) << 7; bits ^= 0x0F; // flip lower four bits tOr 
|= bits; cullBits[i] = bits; } totalOr = tOr; #endif } /* ============ idSIMD_SSE::DecalPointCull ============ */ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) { #if 1 ALIGN16( float p0[4] ); ALIGN16( float p1[4] ); ALIGN16( float p2[4] ); ALIGN16( float p3[4] ); ALIGN16( float p4[4] ); ALIGN16( float p5[4] ); ALIGN16( float p6[4] ); ALIGN16( float p7[4] ); assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); __asm { mov ecx, planes movlps xmm1, [ecx] // xmm1 = 0, 1, X, X movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5 movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7 movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13 movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15 movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12 shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13 movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7 shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14 shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15 movaps p0, xmm0 movaps p1, xmm1 movaps p2, xmm2 movaps p3, xmm3 movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51 movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51 shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50 shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51 movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53 movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53 shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52 shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53 movaps p4, xmm4 movaps p5, xmm5 movaps p6, xmm6 movaps p7, xmm7 mov esi, verts mov edi, cullBits mov eax, numVerts and eax, ~1 jz done2 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert2: movaps xmm6, p0 movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 movaps xmm6, p0 movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm3 movaps xmm7, p1 movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movaps xmm7, p2 movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps edx, xmm6 mov ch, dl shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm0, p4 shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm1, p5 addps xmm0, xmm1 shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 add edi, 2 mov dh, dl shl dl, 4 shl dh, 2 and edx, (3<<4)|(3<<12) or ecx, edx add eax, 2*DRAWVERT_SIZE mov word ptr [edi-2], cx jl 
loopVert2 done2: mov eax, numVerts and eax, 1 jz done movaps xmm6, p0 movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 mulps xmm0, p4 mulps xmm1, p5 addps xmm0, xmm1 mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 and edx, 3 shl edx, 4 or ecx, edx mov byte ptr [edi], cl done: } #else int i; for ( i = 0; i < numVerts; i += 2 ) { unsigned short bits0, bits1; float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; const idVec3 &v0 = verts[i+0].xyz; const idVec3 &v1 = verts[i+1].xyz; d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3]; d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3]; d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3]; d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3]; d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3]; d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3]; d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3]; d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3]; d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3]; d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3]; d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3]; d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3]; bits0 = FLOATSIGNBITSET( d0 ) << (0+0); bits0 |= FLOATSIGNBITSET( d1 ) << (0+1); bits0 |= FLOATSIGNBITSET( d2 ) << (0+2); bits0 |= FLOATSIGNBITSET( d3 ) << (0+3); bits0 |= FLOATSIGNBITSET( d4 ) << (0+4); bits0 |= FLOATSIGNBITSET( d5 ) << (0+5); bits1 = FLOATSIGNBITSET( d6 ) << (8+0); bits1 |= FLOATSIGNBITSET( d7 ) << (8+1); bits1 |= FLOATSIGNBITSET( d8 ) << (8+2); bits1 |= FLOATSIGNBITSET( d9 ) << (8+3); bits1 |= FLOATSIGNBITSET( d10 ) << (8+4); bits1 |= FLOATSIGNBITSET( d11 ) << (8+5); *(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F; } if ( numVerts & 1 ) { byte bits; float d0, d1, d2, d3, d4, d5; const idVec3 &v = verts[numVerts - 1].xyz; d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3]; d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3]; d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3]; d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3]; d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3]; d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3]; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 2; bits |= FLOATSIGNBITSET( d3 ) << 3; bits |= FLOATSIGNBITSET( d4 ) << 4; bits |= FLOATSIGNBITSET( d5 ) << 5; cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits } #endif } /* ============ idSIMD_SSE::OverlayPointCull ============ */ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, 
idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) { #if 1 assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); __asm { mov eax, numVerts mov edx, verts mov esi, texCoords mov edi, cullBits mov ecx, planes movss xmm4, [ecx+ 0] movss xmm5, [ecx+16] shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) movss xmm5, [ecx+ 4] movss xmm6, [ecx+20] shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) movss xmm6, [ecx+ 8] movss xmm7, [ecx+24] shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) movss xmm7, [ecx+12] movss xmm0, [ecx+28] shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 ) and eax, ~1 jz done2 add edi, eax neg eax loopVert2: movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm0, xmm4 movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm1, xmm5 movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 addps xmm0, xmm7 movaps [esi], xmm0 movaps xmm1, xmm0 movaps xmm2, SIMD_SP_one subps xmm2, xmm0 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 ) add edx, 2*DRAWVERT_SIZE movmskps ecx, xmm0 mov byte ptr [edi+eax+0], cl add esi, 4*4 movmskps ecx, xmm1 mov byte ptr [edi+eax+1], cl add eax, 2 jl loopVert2 done2: mov eax, numVerts and eax, 1 jz done movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm0, xmm4 movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm1, xmm5 movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 addps xmm0, xmm7 movlps [esi], xmm0 movaps xmm1, xmm0 movaps xmm2, SIMD_SP_one subps xmm2, xmm0 shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) movmskps ecx, xmm0 mov byte ptr [edi], cl done: } #else const idPlane &p0 = planes[0]; const idPlane &p1 = planes[1]; for ( int i = 0; i < numVerts - 1; i += 2 ) { unsigned short bits; float d0, d1, d2, d3; const idVec3 &v0 = verts[i+0].xyz; const idVec3 &v1 = verts[i+1].xyz; d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3]; d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3]; d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3]; d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3]; texCoords[i+0][0] = d0; texCoords[i+0][1] = d1; texCoords[i+1][0] = d2; texCoords[i+1][1] = d3; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 8; bits |= FLOATSIGNBITSET( d3 ) << 9; d0 = 1.0f - d0; d1 = 1.0f - d1; d2 = 1.0f - d2; d3 = 1.0f - d3; bits |= FLOATSIGNBITSET( d0 ) << 2; bits |= FLOATSIGNBITSET( d1 ) << 3; bits |= FLOATSIGNBITSET( d2 ) << 10; bits |= FLOATSIGNBITSET( d3 ) << 11; *(unsigned short *)(cullBits + i) = bits; } if ( numVerts & 1 ) { byte bits; float d0, d1; const idPlane &p0 = planes[0]; const idPlane &p1 = planes[1]; const idVec3 &v0 = verts[numVerts - 1].xyz; d0 = p0[0] * v0[0] + p0[1] * v0[1] 
				+ p0[2] * v0[2] + p0[3];
		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];

		// last (odd) vertex; the loop counter above is out of scope here
		texCoords[numVerts - 1][0] = d0;
		texCoords[numVerts - 1][1] = d1;

		bits  = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;

		d0 = 1.0f - d0;
		d1 = 1.0f - d1;

		bits |= FLOATSIGNBITSET( d0 ) << 2;
		bits |= FLOATSIGNBITSET( d1 ) << 3;

		cullBits[numVerts - 1] = bits;
	}

#endif
}

/*
============
idSIMD_SSE::DeriveTriPlanes
============
*/
void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
#if 1

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	__asm {
		mov		eax, numIndexes
		shl		eax, 2
		mov		esi, verts
		mov		edi, indexes
		mov		edx, planes

		add		edi, eax
		neg		eax

		add		eax, 4*12
		jge		done4

	loopPlane4:
		// gather (b - a) and (c - a) edge deltas for four triangles,
		// inserted into lane 0 and rotated into SoA order as they arrive

		// triangle 0
		mov		ebx, [edi+eax-4*12+4]
		imul	ebx, DRAWVERT_SIZE
		mov		ecx, [edi+eax-4*12+0]
		imul	ecx, DRAWVERT_SIZE

		movss	xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss	xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss	xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss	xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss	xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		mov		ebx, [edi+eax-4*12+8]
		imul	ebx, DRAWVERT_SIZE

		// rotate lanes so the next triangle's deltas can go into lane 0
		shufps	xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss	xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss	xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss	xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss	xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss	xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		// triangle 1
		mov		ebx, [edi+eax-3*12+4]
		imul	ebx, DRAWVERT_SIZE
		mov		ecx, [edi+eax-3*12+0]
		imul	ecx, DRAWVERT_SIZE

		shufps	xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss	xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss	xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm0, xmm6
		movss	xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss	xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss	xmm1, xmm7
		movss	xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss	xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss	xmm2, xmm6

		mov		ebx, [edi+eax-3*12+8]
		imul	ebx, DRAWVERT_SIZE

		shufps	xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss	xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss	xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm3, xmm7
		movss	xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss	xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss	xmm4, xmm6
		movss	xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss	xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss	xmm5, xmm7

		// triangle 2
		mov		ebx, [edi+eax-2*12+4]
		imul	ebx, DRAWVERT_SIZE
		mov		ecx, [edi+eax-2*12+0]
		imul	ecx, DRAWVERT_SIZE

		shufps	xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss	xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss	xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm0, xmm6
		movss	xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss	xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss	xmm1, xmm7
		movss	xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss	xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss	xmm2, xmm6

		mov		ebx, [edi+eax-2*12+8]
		imul	ebx, DRAWVERT_SIZE

		shufps	xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps	xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss	xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss	xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss	xmm3, xmm7
		movss	xmm6,
[esi+ebx+DRAWVERT_XYZ_OFFSET+4] subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm4, xmm6 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm5, xmm7 mov ebx, [edi+eax-1*12+4] imul ebx, DRAWVERT_SIZE mov ecx, [edi+eax-1*12+0] imul ecx, DRAWVERT_SIZE shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 ) shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 ) movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm0, xmm6 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm1, xmm7 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm2, xmm6 mov ebx, [edi+eax-1*12+8] imul ebx, DRAWVERT_SIZE movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm3, xmm7 movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm4, xmm6 movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm5, xmm7 movaps xmm6, xmm4 mulps xmm6, xmm2 movaps xmm7, xmm5 mulps xmm7, xmm1 subps xmm6, xmm7 mulps xmm5, xmm0 mulps xmm2, xmm3 subps xmm5, xmm2 mulps xmm3, xmm1 mulps xmm4, xmm0 subps xmm3, xmm4 movaps xmm0, xmm6 mulps xmm6, xmm6 movaps xmm1, xmm5 mulps xmm5, xmm5 movaps xmm2, xmm3 mulps xmm3, xmm3 addps xmm3, xmm5 addps xmm3, xmm6 rsqrtps xmm3, xmm3 add edx, 4*16 mov ecx, [edi+eax-1*12+0] imul ecx, DRAWVERT_SIZE mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 movss [edx-1*16+0], xmm0 movss [edx-1*16+4], xmm1 movss [edx-1*16+8], xmm2 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 movss [edx-1*16+12], xmm0 mov ecx, [edi+eax-2*12+0] imul ecx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [edx-2*16+0], xmm0 movss [edx-2*16+4], xmm1 movss [edx-2*16+8], xmm2 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 movss [edx-2*16+12], xmm0 mov ecx, [edi+eax-3*12+0] imul ecx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [edx-3*16+0], xmm0 movss [edx-3*16+4], xmm1 movss [edx-3*16+8], xmm2 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 movss [edx-3*16+12], xmm0 mov ecx, [edi+eax-4*12+0] imul ecx, DRAWVERT_SIZE shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [edx-4*16+0], xmm0 movss [edx-4*16+4], xmm1 movss [edx-4*16+8], xmm2 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 movss [edx-4*16+12], xmm0 add eax, 4*12 jle loopPlane4 done4: sub eax, 4*12 jge done loopPlane1: mov ebx, [edi+eax+4] imul ebx, DRAWVERT_SIZE mov ecx, [edi+eax+0] imul ecx, DRAWVERT_SIZE movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] 
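		// the normalization below relies on rsqrtss, which is only accurate to
		// about 12 bits, roughly matching idMath::RSqrt in the C reference path;
		// a minimal intrinsics sketch of the one Newton-Raphson refinement step
		// that the REFINE_TANGENT_SQUAREROOT path further down also performs
		// ('lenSqr' is a hypothetical __m128 holding the squared length in
		// lane 0; illustrative only, never compiled):
#if 0
		__m128 y = _mm_rsqrt_ss( lenSqr );						// ~12 bit estimate of 1/sqrt(x)
		__m128 xyy = _mm_mul_ss( lenSqr, _mm_mul_ss( y, y ) );	// x*y*y
		y = _mm_mul_ss( _mm_mul_ss( _mm_set_ss( 0.5f ), y ),
						_mm_sub_ss( _mm_set_ss( 3.0f ), xyy ) );	// y' = 0.5*y*(3 - x*y*y)
#endif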
subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] mov ebx, [edi+eax+8] imul ebx, DRAWVERT_SIZE movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm6, xmm4 mulss xmm6, xmm2 movss xmm7, xmm5 mulss xmm7, xmm1 subss xmm6, xmm7 mulss xmm5, xmm0 mulss xmm2, xmm3 subss xmm5, xmm2 mulss xmm3, xmm1 mulss xmm4, xmm0 subss xmm3, xmm4 movss xmm0, xmm6 mulss xmm6, xmm6 movss xmm1, xmm5 mulss xmm5, xmm5 movss xmm2, xmm3 mulss xmm3, xmm3 addss xmm3, xmm5 addss xmm3, xmm6 rsqrtss xmm3, xmm3 add edx, 1*16 mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 movss [edx-1*16+0], xmm0 movss [edx-1*16+4], xmm1 movss [edx-1*16+8], xmm2 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] xorps xmm0, SIMD_SP_singleSignBitMask subss xmm0, xmm1 subss xmm0, xmm2 movss [edx-1*16+12], xmm0 add eax, 1*12 jl loopPlane1 done: } #else int i, j; for ( i = 0; i <= numIndexes - 12; i += 12 ) { ALIGN16( float d0[4] ); ALIGN16( float d1[4] ); ALIGN16( float d2[4] ); ALIGN16( float d3[4] ); ALIGN16( float d4[4] ); ALIGN16( float d5[4] ); ALIGN16( float n0[4] ); ALIGN16( float n1[4] ); ALIGN16( float n2[4] ); for ( j = 0; j < 4; j++ ) { const idDrawVert *a, *b, *c; a = verts + indexes[i + j * 3 + 0]; b = verts + indexes[i + j * 3 + 1]; c = verts + indexes[i + j * 3 + 2]; d0[j] = b->xyz[0] - a->xyz[0]; d1[j] = b->xyz[1] - a->xyz[1]; d2[j] = b->xyz[2] - a->xyz[2]; d3[j] = c->xyz[0] - a->xyz[0]; d4[j] = c->xyz[1] - a->xyz[1]; d5[j] = c->xyz[2] - a->xyz[2]; } ALIGN16( float tmp[4] ); n0[0] = d4[0] * d2[0]; n0[1] = d4[1] * d2[1]; n0[2] = d4[2] * d2[2]; n0[3] = d4[3] * d2[3]; n0[0] -= d5[0] * d1[0]; n0[1] -= d5[1] * d1[1]; n0[2] -= d5[2] * d1[2]; n0[3] -= d5[3] * d1[3]; n1[0] = d5[0] * d0[0]; n1[1] = d5[1] * d0[1]; n1[2] = d5[2] * d0[2]; n1[3] = d5[3] * d0[3]; n1[0] -= d3[0] * d2[0]; n1[1] -= d3[1] * d2[1]; n1[2] -= d3[2] * d2[2]; n1[3] -= d3[3] * d2[3]; n2[0] = d3[0] * d1[0]; n2[1] = d3[1] * d1[1]; n2[2] = d3[2] * d1[2]; n2[3] = d3[3] * d1[3]; n2[0] -= d4[0] * d0[0]; n2[1] -= d4[1] * d0[1]; n2[2] -= d4[2] * d0[2]; n2[3] -= d4[3] * d0[3]; tmp[0] = n0[0] * n0[0]; tmp[1] = n0[1] * n0[1]; tmp[2] = n0[2] * n0[2]; tmp[3] = n0[3] * n0[3]; tmp[0] += n1[0] * n1[0]; tmp[1] += n1[1] * n1[1]; tmp[2] += n1[2] * n1[2]; tmp[3] += n1[3] * n1[3]; tmp[0] += n2[0] * n2[0]; tmp[1] += n2[1] * n2[1]; tmp[2] += n2[2] * n2[2]; tmp[3] += n2[3] * n2[3]; tmp[0] = idMath::RSqrt( tmp[0] ); tmp[1] = idMath::RSqrt( tmp[1] ); tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); n0[0] *= tmp[0]; n0[1] *= tmp[1]; n0[2] *= tmp[2]; n0[3] *= tmp[3]; n1[0] *= tmp[0]; n1[1] *= tmp[1]; n1[2] *= tmp[2]; n1[3] *= tmp[3]; n2[0] *= tmp[0]; n2[1] *= tmp[1]; n2[2] *= tmp[2]; n2[3] *= tmp[3]; for ( j = 0; j < 4; j++ ) { const idDrawVert *a; a = verts + indexes[i + j * 3]; planes->Normal()[0] = n0[j]; planes->Normal()[1] = n1[j]; planes->Normal()[2] = n2[j]; planes->FitThroughPoint( a->xyz ); planes++; } } for ( ; i < numIndexes; i += 3 ) { const idDrawVert *a, *b, *c; float d0, d1, d2, d3, d4, d5; float n0, n1, n2; a = verts + indexes[i + 0]; b = verts + indexes[i + 1]; c = verts + indexes[i + 2]; d0 = 
b->xyz[0] - a->xyz[0]; d1 = b->xyz[1] - a->xyz[1]; d2 = b->xyz[2] - a->xyz[2]; d3 = c->xyz[0] - a->xyz[0]; d4 = c->xyz[1] - a->xyz[1]; d5 = c->xyz[2] - a->xyz[2]; float tmp; n0 = d4 * d2 - d5 * d1; n1 = d5 * d0 - d3 * d2; n2 = d3 * d1 - d4 * d0; tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 ); n0 *= tmp; n1 *= tmp; n2 *= tmp; planes->Normal()[0] = n0; planes->Normal()[1] = n1; planes->Normal()[2] = n2; planes->FitThroughPoint( a->xyz ); planes++; } #endif } /* ============ idSIMD_SSE::DeriveTangents ============ */ //#define REFINE_TANGENT_SQUAREROOT #define FIX_DEGENERATE_TANGENT void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) { int i; assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET ); assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); assert( planes != NULL ); assert( verts != NULL ); assert( numVerts >= 0 ); #ifdef REFINE_TANGENT_SQUAREROOT __asm { movaps xmm6, SIMD_SP_rsqrt_c0 movaps xmm7, SIMD_SP_rsqrt_c1 } #endif bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) ); memset( used, 0, numVerts * sizeof( used[0] ) ); for ( i = 0; i <= numIndexes - 12; i += 12 ) { idDrawVert *a, *b, *c; ALIGN16( unsigned int signBit[4] ); ALIGN16( float d0[4] ); ALIGN16( float d1[4] ); ALIGN16( float d2[4] ); ALIGN16( float d3[4] ); ALIGN16( float d4[4] ); ALIGN16( float d5[4] ); ALIGN16( float d6[4] ); ALIGN16( float d7[4] ); ALIGN16( float d8[4] ); ALIGN16( float d9[4] ); ALIGN16( float n0[4] ); ALIGN16( float n1[4] ); ALIGN16( float n2[4] ); ALIGN16( float t0[4] ); ALIGN16( float t1[4] ); ALIGN16( float t2[4] ); ALIGN16( float t3[4] ); ALIGN16( float t4[4] ); ALIGN16( float t5[4] ); for ( int j = 0; j < 4; j++ ) { a = verts + indexes[i + j * 3 + 0]; b = verts + indexes[i + j * 3 + 1]; c = verts + indexes[i + j * 3 + 2]; d0[j] = b->xyz[0] - a->xyz[0]; d1[j] = b->xyz[1] - a->xyz[1]; d2[j] = b->xyz[2] - a->xyz[2]; d3[j] = b->st[0] - a->st[0]; d4[j] = b->st[1] - a->st[1]; d5[j] = c->xyz[0] - a->xyz[0]; d6[j] = c->xyz[1] - a->xyz[1]; d7[j] = c->xyz[2] - a->xyz[2]; d8[j] = c->st[0] - a->st[0]; d9[j] = c->st[1] - a->st[1]; } #if 1 __asm { // normal movaps xmm0, d6 mulps xmm0, d2 movaps xmm1, d7 mulps xmm1, d1 subps xmm0, xmm1 movaps xmm1, d7 mulps xmm1, d0 movaps xmm2, d5 mulps xmm2, d2 subps xmm1, xmm2 movaps xmm2, d5 mulps xmm2, d1 movaps xmm3, d6 mulps xmm3, d0 subps xmm2, xmm3 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, xmm3 mulps xmm4, xmm4 mulps xmm5, xmm5 addps xmm3, xmm4 addps xmm3, xmm5 #ifdef FIX_DEGENERATE_TANGENT xorps xmm4, xmm4 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 #endif #ifdef REFINE_TANGENT_SQUAREROOT rsqrtps xmm4, xmm3 mulps xmm3, xmm4 mulps xmm3, xmm4 subps xmm3, xmm6 mulps xmm4, xmm7 mulps xmm3, xmm4 #else rsqrtps xmm3, xmm3 #endif mulps xmm0, xmm3 movaps n0, xmm0 mulps xmm1, xmm3 movaps n1, xmm1 mulps xmm2, xmm3 movaps n2, xmm2 // area sign bit movaps xmm0, d3 mulps xmm0, d9 movaps xmm1, d4 mulps xmm1, d8 subps xmm0, xmm1 andps xmm0, SIMD_SP_signBitMask movaps signBit, xmm0 // first tangent movaps xmm0, d0 mulps xmm0, d9 movaps xmm1, d4 mulps xmm1, d5 subps xmm0, xmm1 movaps xmm1, d1 mulps xmm1, d9 movaps xmm2, d4 mulps xmm2, d6 subps xmm1, xmm2 movaps xmm2, d2 
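		// with the edge vectors e1 = b - a (d0..d2) and e2 = c - a (d5..d7),
		// and the t-coordinate deltas d4 = b->st[1] - a->st[1] and
		// d9 = c->st[1] - a->st[1], these lanes compute T = d9*e1 - d4*e2 per
		// triangle; T is then normalized and sign-corrected by 'signBit' below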
mulps xmm2, d9 movaps xmm3, d4 mulps xmm3, d7 subps xmm2, xmm3 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, xmm3 mulps xmm4, xmm4 mulps xmm5, xmm5 addps xmm3, xmm4 addps xmm3, xmm5 #ifdef FIX_DEGENERATE_TANGENT xorps xmm4, xmm4 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 #endif #ifdef REFINE_TANGENT_SQUAREROOT rsqrtps xmm4, xmm3 mulps xmm3, xmm4 mulps xmm3, xmm4 subps xmm3, xmm6 mulps xmm4, xmm7 mulps xmm3, xmm4 #else rsqrtps xmm3, xmm3 #endif xorps xmm3, signBit mulps xmm0, xmm3 movaps t0, xmm0 mulps xmm1, xmm3 movaps t1, xmm1 mulps xmm2, xmm3 movaps t2, xmm2 // second tangent movaps xmm0, d3 mulps xmm0, d5 movaps xmm1, d0 mulps xmm1, d8 subps xmm0, xmm1 movaps xmm1, d3 mulps xmm1, d6 movaps xmm2, d1 mulps xmm2, d8 subps xmm1, xmm2 movaps xmm2, d3 mulps xmm2, d7 movaps xmm3, d2 mulps xmm3, d8 subps xmm2, xmm3 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, xmm3 mulps xmm4, xmm4 mulps xmm5, xmm5 addps xmm3, xmm4 addps xmm3, xmm5 #ifdef FIX_DEGENERATE_TANGENT xorps xmm4, xmm4 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 #endif #ifdef REFINE_TANGENT_SQUAREROOT rsqrtps xmm4, xmm3 mulps xmm3, xmm4 mulps xmm3, xmm4 subps xmm3, xmm6 mulps xmm4, xmm7 mulps xmm3, xmm4 #else rsqrtps xmm3, xmm3 #endif xorps xmm3, signBit mulps xmm0, xmm3 movaps t3, xmm0 mulps xmm1, xmm3 movaps t4, xmm1 mulps xmm2, xmm3 movaps t5, xmm2 } #else ALIGN16( float tmp[4] ); // normal n0[0] = d6[0] * d2[0]; n0[1] = d6[1] * d2[1]; n0[2] = d6[2] * d2[2]; n0[3] = d6[3] * d2[3]; n0[0] -= d7[0] * d1[0]; n0[1] -= d7[1] * d1[1]; n0[2] -= d7[2] * d1[2]; n0[3] -= d7[3] * d1[3]; n1[0] = d7[0] * d0[0]; n1[1] = d7[1] * d0[1]; n1[2] = d7[2] * d0[2]; n1[3] = d7[3] * d0[3]; n1[0] -= d5[0] * d2[0]; n1[1] -= d5[1] * d2[1]; n1[2] -= d5[2] * d2[2]; n1[3] -= d5[3] * d2[3]; n2[0] = d5[0] * d1[0]; n2[1] = d5[1] * d1[1]; n2[2] = d5[2] * d1[2]; n2[3] = d5[3] * d1[3]; n2[0] -= d6[0] * d0[0]; n2[1] -= d6[1] * d0[1]; n2[2] -= d6[2] * d0[2]; n2[3] -= d6[3] * d0[3]; tmp[0] = n0[0] * n0[0]; tmp[1] = n0[1] * n0[1]; tmp[2] = n0[2] * n0[2]; tmp[3] = n0[3] * n0[3]; tmp[0] += n1[0] * n1[0]; tmp[1] += n1[1] * n1[1]; tmp[2] += n1[2] * n1[2]; tmp[3] += n1[3] * n1[3]; tmp[0] += n2[0] * n2[0]; tmp[1] += n2[1] * n2[1]; tmp[2] += n2[2] * n2[2]; tmp[3] += n2[3] * n2[3]; tmp[0] = idMath::RSqrt( tmp[0] ); tmp[1] = idMath::RSqrt( tmp[1] ); tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); n0[0] *= tmp[0]; n0[1] *= tmp[1]; n0[2] *= tmp[2]; n0[3] *= tmp[3]; n1[0] *= tmp[0]; n1[1] *= tmp[1]; n1[2] *= tmp[2]; n1[3] *= tmp[3]; n2[0] *= tmp[0]; n2[1] *= tmp[1]; n2[2] *= tmp[2]; n2[3] *= tmp[3]; // area sign bit tmp[0] = d3[0] * d9[0]; tmp[1] = d3[1] * d9[1]; tmp[2] = d3[2] * d9[2]; tmp[3] = d3[3] * d9[3]; tmp[0] -= d4[0] * d8[0]; tmp[1] -= d4[1] * d8[1]; tmp[2] -= d4[2] * d8[2]; tmp[3] -= d4[3] * d8[3]; signBit[0] = ( *(unsigned int *)&tmp[0] ) & ( 1 << 31 ); signBit[1] = ( *(unsigned int *)&tmp[1] ) & ( 1 << 31 ); signBit[2] = ( *(unsigned int *)&tmp[2] ) & ( 1 << 31 ); signBit[3] = ( *(unsigned int *)&tmp[3] ) & ( 1 << 31 ); // first tangent t0[0] = d0[0] * d9[0]; t0[1] = d0[1] * d9[1]; t0[2] = d0[2] * d9[2]; t0[3] = d0[3] * d9[3]; t0[0] -= d4[0] * d5[0]; t0[1] -= d4[1] * d5[1]; t0[2] -= d4[2] * d5[2]; t0[3] -= d4[3] * d5[3]; t1[0] = d1[0] * d9[0]; 
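		// the '^= signBit[..]' statements below are a branchless conditional
		// negate: signBit[] keeps only the IEEE 754 sign bit of the texture-space
		// area d3*d9 - d4*d8, and XORing it into the reciprocal length flips the
		// tangent on mirrored texture coordinates. A scalar sketch of the idiom
		// ('area' and 'scale' are hypothetical floats; illustrative only):
#if 0
		unsigned int sign = ( *(unsigned int *)&area ) & ( 1u << 31 );
		*(unsigned int *)&scale ^= sign;	// flips the sign of scale when area is negative
#endif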
t1[1] = d1[1] * d9[1]; t1[2] = d1[2] * d9[2]; t1[3] = d1[3] * d9[3]; t1[0] -= d4[0] * d6[0]; t1[1] -= d4[1] * d6[1]; t1[2] -= d4[2] * d6[2]; t1[3] -= d4[3] * d6[3]; t2[0] = d2[0] * d9[0]; t2[1] = d2[1] * d9[1]; t2[2] = d2[2] * d9[2]; t2[3] = d2[3] * d9[3]; t2[0] -= d4[0] * d7[0]; t2[1] -= d4[1] * d7[1]; t2[2] -= d4[2] * d7[2]; t2[3] -= d4[3] * d7[3]; tmp[0] = t0[0] * t0[0]; tmp[1] = t0[1] * t0[1]; tmp[2] = t0[2] * t0[2]; tmp[3] = t0[3] * t0[3]; tmp[0] += t1[0] * t1[0]; tmp[1] += t1[1] * t1[1]; tmp[2] += t1[2] * t1[2]; tmp[3] += t1[3] * t1[3]; tmp[0] += t2[0] * t2[0]; tmp[1] += t2[1] * t2[1]; tmp[2] += t2[2] * t2[2]; tmp[3] += t2[3] * t2[3]; tmp[0] = idMath::RSqrt( tmp[0] ); tmp[1] = idMath::RSqrt( tmp[1] ); tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); *(unsigned int *)&tmp[0] ^= signBit[0]; *(unsigned int *)&tmp[1] ^= signBit[1]; *(unsigned int *)&tmp[2] ^= signBit[2]; *(unsigned int *)&tmp[3] ^= signBit[3]; t0[0] *= tmp[0]; t0[1] *= tmp[1]; t0[2] *= tmp[2]; t0[3] *= tmp[3]; t1[0] *= tmp[0]; t1[1] *= tmp[1]; t1[2] *= tmp[2]; t1[3] *= tmp[3]; t2[0] *= tmp[0]; t2[1] *= tmp[1]; t2[2] *= tmp[2]; t2[3] *= tmp[3]; // second tangent t3[0] = d3[0] * d5[0]; t3[1] = d3[1] * d5[1]; t3[2] = d3[2] * d5[2]; t3[3] = d3[3] * d5[3]; t3[0] -= d0[0] * d8[0]; t3[1] -= d0[1] * d8[1]; t3[2] -= d0[2] * d8[2]; t3[3] -= d0[3] * d8[3]; t4[0] = d3[0] * d6[0]; t4[1] = d3[1] * d6[1]; t4[2] = d3[2] * d6[2]; t4[3] = d3[3] * d6[3]; t4[0] -= d1[0] * d8[0]; t4[1] -= d1[1] * d8[1]; t4[2] -= d1[2] * d8[2]; t4[3] -= d1[3] * d8[3]; t5[0] = d3[0] * d7[0]; t5[1] = d3[1] * d7[1]; t5[2] = d3[2] * d7[2]; t5[3] = d3[3] * d7[3]; t5[0] -= d2[0] * d8[0]; t5[1] -= d2[1] * d8[1]; t5[2] -= d2[2] * d8[2]; t5[3] -= d2[3] * d8[3]; tmp[0] = t3[0] * t3[0]; tmp[1] = t3[1] * t3[1]; tmp[2] = t3[2] * t3[2]; tmp[3] = t3[3] * t3[3]; tmp[0] += t4[0] * t4[0]; tmp[1] += t4[1] * t4[1]; tmp[2] += t4[2] * t4[2]; tmp[3] += t4[3] * t4[3]; tmp[0] += t5[0] * t5[0]; tmp[1] += t5[1] * t5[1]; tmp[2] += t5[2] * t5[2]; tmp[3] += t5[3] * t5[3]; tmp[0] = idMath::RSqrt( tmp[0] ); tmp[1] = idMath::RSqrt( tmp[1] ); tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); *(unsigned int *)&tmp[0] ^= signBit[0]; *(unsigned int *)&tmp[1] ^= signBit[1]; *(unsigned int *)&tmp[2] ^= signBit[2]; *(unsigned int *)&tmp[3] ^= signBit[3]; t3[0] *= tmp[0]; t3[1] *= tmp[1]; t3[2] *= tmp[2]; t3[3] *= tmp[3]; t4[0] *= tmp[0]; t4[1] *= tmp[1]; t4[2] *= tmp[2]; t4[3] *= tmp[3]; t5[0] *= tmp[0]; t5[1] *= tmp[1]; t5[2] *= tmp[2]; t5[3] *= tmp[3]; #endif for ( int j = 0; j < 4; j++ ) { const int v0 = indexes[i + j * 3 + 0]; const int v1 = indexes[i + j * 3 + 1]; const int v2 = indexes[i + j * 3 + 2]; a = verts + v0; b = verts + v1; c = verts + v2; planes->Normal()[0] = n0[j]; planes->Normal()[1] = n1[j]; planes->Normal()[2] = n2[j]; planes->FitThroughPoint( a->xyz ); planes++; if ( used[v0] ) { a->normal[0] += n0[j]; a->normal[1] += n1[j]; a->normal[2] += n2[j]; a->tangents[0][0] += t0[j]; a->tangents[0][1] += t1[j]; a->tangents[0][2] += t2[j]; a->tangents[1][0] += t3[j]; a->tangents[1][1] += t4[j]; a->tangents[1][2] += t5[j]; } else { a->normal[0] = n0[j]; a->normal[1] = n1[j]; a->normal[2] = n2[j]; a->tangents[0][0] = t0[j]; a->tangents[0][1] = t1[j]; a->tangents[0][2] = t2[j]; a->tangents[1][0] = t3[j]; a->tangents[1][1] = t4[j]; a->tangents[1][2] = t5[j]; used[v0] = true; } if ( used[v1] ) { b->normal[0] += n0[j]; b->normal[1] += n1[j]; b->normal[2] += n2[j]; b->tangents[0][0] += t0[j]; b->tangents[0][1] += t1[j]; b->tangents[0][2] += t2[j]; 
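			// first use of a vertex assigns, every later use accumulates; the
			// used[] flags avoid a separate pass to zero all vertices, and the
			// summed normals and tangents are renormalized later by
			// NormalizeTangents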
b->tangents[1][0] += t3[j]; b->tangents[1][1] += t4[j]; b->tangents[1][2] += t5[j]; } else { b->normal[0] = n0[j]; b->normal[1] = n1[j]; b->normal[2] = n2[j]; b->tangents[0][0] = t0[j]; b->tangents[0][1] = t1[j]; b->tangents[0][2] = t2[j]; b->tangents[1][0] = t3[j]; b->tangents[1][1] = t4[j]; b->tangents[1][2] = t5[j]; used[v1] = true; } if ( used[v2] ) { c->normal[0] += n0[j]; c->normal[1] += n1[j]; c->normal[2] += n2[j]; c->tangents[0][0] += t0[j]; c->tangents[0][1] += t1[j]; c->tangents[0][2] += t2[j]; c->tangents[1][0] += t3[j]; c->tangents[1][1] += t4[j]; c->tangents[1][2] += t5[j]; } else { c->normal[0] = n0[j]; c->normal[1] = n1[j]; c->normal[2] = n2[j]; c->tangents[0][0] = t0[j]; c->tangents[0][1] = t1[j]; c->tangents[0][2] = t2[j]; c->tangents[1][0] = t3[j]; c->tangents[1][1] = t4[j]; c->tangents[1][2] = t5[j]; used[v2] = true; } } } for ( ; i < numIndexes; i += 3 ) { idDrawVert *a, *b, *c; ALIGN16( unsigned int signBit[4] ); float d0, d1, d2, d3, d4; float d5, d6, d7, d8, d9; float n0, n1, n2; float t0, t1, t2; float t3, t4, t5; const int v0 = indexes[i + 0]; const int v1 = indexes[i + 1]; const int v2 = indexes[i + 2]; a = verts + v0; b = verts + v1; c = verts + v2; d0 = b->xyz[0] - a->xyz[0]; d1 = b->xyz[1] - a->xyz[1]; d2 = b->xyz[2] - a->xyz[2]; d3 = b->st[0] - a->st[0]; d4 = b->st[1] - a->st[1]; d5 = c->xyz[0] - a->xyz[0]; d6 = c->xyz[1] - a->xyz[1]; d7 = c->xyz[2] - a->xyz[2]; d8 = c->st[0] - a->st[0]; d9 = c->st[1] - a->st[1]; #if 1 __asm { // normal movss xmm0, d6 mulss xmm0, d2 movss xmm1, d7 mulss xmm1, d1 subss xmm0, xmm1 movss xmm1, d7 mulss xmm1, d0 movss xmm2, d5 mulss xmm2, d2 subss xmm1, xmm2 movss xmm2, d5 mulss xmm2, d1 movss xmm3, d6 mulss xmm3, d0 subss xmm2, xmm3 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, xmm3 mulss xmm4, xmm4 mulss xmm5, xmm5 addss xmm3, xmm4 addss xmm3, xmm5 #ifdef FIX_DEGENERATE_TANGENT xorps xmm4, xmm4 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 #endif #ifdef REFINE_TANGENT_SQUAREROOT rsqrtss xmm4, xmm3 mulss xmm3, xmm4 mulss xmm3, xmm4 subss xmm3, xmm6 mulss xmm4, xmm7 mulss xmm3, xmm4 #else rsqrtss xmm3, xmm3 #endif mulss xmm0, xmm3 movss n0, xmm0 mulss xmm1, xmm3 movss n1, xmm1 mulss xmm2, xmm3 movss n2, xmm2 // area sign bit movss xmm0, d3 mulss xmm0, d9 movss xmm1, d4 mulss xmm1, d8 subss xmm0, xmm1 andps xmm0, SIMD_SP_signBitMask movaps signBit, xmm0 // first tangent movss xmm0, d0 mulss xmm0, d9 movss xmm1, d4 mulss xmm1, d5 subss xmm0, xmm1 movss xmm1, d1 mulss xmm1, d9 movss xmm2, d4 mulss xmm2, d6 subss xmm1, xmm2 movss xmm2, d2 mulss xmm2, d9 movss xmm3, d4 mulss xmm3, d7 subss xmm2, xmm3 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, xmm3 mulss xmm4, xmm4 mulss xmm5, xmm5 addss xmm3, xmm4 addss xmm3, xmm5 #ifdef FIX_DEGENERATE_TANGENT xorps xmm4, xmm4 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 #endif #ifdef REFINE_TANGENT_SQUAREROOT rsqrtss xmm4, xmm3 mulss xmm3, xmm4 mulss xmm3, xmm4 subss xmm3, xmm6 mulss xmm4, xmm7 mulss xmm3, xmm4 #else rsqrtss xmm3, xmm3 #endif xorps xmm3, signBit mulss xmm0, xmm3 movss t0, xmm0 mulss xmm1, xmm3 movss t1, xmm1 mulss xmm2, xmm3 movss t2, xmm2 // second tangent movss xmm0, d3 mulss xmm0, d5 movss xmm1, d0 mulss xmm1, d8 subss xmm0, xmm1 movss xmm1, d3 mulss xmm1, d6 movss xmm2, d1 mulss 
xmm2, d8 subss xmm1, xmm2 movss xmm2, d3 mulss xmm2, d7 movss xmm3, d2 mulss xmm3, d8 subss xmm2, xmm3 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, xmm3 mulss xmm4, xmm4 mulss xmm5, xmm5 addss xmm3, xmm4 addss xmm3, xmm5 #ifdef FIX_DEGENERATE_TANGENT xorps xmm4, xmm4 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 #endif #ifdef REFINE_TANGENT_SQUAREROOT rsqrtss xmm4, xmm3 mulss xmm3, xmm4 mulss xmm3, xmm4 subss xmm3, xmm6 mulss xmm4, xmm7 mulss xmm3, xmm4 #else rsqrtss xmm3, xmm3 #endif xorps xmm3, signBit mulss xmm0, xmm3 movss t3, xmm0 mulss xmm1, xmm3 movss t4, xmm1 mulss xmm2, xmm3 movss t5, xmm2 } #else float tmp; // normal n0 = d6 * d2 - d7 * d1; n1 = d7 * d0 - d5 * d2; n2 = d5 * d1 - d6 * d0; tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 ); n0 *= tmp; n1 *= tmp; n2 *= tmp; // area sign bit tmp = d3 * d9 - d4 * d8; signBit[0] = ( *(unsigned int *)&tmp ) & ( 1 << 31 ); // first tangent t0 = d0 * d9 - d4 * d5; t1 = d1 * d9 - d4 * d6; t2 = d2 * d9 - d4 * d7; tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 ); *(unsigned int *)&tmp ^= signBit[0]; t0 *= tmp; t1 *= tmp; t2 *= tmp; // second tangent t3 = d3 * d5 - d0 * d8; t4 = d3 * d6 - d1 * d8; t5 = d3 * d7 - d2 * d8; tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 ); *(unsigned int *)&tmp ^= signBit[0]; t3 *= tmp; t4 *= tmp; t5 *= tmp; #endif planes->Normal()[0] = n0; planes->Normal()[1] = n1; planes->Normal()[2] = n2; planes->FitThroughPoint( a->xyz ); planes++; if ( used[v0] ) { a->normal[0] += n0; a->normal[1] += n1; a->normal[2] += n2; a->tangents[0][0] += t0; a->tangents[0][1] += t1; a->tangents[0][2] += t2; a->tangents[1][0] += t3; a->tangents[1][1] += t4; a->tangents[1][2] += t5; } else { a->normal[0] = n0; a->normal[1] = n1; a->normal[2] = n2; a->tangents[0][0] = t0; a->tangents[0][1] = t1; a->tangents[0][2] = t2; a->tangents[1][0] = t3; a->tangents[1][1] = t4; a->tangents[1][2] = t5; used[v0] = true; } if ( used[v1] ) { b->normal[0] += n0; b->normal[1] += n1; b->normal[2] += n2; b->tangents[0][0] += t0; b->tangents[0][1] += t1; b->tangents[0][2] += t2; b->tangents[1][0] += t3; b->tangents[1][1] += t4; b->tangents[1][2] += t5; } else { b->normal[0] = n0; b->normal[1] = n1; b->normal[2] = n2; b->tangents[0][0] = t0; b->tangents[0][1] = t1; b->tangents[0][2] = t2; b->tangents[1][0] = t3; b->tangents[1][1] = t4; b->tangents[1][2] = t5; used[v1] = true; } if ( used[v2] ) { c->normal[0] += n0; c->normal[1] += n1; c->normal[2] += n2; c->tangents[0][0] += t0; c->tangents[0][1] += t1; c->tangents[0][2] += t2; c->tangents[1][0] += t3; c->tangents[1][1] += t4; c->tangents[1][2] += t5; } else { c->normal[0] = n0; c->normal[1] = n1; c->normal[2] = n2; c->tangents[0][0] = t0; c->tangents[0][1] = t1; c->tangents[0][2] = t2; c->tangents[1][0] = t3; c->tangents[1][1] = t4; c->tangents[1][2] = t5; used[v2] = true; } } } /* ============ idSIMD_SSE::DeriveUnsmoothedTangents ============ */ #define DERIVE_UNSMOOTHED_BITANGENT void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) { int i, j; for ( i = 0; i <= numVerts - 4; i += 4 ) { ALIGN16( float s0[4] ); ALIGN16( float s1[4] ); ALIGN16( float s2[4] ); ALIGN16( float d0[4] ); ALIGN16( float d1[4] ); ALIGN16( float d2[4] ); ALIGN16( float d3[4] ); ALIGN16( float d4[4] ); ALIGN16( float d5[4] ); ALIGN16( float d6[4] ); ALIGN16( float d7[4] ); ALIGN16( float d8[4] ); ALIGN16( float 
d9[4] ); ALIGN16( float n0[4] ); ALIGN16( float n1[4] ); ALIGN16( float n2[4] ); ALIGN16( float t0[4] ); ALIGN16( float t1[4] ); ALIGN16( float t2[4] ); ALIGN16( float t3[4] ); ALIGN16( float t4[4] ); ALIGN16( float t5[4] ); for ( j = 0; j < 4; j++ ) { const idDrawVert *a, *b, *c; const dominantTri_s &dt = dominantTris[i+j]; s0[j] = dt.normalizationScale[0]; s1[j] = dt.normalizationScale[1]; s2[j] = dt.normalizationScale[2]; a = verts + i + j; b = verts + dt.v2; c = verts + dt.v3; d0[j] = b->xyz[0] - a->xyz[0]; d1[j] = b->xyz[1] - a->xyz[1]; d2[j] = b->xyz[2] - a->xyz[2]; d3[j] = b->st[0] - a->st[0]; d4[j] = b->st[1] - a->st[1]; d5[j] = c->xyz[0] - a->xyz[0]; d6[j] = c->xyz[1] - a->xyz[1]; d7[j] = c->xyz[2] - a->xyz[2]; d8[j] = c->st[0] - a->st[0]; d9[j] = c->st[1] - a->st[1]; } #if 1 __asm { movaps xmm0, d6 mulps xmm0, d2 movaps xmm1, d7 mulps xmm1, d1 movaps xmm2, d7 mulps xmm2, d0 movaps xmm3, d5 mulps xmm3, d2 movaps xmm4, d5 mulps xmm4, d1 movaps xmm5, d6 mulps xmm5, d0 subps xmm0, xmm1 subps xmm2, xmm3 movaps xmm7, s2 subps xmm4, xmm5 mulps xmm0, xmm7 movaps n0, xmm0 mulps xmm2, xmm7 movaps n1, xmm2 mulps xmm4, xmm7 movaps n2, xmm4 movaps xmm0, d0 mulps xmm0, d9 movaps xmm1, d4 mulps xmm1, d5 movaps xmm2, d1 mulps xmm2, d9 movaps xmm3, d4 mulps xmm3, d6 movaps xmm4, d2 mulps xmm4, d9 movaps xmm5, d4 mulps xmm5, d7 subps xmm0, xmm1 subps xmm2, xmm3 movaps xmm7, s0 subps xmm4, xmm5 mulps xmm0, xmm7 movaps t0, xmm0 mulps xmm2, xmm7 movaps t1, xmm2 mulps xmm4, xmm7 movaps t2, xmm4 #ifndef DERIVE_UNSMOOTHED_BITANGENT movaps xmm0, d3 mulps xmm0, d5 movaps xmm1, d0 mulps xmm1, d8 movaps xmm2, d3 mulps xmm2, d6 movaps xmm3, d1 mulps xmm3, d8 movaps xmm4, d3 mulps xmm4, d7 movaps xmm5, d2 mulps xmm5, d8 #else movaps xmm0, n2 mulps xmm0, t1 movaps xmm1, n1 mulps xmm1, t2 movaps xmm2, n0 mulps xmm2, t2 movaps xmm3, n2 mulps xmm3, t0 movaps xmm4, n1 mulps xmm4, t0 movaps xmm5, n0 mulps xmm5, t1 #endif subps xmm0, xmm1 subps xmm2, xmm3 movaps xmm7, s1 subps xmm4, xmm5 mulps xmm0, xmm7 movaps t3, xmm0 mulps xmm2, xmm7 movaps t4, xmm2 mulps xmm4, xmm7 movaps t5, xmm4 } #else n0[0] = d6[0] * d2[0]; n0[1] = d6[1] * d2[1]; n0[2] = d6[2] * d2[2]; n0[3] = d6[3] * d2[3]; n1[0] = d7[0] * d0[0]; n1[1] = d7[1] * d0[1]; n1[2] = d7[2] * d0[2]; n1[3] = d7[3] * d0[3]; n2[0] = d5[0] * d1[0]; n2[1] = d5[1] * d1[1]; n2[2] = d5[2] * d1[2]; n2[3] = d5[3] * d1[3]; n0[0] -= d7[0] * d1[0]; n0[1] -= d7[1] * d1[1]; n0[2] -= d7[2] * d1[2]; n0[3] -= d7[3] * d1[3]; n1[0] -= d5[0] * d2[0]; n1[1] -= d5[1] * d2[1]; n1[2] -= d5[2] * d2[2]; n1[3] -= d5[3] * d2[3]; n2[0] -= d6[0] * d0[0]; n2[1] -= d6[1] * d0[1]; n2[2] -= d6[2] * d0[2]; n2[3] -= d6[3] * d0[3]; n0[0] *= s2[0]; n0[1] *= s2[1]; n0[2] *= s2[2]; n0[3] *= s2[3]; n1[0] *= s2[0]; n1[1] *= s2[1]; n1[2] *= s2[2]; n1[3] *= s2[3]; n2[0] *= s2[0]; n2[1] *= s2[1]; n2[2] *= s2[2]; n2[3] *= s2[3]; t0[0] = d0[0] * d9[0]; t0[1] = d0[1] * d9[1]; t0[2] = d0[2] * d9[2]; t0[3] = d0[3] * d9[3]; t1[0] = d1[0] * d9[0]; t1[1] = d1[1] * d9[1]; t1[2] = d1[2] * d9[2]; t1[3] = d1[3] * d9[3]; t2[0] = d2[0] * d9[0]; t2[1] = d2[1] * d9[1]; t2[2] = d2[2] * d9[2]; t2[3] = d2[3] * d9[3]; t0[0] -= d4[0] * d5[0]; t0[1] -= d4[1] * d5[1]; t0[2] -= d4[2] * d5[2]; t0[3] -= d4[3] * d5[3]; t1[0] -= d4[0] * d6[0]; t1[1] -= d4[1] * d6[1]; t1[2] -= d4[2] * d6[2]; t1[3] -= d4[3] * d6[3]; t2[0] -= d4[0] * d7[0]; t2[1] -= d4[1] * d7[1]; t2[2] -= d4[2] * d7[2]; t2[3] -= d4[3] * d7[3]; t0[0] *= s0[0]; t0[1] *= s0[1]; t0[2] *= s0[2]; t0[3] *= s0[3]; t1[0] *= s0[0]; t1[1] *= s0[1]; t1[2] *= s0[2]; t1[3] *= s0[3]; 
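	// unlike DeriveTangents there is no rsqrt in this function: each vertex's
	// dominantTri_s carries precomputed normalizationScale factors (s0, s1, s2)
	// that already fold in the reciprocal lengths. With DERIVE_UNSMOOTHED_BITANGENT
	// defined (the default above), the second tangent is taken from the cross
	// product of the tangent and the normal rather than from the texture deltas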
t2[0] *= s0[0]; t2[1] *= s0[1]; t2[2] *= s0[2]; t2[3] *= s0[3]; #ifndef DERIVE_UNSMOOTHED_BITANGENT t3[0] = d3[0] * d5[0]; t3[1] = d3[1] * d5[1]; t3[2] = d3[2] * d5[2]; t3[3] = d3[3] * d5[3]; t4[0] = d3[0] * d6[0]; t4[1] = d3[1] * d6[1]; t4[2] = d3[2] * d6[2]; t4[3] = d3[3] * d6[3]; t5[0] = d3[0] * d7[0]; t5[1] = d3[1] * d7[1]; t5[2] = d3[2] * d7[2]; t5[3] = d3[3] * d7[3]; t3[0] -= d0[0] * d8[0]; t3[1] -= d0[1] * d8[1]; t3[2] -= d0[2] * d8[2]; t3[3] -= d0[3] * d8[3]; t4[0] -= d1[0] * d8[0]; t4[1] -= d1[1] * d8[1]; t4[2] -= d1[2] * d8[2]; t4[3] -= d1[3] * d8[3]; t5[0] -= d2[0] * d8[0]; t5[1] -= d2[1] * d8[1]; t5[2] -= d2[2] * d8[2]; t5[3] -= d2[3] * d8[3]; #else t3[0] = n2[0] * t1[0]; t3[1] = n2[1] * t1[1]; t3[2] = n2[2] * t1[2]; t3[3] = n2[3] * t1[3]; t4[0] = n0[0] * t2[0]; t4[1] = n0[1] * t2[1]; t4[2] = n0[2] * t2[2]; t4[3] = n0[3] * t2[3]; t5[0] = n1[0] * t0[0]; t5[1] = n1[1] * t0[1]; t5[2] = n1[2] * t0[2]; t5[3] = n1[3] * t0[3]; t3[0] -= n1[0] * t2[0]; t3[1] -= n1[1] * t2[1]; t3[2] -= n1[2] * t2[2]; t3[3] -= n1[3] * t2[3]; t4[0] -= n2[0] * t0[0]; t4[1] -= n2[1] * t0[1]; t4[2] -= n2[2] * t0[2]; t4[3] -= n2[3] * t0[3]; t5[0] -= n0[0] * t1[0]; t5[1] -= n0[1] * t1[1]; t5[2] -= n0[2] * t1[2]; t5[3] -= n0[3] * t1[3]; #endif t3[0] *= s1[0]; t3[1] *= s1[1]; t3[2] *= s1[2]; t3[3] *= s1[3]; t4[0] *= s1[0]; t4[1] *= s1[1]; t4[2] *= s1[2]; t4[3] *= s1[3]; t5[0] *= s1[0]; t5[1] *= s1[1]; t5[2] *= s1[2]; t5[3] *= s1[3]; #endif for ( j = 0; j < 4; j++ ) { idDrawVert *a; a = verts + i + j; a->normal[0] = n0[j]; a->normal[1] = n1[j]; a->normal[2] = n2[j]; a->tangents[0][0] = t0[j]; a->tangents[0][1] = t1[j]; a->tangents[0][2] = t2[j]; a->tangents[1][0] = t3[j]; a->tangents[1][1] = t4[j]; a->tangents[1][2] = t5[j]; } } for ( ; i < numVerts; i++ ) { idDrawVert *a, *b, *c; float d0, d1, d2, d3, d4; float d5, d6, d7, d8, d9; float s0, s1, s2; float n0, n1, n2; float t0, t1, t2; float t3, t4, t5; const dominantTri_s &dt = dominantTris[i]; s0 = dt.normalizationScale[0]; s1 = dt.normalizationScale[1]; s2 = dt.normalizationScale[2]; a = verts + i; b = verts + dt.v2; c = verts + dt.v3; d0 = b->xyz[0] - a->xyz[0]; d1 = b->xyz[1] - a->xyz[1]; d2 = b->xyz[2] - a->xyz[2]; d3 = b->st[0] - a->st[0]; d4 = b->st[1] - a->st[1]; d5 = c->xyz[0] - a->xyz[0]; d6 = c->xyz[1] - a->xyz[1]; d7 = c->xyz[2] - a->xyz[2]; d8 = c->st[0] - a->st[0]; d9 = c->st[1] - a->st[1]; #if 1 __asm { movss xmm0, d6 mulss xmm0, d2 movss xmm1, d7 mulss xmm1, d1 movss xmm2, d7 mulss xmm2, d0 movss xmm3, d5 mulss xmm3, d2 movss xmm4, d5 mulss xmm4, d1 movss xmm5, d6 mulss xmm5, d0 subss xmm0, xmm1 subss xmm2, xmm3 movss xmm7, s2 subss xmm4, xmm5 mulss xmm0, xmm7 movss n0, xmm0 mulss xmm2, xmm7 movss n1, xmm2 mulss xmm4, xmm7 movss n2, xmm4 movss xmm0, d0 mulss xmm0, d9 movss xmm1, d4 mulss xmm1, d5 movss xmm2, d1 mulss xmm2, d9 movss xmm3, d4 mulss xmm3, d6 movss xmm4, d2 mulss xmm4, d9 movss xmm5, d4 mulss xmm5, d7 subss xmm0, xmm1 subss xmm2, xmm3 movss xmm7, s0 subss xmm4, xmm5 mulss xmm0, xmm7 movss t0, xmm0 mulss xmm2, xmm7 movss t1, xmm2 mulss xmm4, xmm7 movss t2, xmm4 #ifndef DERIVE_UNSMOOTHED_BITANGENT movss xmm0, d3 mulss xmm0, d5 movss xmm1, d0 mulss xmm1, d8 movss xmm2, d3 mulss xmm2, d6 movss xmm3, d1 mulss xmm3, d8 movss xmm4, d3 mulss xmm4, d7 movss xmm5, d2 mulss xmm5, d8 #else movss xmm0, n2 mulss xmm0, t1 movss xmm1, n1 mulss xmm1, t2 movss xmm2, n0 mulss xmm2, t2 movss xmm3, n2 mulss xmm3, t0 movss xmm4, n1 mulss xmm4, t0 movss xmm5, n0 mulss xmm5, t1 #endif subss xmm0, xmm1 subss xmm2, xmm3 movss xmm7, s1 subss xmm4, xmm5 mulss 
xmm0, xmm7 movss t3, xmm0 mulss xmm2, xmm7 movss t4, xmm2 mulss xmm4, xmm7 movss t5, xmm4 } #else n0 = s2 * ( d6 * d2 - d7 * d1 ); n1 = s2 * ( d7 * d0 - d5 * d2 ); n2 = s2 * ( d5 * d1 - d6 * d0 ); t0 = s0 * ( d0 * d9 - d4 * d5 ); t1 = s0 * ( d1 * d9 - d4 * d6 ); t2 = s0 * ( d2 * d9 - d4 * d7 ); #ifndef DERIVE_UNSMOOTHED_BITANGENT t3 = s1 * ( d3 * d5 - d0 * d8 ); t4 = s1 * ( d3 * d6 - d1 * d8 ); t5 = s1 * ( d3 * d7 - d2 * d8 ); #else t3 = s1 * ( n2 * t1 - n1 * t2 ); t4 = s1 * ( n0 * t2 - n2 * t0 ); t5 = s1 * ( n1 * t0 - n0 * t1 ); #endif #endif a->normal[0] = n0; a->normal[1] = n1; a->normal[2] = n2; a->tangents[0][0] = t0; a->tangents[0][1] = t1; a->tangents[0][2] = t2; a->tangents[1][0] = t3; a->tangents[1][1] = t4; a->tangents[1][2] = t5; } } /* ============ idSIMD_SSE::NormalizeTangents ============ */ void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) { ALIGN16( float normal[12] ); assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET ); assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); assert( verts != NULL ); assert( numVerts >= 0 ); __asm { mov eax, numVerts test eax, eax jz done #ifdef REFINE_TANGENT_SQUAREROOT movaps xmm6, SIMD_SP_rsqrt_c0 movaps xmm7, SIMD_SP_rsqrt_c1 #endif mov esi, verts imul eax, DRAWVERT_SIZE add esi, eax neg eax add eax, DRAWVERT_SIZE*4 jle loopVert4 sub eax, DRAWVERT_SIZE*4 jl loopVert1 loopVert4: sub eax, DRAWVERT_SIZE*4 // normalize 4 idDrawVert::normal movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0] // 0, X, X, X movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0] // 0, X, 3, 4 movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8] // 5, X, X, X movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4] // 5, X, 1, 2 movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0] // 6, X, X, X movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0] // 6, X, 9, 10 movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8] // 11, X, X, X movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4] // 11, X, 7, 8 movaps xmm1, xmm0 movaps xmm5, xmm2 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9 shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1 shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7 shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, xmm3 mulps xmm4, xmm4 mulps xmm5, xmm5 addps xmm3, xmm4 addps xmm3, xmm5 #ifdef REFINE_TANGENT_SQUAREROOT rsqrtps xmm4, xmm3 mulps xmm3, xmm4 mulps xmm3, xmm4 subps xmm3, xmm6 mulps xmm4, xmm7 mulps xmm3, xmm4 #else rsqrtps xmm3, xmm3 #endif mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 // save the 4 idDrawVert::normal to project the tangents movaps [normal+ 0], xmm0 movaps [normal+16], xmm1 movaps [normal+32], xmm2 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1 movss 
[esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2 // project and normalize 4 idDrawVert::tangent[0] movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, X, X movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, 3, 4 movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8] // 5, X, X, X movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4] // 5, X, 1, 2 movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, X, X movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, 9, 10 movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8] // 11, X, X, X movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4] // 11, X, 7, 8 movaps xmm1, xmm0 movaps xmm5, xmm2 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9 shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1 shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7 shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, [normal+ 0] mulps xmm4, [normal+16] mulps xmm5, [normal+32] addps xmm3, xmm4 addps xmm3, xmm5 movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, [normal+ 0] mulps xmm4, [normal+16] mulps xmm5, [normal+32] subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, xmm3 mulps xmm4, xmm4 mulps xmm5, xmm5 addps xmm3, xmm4 addps xmm3, xmm5 #ifdef REFINE_TANGENT_SQUAREROOT rsqrtps xmm4, xmm3 mulps xmm3, xmm4 mulps xmm3, xmm4 subps xmm3, xmm6 mulps xmm4, xmm7 mulps xmm3, xmm4 #else rsqrtps xmm3, xmm3 #endif mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1 movss 
[esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2 // project and normalize 4 idDrawVert::tangent[1] movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, X, X movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, 3, 4 movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8] // 5, X, X, X movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4] // 5, X, 1, 2 movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, X, X movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, 9, 10 movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8] // 11, X, X, X movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4] // 11, X, 7, 8 movaps xmm1, xmm0 movaps xmm5, xmm2 shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9 shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11 shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1 shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7 shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, [normal+ 0] mulps xmm4, [normal+16] mulps xmm5, [normal+32] addps xmm3, xmm4 addps xmm3, xmm5 movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, [normal+ 0] mulps xmm4, [normal+16] mulps xmm5, [normal+32] subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 movaps xmm3, xmm0 movaps xmm4, xmm1 movaps xmm5, xmm2 mulps xmm3, xmm3 mulps xmm4, xmm4 mulps xmm5, xmm5 addps xmm3, xmm4 addps xmm3, xmm5 #ifdef REFINE_TANGENT_SQUAREROOT rsqrtps xmm4, xmm3 mulps xmm3, xmm4 mulps xmm3, xmm4 subps xmm3, xmm6 mulps xmm4, xmm7 mulps xmm3, xmm4 #else rsqrtps xmm3, xmm3 #endif mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2 shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 ) movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2 add eax, DRAWVERT_SIZE*8 jle loopVert4 sub eax, DRAWVERT_SIZE*4 jge done loopVert1: // normalize one idDrawVert::normal movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, xmm3 mulss xmm4, xmm4 mulss xmm5, xmm5 addss xmm3, xmm4 addss xmm3, xmm5 #ifdef REFINE_TANGENT_SQUAREROOT rsqrtss xmm4, xmm3 mulss xmm3, xmm4 mulss xmm3, xmm4 subss xmm3, xmm6 mulss xmm4, xmm7 mulss xmm3, xmm4 #else rsqrtss xmm3, xmm3 #endif mulss xmm0, xmm3 mulss xmm1, xmm3 mulss 
xmm2, xmm3 movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2 // project and normalize one idDrawVert::tangent[0] movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8] movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] addss xmm3, xmm4 addss xmm3, xmm5 movss xmm4, xmm3 movss xmm5, xmm3 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] subss xmm0, xmm3 subss xmm1, xmm4 subss xmm2, xmm5 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, xmm3 mulss xmm4, xmm4 mulss xmm5, xmm5 addss xmm3, xmm4 addss xmm3, xmm5 #ifdef REFINE_TANGENT_SQUAREROOT rsqrtss xmm4, xmm3 mulss xmm3, xmm4 mulss xmm3, xmm4 subss xmm3, xmm6 mulss xmm4, xmm7 mulss xmm3, xmm4 #else rsqrtss xmm3, xmm3 #endif mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2 // project and normalize one idDrawVert::tangent[1] movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8] movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] addss xmm3, xmm4 addss xmm3, xmm5 movss xmm4, xmm3 movss xmm5, xmm3 mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0] mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4] mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8] subss xmm0, xmm3 subss xmm1, xmm4 subss xmm2, xmm5 movss xmm3, xmm0 movss xmm4, xmm1 movss xmm5, xmm2 mulss xmm3, xmm3 mulss xmm4, xmm4 mulss xmm5, xmm5 addss xmm3, xmm4 addss xmm3, xmm5 #ifdef REFINE_TANGENT_SQUAREROOT rsqrtss xmm4, xmm3 mulss xmm3, xmm4 mulss xmm3, xmm4 subss xmm3, xmm6 mulss xmm4, xmm7 mulss xmm3, xmm4 #else rsqrtss xmm3, xmm3 #endif mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0 movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1 movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2 add eax, DRAWVERT_SIZE jl loopVert1 done: } } /* ============ idSIMD_SSE::CreateTextureSpaceLightVectors ============ */ void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) { assert( sizeof( idDrawVert ) == DRAWVERT_SIZE ); assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET ); assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET ); assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET ); assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET ); bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) ); memset( used, 0, numVerts * sizeof( used[0] ) ); for ( int i = numIndexes - 1; i >= 0; i-- ) { used[indexes[i]] = true; } #if 0 __asm { mov eax, numVerts mov esi, used add esi, eax mov edi, verts sub edi, DRAWVERT_SIZE neg eax dec eax mov ecx, lightOrigin movss xmm7, [ecx+0] movhps xmm7, [ecx+4] mov ecx, lightVectors sub ecx, 3*4 loopVert: inc eax jge done add edi, DRAWVERT_SIZE 
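		// note: this __asm version is compiled out by the '#if 0' above; the
		// scalar '#elif 1' path below is what actually builds. Per used vertex
		// it rotates the light direction L = lightOrigin - xyz into the vertex
		// frame: result = ( dot(L, tangents[0]), dot(L, tangents[1]), dot(L, normal) )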
		add ecx, 3*4
		cmp byte ptr [esi+eax], 0
		je loopVert
		movaps xmm0, xmm7
		movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
		subps xmm0, xmm1

		// 0,  X,  1,  2
		// 3,  X,  4,  5
		// 6,  X,  7,  8

		movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
		movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
		mulps xmm2, xmm0
		movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
		movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
		mulps xmm3, xmm0
		movaps xmm5, xmm2		// xmm5 = 0, X, 1, 2
		unpcklps xmm5, xmm3		// xmm5 = 0, 3, X, X
		unpckhps xmm2, xmm3		// xmm2 = 1, 4, 2, 5
		movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
		movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
		mulps xmm4, xmm0
		movlhps xmm5, xmm4		// xmm5 = 0, 3, 6, X
		movhlps xmm4, xmm2		// xmm4 = 2, 5, 7, 8
		shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 )	// xmm2 = 2, 5, 8, 7
		addps xmm5, xmm4
		addps xmm5, xmm2
		movlps [ecx+0], xmm5
		shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		movss [ecx+8], xmm5
		jmp loopVert
	done:
	}

#elif 1

	for ( int i = 0; i < numVerts; i++ ) {
		if ( !used[i] ) {
			continue;
		}

		const idDrawVert *v = &verts[i];

		idVec3 lightDir;
		lightDir[0] = lightOrigin[0] - v->xyz[0];
		lightDir[1] = lightOrigin[1] - v->xyz[1];
		lightDir[2] = lightOrigin[2] - v->xyz[2];

		lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
		lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
		lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
	}

#elif 1

	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	idVec3 localLightOrigin = lightOrigin;

	__asm {
		xor ecx, ecx
		mov eax, numVerts
		mov esi, used
		add esi, eax
		mov edi, verts
		sub edi, DRAWVERT_SIZE
		neg eax
		dec eax
	loopVert4:
		inc eax
		jge done4
		add edi, DRAWVERT_SIZE
		cmp byte ptr [esi+eax], 0
		je loopVert4
		mov usedVertNums[ecx*4], eax
		inc ecx
		cmp ecx, 4
		movss xmm0, localLightOrigin[0]
		movss xmm1, localLightOrigin[4]
		movss xmm2, localLightOrigin[8]
		subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
		subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
		subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
		movss lightDir0[ecx*4-4], xmm0
		movss lightDir1[ecx*4-4], xmm1
		movss lightDir2[ecx*4-4], xmm2
		movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
		movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
		movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
		movss normal0[ecx*4-4], xmm3
		movss normal1[ecx*4-4], xmm4
		movss normal2[ecx*4-4], xmm5
		movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
		movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
		movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
		movss tangent0[ecx*4-4], xmm0
		movss tangent1[ecx*4-4], xmm1
		movss tangent2[ecx*4-4], xmm2
		movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
		movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
		movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
		movss tangent3[ecx*4-4], xmm3
		movss tangent4[ecx*4-4], xmm4
		movss tangent5[ecx*4-4], xmm5
		jl loopVert4

		movaps xmm0, lightDir0
		movaps xmm1, lightDir1
		movaps xmm2, lightDir2
		movaps xmm3, tangent0
		mulps xmm3, xmm0
		movaps xmm4, tangent1
		mulps xmm4, xmm1
		movaps xmm5, tangent2
		mulps xmm5, xmm2
		addps xmm3, xmm4
		addps xmm5, xmm3
		movaps xmm3, tangent3
		mulps xmm3, xmm0
		movaps xmm4, tangent4
		mulps xmm4, xmm1
		movaps xmm6, tangent5
		mulps xmm6, xmm2
		addps xmm3, xmm4
		addps xmm6, xmm3
		mulps xmm0, normal0
		mulps xmm1, normal1
		mulps xmm2, normal2
		addps xmm0, xmm1
		addps xmm0, xmm2
		mov ecx, numVerts
		imul ecx, 12
		mov edx, usedVertNums[0]
		add ecx, lightVectors
		imul edx, 12
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[4]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		imul edx, 12
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[8]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		imul edx, 12
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[12]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		imul edx, 12
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		xor ecx, ecx
		jmp loopVert4
	done4:
		test ecx, ecx
		jz done
		xor eax, eax
		mov edi, numVerts
		imul edi, 12
		add edi, lightVectors
	loopVert1:
		movss xmm0, lightDir0[eax*4]
		movss xmm1, lightDir1[eax*4]
		movss xmm2, lightDir2[eax*4]
		mov edx, usedVertNums[eax*4]
		imul edx, 12
		movss xmm3, tangent0[eax*4]
		mulss xmm3, xmm0
		movss xmm4, tangent1[eax*4]
		mulss xmm4, xmm1
		movss xmm5, tangent2[eax*4]
		mulss xmm5, xmm2
		addss xmm3, xmm4
		addss xmm5, xmm3
		movss [edi+edx+0], xmm5
		movss xmm3, tangent3[eax*4]
		mulss xmm3, xmm0
		movss xmm4, tangent4[eax*4]
		mulss xmm4, xmm1
		movss xmm6, tangent5[eax*4]
		mulss xmm6, xmm2
		addss xmm3, xmm4
		addss xmm6, xmm3
		movss [edi+edx+4], xmm6
		mulss xmm0, normal0[eax*4]
		mulss xmm1, normal1[eax*4]
		mulss xmm2, normal2[eax*4]
		addss xmm0, xmm1
		addss xmm0, xmm2
		movss [edi+edx+8], xmm0
		inc eax
		dec ecx
		jg loopVert1
	done:
	}

#else

	// note: this reference branch uses the same SoA scratch arrays as the
	// SIMD path above, so they have to be declared here as well
	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	ALIGN16( float lightVectors0[4] );
	ALIGN16( float lightVectors1[4] );
	ALIGN16( float lightVectors2[4] );

	int numUsedVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( !used[i] ) {
			continue;
		}

		const idDrawVert *v = &verts[i];

		lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
		lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
		lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];

		normal0[numUsedVerts] = v->normal[0];
		normal1[numUsedVerts] = v->normal[1];
		normal2[numUsedVerts] = v->normal[2];

		tangent0[numUsedVerts] = v->tangents[0][0];
		tangent1[numUsedVerts] = v->tangents[0][1];
		tangent2[numUsedVerts] = v->tangents[0][2];

		tangent3[numUsedVerts] = v->tangents[1][0];
		tangent4[numUsedVerts] = v->tangents[1][1];
		tangent5[numUsedVerts] = v->tangents[1][2];

		usedVertNums[numUsedVerts++] = i;
		if ( numUsedVerts < 4 ) {
			continue;
		}

		lightVectors0[0] = lightDir0[0] * tangent0[0];
		lightVectors0[1] = lightDir0[1] * tangent0[1];
		lightVectors0[2] = lightDir0[2] * tangent0[2];
		lightVectors0[3] = lightDir0[3] * tangent0[3];

		lightVectors0[0] += lightDir1[0] * tangent1[0];
		lightVectors0[1] += lightDir1[1] * tangent1[1];
		lightVectors0[2] += lightDir1[2] * tangent1[2];
		lightVectors0[3] += lightDir1[3] * tangent1[3];

		lightVectors0[0] += lightDir2[0] * tangent2[0];
		lightVectors0[1] += lightDir2[1] * tangent2[1];
		lightVectors0[2] += lightDir2[2] * tangent2[2];
		lightVectors0[3] += lightDir2[3] * tangent2[3];

		lightVectors1[0] = lightDir0[0] * tangent3[0];
		lightVectors1[1] = lightDir0[1] * tangent3[1];
		lightVectors1[2] = lightDir0[2] * tangent3[2];
		lightVectors1[3] = lightDir0[3] * tangent3[3];

		lightVectors1[0] += lightDir1[0] * tangent4[0];
		lightVectors1[1] += lightDir1[1] * tangent4[1];
		lightVectors1[2] += lightDir1[2] * tangent4[2];
		lightVectors1[3] += lightDir1[3] * tangent4[3];

		lightVectors1[0] += lightDir2[0] * tangent5[0];
		lightVectors1[1] += lightDir2[1] * tangent5[1];
		lightVectors1[2] += lightDir2[2] * tangent5[2];
		lightVectors1[3] += lightDir2[3] * tangent5[3];

		lightVectors2[0] = lightDir0[0] * normal0[0];
		lightVectors2[1] = lightDir0[1] * normal0[1];
		lightVectors2[2] = lightDir0[2] * normal0[2];
		lightVectors2[3] = lightDir0[3] * normal0[3];

		lightVectors2[0] += lightDir1[0] * normal1[0];
		lightVectors2[1] += lightDir1[1] * normal1[1];
		lightVectors2[2] += lightDir1[2] * normal1[2];
		lightVectors2[3] += lightDir1[3] * normal1[3];

		lightVectors2[0] += lightDir2[0] * normal2[0];
		lightVectors2[1] += lightDir2[1] * normal2[1];
		lightVectors2[2] += lightDir2[2] * normal2[2];
		lightVectors2[3] += lightDir2[3] * normal2[3];

		for ( int j = 0; j < 4; j++ ) {
			int n = usedVertNums[j];
			lightVectors[n][0] = lightVectors0[j];
			lightVectors[n][1] = lightVectors1[j];
			lightVectors[n][2] = lightVectors2[j];
		}

		numUsedVerts = 0;
	}

	for ( int i = 0; i < numUsedVerts; i++ ) {
		lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
		lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
		lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];

		int n = usedVertNums[i];
		lightVectors[n][0] = lightVectors0[i];
		lightVectors[n][1] = lightVectors1[i];
		lightVectors[n][2] = lightVectors2[i];
	}

#endif
}
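// The active "#elif 1" path above batches used vertices four at a time: they
// are gathered into the aligned SoA arrays (lightDir*, normal*, tangent*),
// four tangent-space light vectors are computed at once with packed SSE, the
// results are scattered back through usedVertNums, and loopVert1 finishes the
// one to three leftover vertices with scalar SSE instructions.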
/*
============
idSIMD_SSE::CreateSpecularTextureCoords
============
*/
void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
	assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
	assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );

	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
	memset( used, 0, numVerts * sizeof( used[0] ) );
	for ( int i = numIndexes - 1; i >= 0; i-- ) {
		used[indexes[i]] = true;
	}

#if 0

	__asm {
		mov eax, numVerts
		mov esi, used
		add esi, eax
		mov edi, verts
		sub edi, DRAWVERT_SIZE
		neg eax
		dec eax
		mov ecx, viewOrigin
		movss xmm6, [ecx+0]
		movhps xmm6, [ecx+4]
		mov ecx, lightOrigin
		movss xmm7, [ecx+0]
		movhps xmm7, [ecx+4]
		mov ecx, texCoords
		sub ecx, 4*4
	loopVert:
		inc eax
		jge done
		add edi, DRAWVERT_SIZE
		add ecx, 4*4
		cmp byte ptr [esi+eax], 0
		je loopVert
		movaps xmm0, xmm7
		movaps xmm1, xmm6
		movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
		subps xmm0, xmm2
		subps xmm1, xmm2
		movaps xmm3, xmm0
		movaps xmm4, xmm1
		mulps xmm3, xmm3
		mulps xmm4, xmm4

		// 0,  X,  1,  2
		// 3,  X,  4,  5

		movaps xmm5, xmm3		// xmm5 = 0, X, 1, 2
		unpcklps xmm5, xmm4		// xmm5 = 0, 3, X, X
		unpckhps xmm3, xmm4		// xmm3 = 1, 4, 2, 5
		movhlps xmm4, xmm3		// xmm4 = 2, 5, 4, 5
		addps xmm5, xmm3
		addps xmm5, xmm4
		shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
		rsqrtps xmm5, xmm5
		movaps xmm4, xmm5
		shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
		mulps xmm0, xmm4
		mulps xmm1, xmm5
		addps xmm0, xmm1
		movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
		movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
		mulps xmm2, xmm0
		movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
		movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
		mulps xmm3, xmm0
		movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
		movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
		mulps xmm4, xmm0
		movaps xmm5, xmm2		// xmm5 = 0, X, 1, 2
		unpcklps xmm5, xmm3		// xmm5 = 0, 3, X, X
		unpckhps xmm2, xmm3		// xmm2 = 1, 4, 2, 5
		movlhps xmm5, xmm4		// xmm5 = 0, 3, 6, X
		movhlps xmm4, xmm2		// xmm4 = 2, 5, 7, 8
		shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 )	// xmm2 = 2, 5, 8, 7
		movaps xmm3, SIMD_SP_one
		addps xmm5, xmm4
		addps xmm5, xmm2
		movaps [ecx+0], xmm5
		movss [ecx+12], xmm3
		jmp loopVert
	done:
	}

#elif 0

	for ( int i = 0; i < numVerts; i++ ) {
		if ( !used[i] ) {
			continue;
		}

		const idDrawVert *v = &verts[i];

		idVec3 lightDir = lightOrigin - v->xyz;
		idVec3 viewDir = viewOrigin - v->xyz;

		float ilength;

		ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
		lightDir[0] *= ilength;
		lightDir[1] *= ilength;
		lightDir[2] *= ilength;

		ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
		viewDir[0] *= ilength;
		viewDir[1] *= ilength;
		viewDir[2] *= ilength;

		lightDir += viewDir;

		texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
		texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
		texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
		texCoords[i][3] = 1.0f;
	}

#elif 1

	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float viewDir0[4] );
	ALIGN16( float viewDir1[4] );
	ALIGN16( float viewDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	idVec3 localLightOrigin = lightOrigin;
	idVec3 localViewOrigin = viewOrigin;

	__asm {
		xor ecx, ecx
		mov eax, numVerts
		mov esi, used
		add esi, eax
		mov edi, verts
		sub edi, DRAWVERT_SIZE
		neg eax
		dec eax
	loopVert4:
		inc eax
		jge done4
		add edi, DRAWVERT_SIZE
		cmp byte ptr [esi+eax], 0
		je loopVert4
		mov usedVertNums[ecx*4], eax
		inc ecx
		cmp ecx, 4
		movss xmm3, localLightOrigin[0]
		movss xmm4, localLightOrigin[4]
		movss xmm5, localLightOrigin[8]
		subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
		subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
		subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
		movss lightDir0[ecx*4-4], xmm3
		movss lightDir1[ecx*4-4], xmm4
		movss lightDir2[ecx*4-4], xmm5
		movss xmm0, localViewOrigin[0]
		movss xmm1, localViewOrigin[4]
		movss xmm2, localViewOrigin[8]
		subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
		subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
		subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
		movss viewDir0[ecx*4-4], xmm0
		movss viewDir1[ecx*4-4], xmm1
		movss viewDir2[ecx*4-4], xmm2
		movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
		movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
		movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
		movss normal0[ecx*4-4], xmm3
		movss normal1[ecx*4-4], xmm4
		movss normal2[ecx*4-4], xmm5
		movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
		movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
		movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
		movss tangent0[ecx*4-4], xmm0
		movss tangent1[ecx*4-4], xmm1
		movss tangent2[ecx*4-4], xmm2
		movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
		movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
		movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
		movss tangent3[ecx*4-4], xmm3
		movss tangent4[ecx*4-4], xmm4
		movss tangent5[ecx*4-4], xmm5
		jl loopVert4

		movaps xmm6, lightDir0
		movaps xmm0, xmm6
		mulps xmm6, xmm6
		movaps xmm7, lightDir1
		movaps xmm1, xmm7
		mulps xmm7, xmm7
		addps xmm6, xmm7
		movaps xmm5, lightDir2
		movaps xmm2, xmm5
		mulps xmm5, xmm5
		addps xmm6, xmm5
		rsqrtps xmm6, xmm6
		mulps xmm0, xmm6
		mulps xmm1, xmm6
		mulps xmm2, xmm6
		movaps xmm3, viewDir0
		movaps xmm7, xmm3
		mulps xmm7, xmm7
		movaps xmm4, viewDir1
		movaps xmm6, xmm4
		mulps xmm6, xmm6
		addps xmm7, xmm6
		movaps xmm5, viewDir2
		movaps xmm6, xmm5
		mulps xmm6, xmm6
		addps xmm7, xmm6
		rsqrtps xmm7, xmm7
		mulps xmm3, xmm7
		addps xmm0, xmm3
		mulps xmm4, xmm7
		addps xmm1, xmm4
		mulps xmm5, xmm7
		addps xmm2, xmm5
		movaps xmm3, tangent0
		mulps xmm3, xmm0
		movaps xmm4, tangent1
		mulps xmm4, xmm1
		addps xmm3, xmm4
		movaps xmm5, tangent2
		mulps xmm5, xmm2
		addps xmm5, xmm3
		movaps xmm3, tangent3
		mulps xmm3, xmm0
		movaps xmm4, tangent4
		mulps xmm4, xmm1
		addps xmm3, xmm4
		movaps xmm6, tangent5
		mulps xmm6, xmm2
		addps xmm6, xmm3
		mulps xmm0, normal0
		mulps xmm1, normal1
		addps xmm0, xmm1
		mulps xmm2, normal2
		addps xmm0, xmm2
		mov ecx, numVerts
		shl ecx, 4
		mov edx, usedVertNums[0]
		add ecx, texCoords
		shl edx, 4
		movss xmm3, SIMD_SP_one
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		movss [ecx+edx+12], xmm3
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[4]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shl edx, 4
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		movss [ecx+edx+12], xmm3
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[8]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shl edx, 4
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		movss [ecx+edx+12], xmm3
		shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
		mov edx, usedVertNums[12]
		shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
		shl edx, 4
		shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		movss [ecx+edx+0], xmm5
		movss [ecx+edx+4], xmm6
		movss [ecx+edx+8], xmm0
		movss [ecx+edx+12], xmm3
		xor ecx, ecx
		jmp loopVert4
	done4:
		test ecx, ecx
		jz done
		xor eax, eax
		mov edi, numVerts
		shl edi, 4
		add edi, texCoords
	loopVert1:
		movss xmm6, lightDir0[eax*4]
		movss xmm0, xmm6
		mulss xmm6, xmm6
		movss xmm7, lightDir1[eax*4]
		movss xmm1, xmm7
		mulss xmm7, xmm7
		addss xmm6, xmm7
		movss xmm5, lightDir2[eax*4]
		movss xmm2, xmm5
		mulss xmm5, xmm5
		addss xmm6, xmm5
		rsqrtss xmm6, xmm6
		mulss xmm0, xmm6
		mulss xmm1, xmm6
		mulss xmm2, xmm6
		movss xmm3, viewDir0[eax*4]
		movss xmm7, xmm3
		mulss xmm7, xmm7
		movss xmm4, viewDir1[eax*4]
		movss xmm6, xmm4
		mulss xmm6, xmm6
		addss xmm7, xmm6
		movss xmm5, viewDir2[eax*4]
		movss xmm6, xmm5
		mulss xmm6, xmm6
		addss xmm7, xmm6
		rsqrtss xmm7, xmm7
		mulss xmm3, xmm7
		addss xmm0, xmm3
		mulss xmm4, xmm7
		addss xmm1, xmm4
		mulss xmm5, xmm7
		addss xmm2, xmm5
		mov edx, usedVertNums[eax*4]
		shl edx, 4
		movss xmm3, tangent0[eax*4]
		mulss xmm3, xmm0
		movss xmm4, tangent1[eax*4]
		mulss xmm4, xmm1
		addss xmm3, xmm4
		movss xmm5, tangent2[eax*4]
		mulss xmm5, xmm2
		addss xmm5, xmm3
		movss [edi+edx+0], xmm5
		movss xmm3, tangent3[eax*4]
		mulss xmm3, xmm0
		movss xmm4, tangent4[eax*4]
		mulss xmm4, xmm1
		addss xmm3, xmm4
		movss xmm6, tangent5[eax*4]
		mulss xmm6, xmm2
		addss xmm6, xmm3
		movss [edi+edx+4], xmm6
		mulss xmm0, normal0[eax*4]
		mulss xmm1, normal1[eax*4]
		addss xmm0, xmm1
		mulss xmm2, normal2[eax*4]
		addss xmm0, xmm2
		movss [edi+edx+8], xmm0
		movss xmm3, SIMD_SP_one
		movss [edi+edx+12], xmm3
		inc eax
		dec ecx
		jg loopVert1
	done:
	}

#else

	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float viewDir0[4] );
	ALIGN16( float viewDir1[4] );
	ALIGN16( float viewDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	ALIGN16( float texCoords0[4] );
	ALIGN16( float texCoords1[4] );
	ALIGN16( float texCoords2[4] );
	idVec3 localLightOrigin = lightOrigin;
	idVec3 localViewOrigin = viewOrigin;

	int numUsedVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( !used[i] ) {
			continue;
		}

		const idDrawVert *v = &verts[i];

		lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
		lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
		lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];

		viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
		viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
		viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];

		normal0[numUsedVerts] = v->normal[0];
		normal1[numUsedVerts] = v->normal[1];
		normal2[numUsedVerts] = v->normal[2];

		tangent0[numUsedVerts] = v->tangents[0][0];
		tangent1[numUsedVerts] = v->tangents[0][1];
		tangent2[numUsedVerts] = v->tangents[0][2];

		tangent3[numUsedVerts] = v->tangents[1][0];
		tangent4[numUsedVerts] = v->tangents[1][1];
		tangent5[numUsedVerts] = v->tangents[1][2];

		usedVertNums[numUsedVerts++] = i;
		if ( numUsedVerts < 4 ) {
			continue;
		}

		ALIGN16( float temp[4] );

		temp[0] = lightDir0[0] * lightDir0[0];
		temp[1] = lightDir0[1] * lightDir0[1];
		temp[2] = lightDir0[2] * lightDir0[2];
		temp[3] = lightDir0[3] * lightDir0[3];

		temp[0] += lightDir1[0] * lightDir1[0];
		temp[1] += lightDir1[1] * lightDir1[1];
		temp[2] += lightDir1[2] * lightDir1[2];
		temp[3] += lightDir1[3] * lightDir1[3];

		temp[0] += lightDir2[0] * lightDir2[0];
		temp[1] += lightDir2[1] * lightDir2[1];
		temp[2] += lightDir2[2] * lightDir2[2];
		temp[3] += lightDir2[3] * lightDir2[3];

		temp[0] = idMath::RSqrt( temp[0] );
		temp[1] = idMath::RSqrt( temp[1] );
		temp[2] = idMath::RSqrt( temp[2] );
		temp[3] = idMath::RSqrt( temp[3] );

		lightDir0[0] *= temp[0];
		lightDir0[1] *= temp[1];
		lightDir0[2] *= temp[2];
		lightDir0[3] *= temp[3];

		lightDir1[0] *= temp[0];
		lightDir1[1] *= temp[1];
		lightDir1[2] *= temp[2];
		lightDir1[3] *= temp[3];

		lightDir2[0] *= temp[0];
		lightDir2[1] *= temp[1];
		lightDir2[2] *= temp[2];
		lightDir2[3] *= temp[3];

		temp[0] = viewDir0[0] * viewDir0[0];
		temp[1] = viewDir0[1] * viewDir0[1];
		temp[2] = viewDir0[2] * viewDir0[2];
		temp[3] = viewDir0[3] * viewDir0[3];

		temp[0] += viewDir1[0] * viewDir1[0];
		temp[1] += viewDir1[1] * viewDir1[1];
		temp[2] += viewDir1[2] * viewDir1[2];
		temp[3] += viewDir1[3] * viewDir1[3];

		temp[0] += viewDir2[0] * viewDir2[0];
		temp[1] += viewDir2[1] * viewDir2[1];
		temp[2] += viewDir2[2] * viewDir2[2];
		temp[3] += viewDir2[3] * viewDir2[3];

		temp[0] = idMath::RSqrt( temp[0] );
		temp[1] = idMath::RSqrt( temp[1] );
		temp[2] = idMath::RSqrt( temp[2] );
		temp[3] = idMath::RSqrt( temp[3] );

		viewDir0[0] *= temp[0];
		viewDir0[1] *= temp[1];
		viewDir0[2] *= temp[2];
		viewDir0[3] *= temp[3];

		viewDir1[0] *= temp[0];
		viewDir1[1] *= temp[1];
		viewDir1[2] *= temp[2];
		viewDir1[3] *= temp[3];

		viewDir2[0] *= temp[0];
		viewDir2[1] *= temp[1];
		viewDir2[2] *= temp[2];
		viewDir2[3] *= temp[3];

		lightDir0[0] += viewDir0[0];
		lightDir0[1] += viewDir0[1];
		lightDir0[2] += viewDir0[2];
		lightDir0[3] += viewDir0[3];

		lightDir1[0] += viewDir1[0];
		lightDir1[1] += viewDir1[1];
		lightDir1[2] += viewDir1[2];
		lightDir1[3] += viewDir1[3];

		lightDir2[0] += viewDir2[0];
		lightDir2[1] += viewDir2[1];
		lightDir2[2] += viewDir2[2];
		lightDir2[3] += viewDir2[3];

		texCoords0[0] = lightDir0[0] * tangent0[0];
		texCoords0[1] = lightDir0[1] * tangent0[1];
		texCoords0[2] = lightDir0[2] * tangent0[2];
		texCoords0[3] = lightDir0[3] * tangent0[3];

		texCoords0[0] += lightDir1[0] * tangent1[0];
		texCoords0[1] += lightDir1[1] * tangent1[1];
		texCoords0[2] += lightDir1[2] * tangent1[2];
		texCoords0[3] += lightDir1[3] * tangent1[3];

		texCoords0[0] += lightDir2[0] * tangent2[0];
		texCoords0[1] += lightDir2[1] * tangent2[1];
		texCoords0[2] += lightDir2[2] * tangent2[2];
		texCoords0[3] += lightDir2[3] * tangent2[3];

		texCoords1[0] = lightDir0[0] * tangent3[0];
		texCoords1[1] = lightDir0[1] * tangent3[1];
		texCoords1[2] = lightDir0[2] * tangent3[2];
		texCoords1[3] = lightDir0[3] * tangent3[3];

		texCoords1[0] += lightDir1[0] * tangent4[0];
		texCoords1[1] += lightDir1[1] * tangent4[1];
		texCoords1[2] += lightDir1[2] * tangent4[2];
		texCoords1[3] += lightDir1[3] * tangent4[3];

		texCoords1[0] += lightDir2[0] * tangent5[0];
		texCoords1[1] += lightDir2[1] * tangent5[1];
		texCoords1[2] += lightDir2[2] * tangent5[2];
		texCoords1[3] += lightDir2[3] * tangent5[3];

		texCoords2[0] = lightDir0[0] * normal0[0];
		texCoords2[1] = lightDir0[1] * normal0[1];
		texCoords2[2] = lightDir0[2] * normal0[2];
		texCoords2[3] = lightDir0[3] * normal0[3];

		texCoords2[0] += lightDir1[0] * normal1[0];
		texCoords2[1] += lightDir1[1] * normal1[1];
		texCoords2[2] += lightDir1[2] * normal1[2];
		texCoords2[3] += lightDir1[3] * normal1[3];

		texCoords2[0] += lightDir2[0] * normal2[0];
		texCoords2[1] += lightDir2[1] * normal2[1];
		texCoords2[2] += lightDir2[2] * normal2[2];
		texCoords2[3] += lightDir2[3] * normal2[3];

		for ( int j = 0; j < 4; j++ ) {
			int n = usedVertNums[j];
			texCoords[n][0] = texCoords0[j];
			texCoords[n][1] = texCoords1[j];
			texCoords[n][2] = texCoords2[j];
			texCoords[n][3] = 1.0f;
		}

		numUsedVerts = 0;
	}

	for ( int i = 0; i < numUsedVerts; i++ ) {
		float temp;

		temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
		temp = idMath::RSqrt( temp );
		lightDir0[i] *= temp;
		lightDir1[i] *= temp;
		lightDir2[i] *= temp;

		temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
		temp = idMath::RSqrt( temp );
		viewDir0[i] *= temp;
		viewDir1[i] *= temp;
		viewDir2[i] *= temp;

		lightDir0[i] += viewDir0[i];
		lightDir1[i] += viewDir1[i];
		lightDir2[i] += viewDir2[i];

		texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
		texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
		texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];

		int n = usedVertNums[i];
		texCoords[n][0] = texCoords0[i];
		texCoords[n][1] = texCoords1[i];
		texCoords[n][2] = texCoords2[i];
		texCoords[n][3] = 1.0f;
	}

#endif
}
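// What CreateSpecularTextureCoords stores per used vertex: the un-normalized
// half-angle vector normalize(lightDir) + normalize(viewDir), expressed in
// tangent space via dot products with tangents[0], tangents[1] and the
// normal, with a constant 1.0 in the fourth texture coordinate.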
/*
============
idSIMD_SSE::CreateShadowCache
============
*/
int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
#if 1
	int outVerts;

	__asm {
		push ebx

		mov esi, lightOrigin
		movaps xmm5, SIMD_SP_lastOne
		movss xmm6, [esi+0]
		movhps xmm6, [esi+4]
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps xmm6, SIMD_SP_lastOne
		movaps xmm7, xmm6

		xor ebx, ebx
		xor ecx, ecx

		mov edx, vertRemap
		mov esi, verts
		mov edi, vertexCache
		mov eax, numVerts
		and eax, ~3
		jz done4
		shl eax, 2
		add edx, eax
		neg eax
	loop4:
		prefetchnta [edx+128]
		prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		cmp dword ptr [edx+eax+0], ebx
		jne skip1
		mov dword ptr [edx+eax+0], ecx
		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add ecx, 2
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		orps xmm0, xmm5
		movaps [edi+0*16], xmm0
		subps xmm0, xmm6
		movaps [edi+1*16], xmm0
		add edi, 2*16
	skip1:
		cmp dword ptr [edx+eax+4], ebx
		jne skip2
		mov dword ptr [edx+eax+4], ecx
		movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add ecx, 2
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps xmm1, xmm5
		movaps [edi+0*16], xmm1
		subps xmm1, xmm7
		movaps [edi+1*16], xmm1
		add edi, 2*16
	skip2:
		cmp dword ptr [edx+eax+8], ebx
		jne skip3
		mov dword ptr [edx+eax+8], ecx
		movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add ecx, 2
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
		orps xmm2, xmm5
		movaps [edi+0*16], xmm2
		subps xmm2, xmm6
		movaps [edi+1*16], xmm2
		add edi, 2*16
	skip3:
		cmp dword ptr [edx+eax+12], ebx
		jne skip4
		mov dword ptr [edx+eax+12], ecx
		movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		add ecx, 2
		shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		orps xmm3, xmm5
		movaps [edi+0*16], xmm3
		subps xmm3, xmm7
		movaps [edi+1*16], xmm3
		add edi, 2*16
	skip4:
		add esi, 4*DRAWVERT_SIZE
		add eax, 4*4
		jl loop4
	done4:
		mov eax, numVerts
		and eax, 3
		jz done1
		shl eax, 2
		add edx, eax
		neg eax
	loop1:
		cmp dword ptr [edx+eax+0], ebx
		jne skip0
		mov dword ptr [edx+eax+0], ecx
		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		add ecx, 2
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
		orps xmm0, xmm5
		movaps [edi+0*16], xmm0
		subps xmm0, xmm6
		movaps [edi+1*16], xmm0
		add edi, 2*16
	skip0:
		add esi, DRAWVERT_SIZE
		add eax, 4
		jl loop1
	done1:
		pop ebx
		mov outVerts, ecx
	}
	return outVerts;
#else
	int outVerts = 0;
	for ( int i = 0; i < numVerts; i++ ) {
		if ( vertRemap[i] ) {
			continue;
		}
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[outVerts+0][0] = v[0];
		vertexCache[outVerts+0][1] = v[1];
		vertexCache[outVerts+0][2] = v[2];
		vertexCache[outVerts+0][3] = 1.0f;

		// R_SetupProjection() builds the projection matrix with a slight crunch
		// for depth, which keeps this w=0 division from rasterizing right at the
		// wrap around point and causing depth fighting with the rear caps
		vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
		vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
		vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
		vertexCache[outVerts+1][3] = 0.0f;
		vertRemap[i] = outVerts;
		outVerts += 2;
	}
	return outVerts;
#endif
}
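// CreateShadowCache emits two idVec4s per used vertex: (xyz, 1) for the near
// cap and (xyz - lightOrigin, 0) for the far cap, which the w = 0 perspective
// divide pushes out to infinity away from the light.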
/*
============
idSIMD_SSE::CreateVertexProgramShadowCache
============
*/
int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
#if 1
	__asm {
		movaps xmm4, SIMD_SP_lastOne
		movaps xmm5, xmm4
		movaps xmm6, xmm4
		movaps xmm7, xmm4

		mov esi, verts
		mov edi, vertexCache
		mov eax, numVerts
		and eax, ~3
		jz done4
		shl eax, 5
		add edi, eax
		neg eax
	loop4:
		prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps [edi+eax+1*16], xmm0
		orps xmm0, xmm4
		movaps [edi+eax+0*16], xmm0

		movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps [edi+eax+3*16], xmm1
		orps xmm1, xmm5
		movaps [edi+eax+2*16], xmm1

		movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps [edi+eax+5*16], xmm2
		orps xmm2, xmm6
		movaps [edi+eax+4*16], xmm2

		movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
		movaps [edi+eax+7*16], xmm3
		orps xmm3, xmm7
		movaps [edi+eax+6*16], xmm3

		add esi, 4*DRAWVERT_SIZE
		add eax, 4*8*4
		jl loop4
	done4:
		mov eax, numVerts
		and eax, 3
		jz done1
		shl eax, 5
		add edi, eax
		neg eax
	loop1:
		movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
		movaps [edi+eax+1*16], xmm0
		orps xmm0, xmm4
		movaps [edi+eax+0*16], xmm0

		add esi, DRAWVERT_SIZE
		add eax, 8*4
		jl loop1
	done1:
	}
	return numVerts * 2;
#else
	for ( int i = 0; i < numVerts; i++ ) {
		const float *v = verts[i].xyz.ToFloatPtr();
		vertexCache[i*2+0][0] = v[0];
		vertexCache[i*2+0][1] = v[1];
		vertexCache[i*2+0][2] = v[2];
		vertexCache[i*2+0][3] = 1.0f;

		vertexCache[i*2+1][0] = v[0];
		vertexCache[i*2+1][1] = v[1];
		vertexCache[i*2+1][2] = v[2];
		vertexCache[i*2+1][3] = 0.0f;
	}
	return numVerts * 2;
#endif
}

/*
============
SSE_UpSample11kHzMonoPCMTo44kHz
============
*/
static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*4*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-2*4*4+0], xmm0
		movhps [edi-2*4*4+8], xmm0
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-1*4*4+0], xmm1
		movhps [edi-1*4*4+8], xmm1
		add eax, 2*2
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movsx ecx, word ptr [esi]
		cvtsi2ss xmm0, ecx
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi+0], xmm0
		movhps [edi+8], xmm0
	done:
	}
}

/*
============
SSE_UpSample11kHzStereoPCMTo44kHz
============
*/
static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		test eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 8*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		unpcklps xmm0, xmm1
		movlps [edi-8*4+0], xmm0
		movlps [edi-8*4+8], xmm0
		movlps [edi-4*4+0], xmm0
		movlps [edi-4*4+8], xmm0
		add eax, 2*2
		jl loop2
	done2:
	}
}

/*
============
SSE_UpSample22kHzMonoPCMTo44kHz
============
*/
static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 4*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-4*4+0], xmm0
		movhps [edi-4*4+8], xmm0
		add eax, 2*2
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movsx ecx, word ptr [esi]
		cvtsi2ss xmm0, ecx
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi], xmm0
	done:
	}
}
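// The SSE_UpSample*PCMTo44kHz helpers in this block have no C reference
// implementations, so here is a rough scalar sketch of the mono cases
// (illustrative only, not compiled, and the helper name is made up): every
// 16-bit sample becomes 44100 / kHz identical floats, i.e. 4 for 11025,
// 2 for 22050 and 1 for 44100. The stereo variants repeat interleaved L/R
// pairs instead of single samples.
#if 0
static void Sketch_UpSampleMonoPCM( float *dest, const short *src, const int numSamples, const int repeat ) {
	for ( int i = 0; i < numSamples; i++ ) {
		const float s = (float) src[i];
		for ( int j = 0; j < repeat; j++ ) {
			dest[i*repeat+j] = s;
		}
	}
}
#endif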
/*
============
SSE_UpSample22kHzStereoPCMTo44kHz
============
*/
static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		test eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 4*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movss [edi-4*4], xmm0
		movss [edi-2*4], xmm0
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		movss [edi-3*4], xmm1
		movss [edi-1*4], xmm1
		add eax, 2*2
		jl loop2
	done2:
	}
}

/*
============
SSE_UpSample44kHzMonoPCMTo44kHz
============
*/
static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
	__asm {
		mov esi, src
		mov edi, dest
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*4
		movsx ecx, word ptr [esi+eax+0]
		cvtsi2ss xmm0, ecx
		movss [edi-2*4], xmm0
		movsx edx, word ptr [esi+eax+2]
		cvtsi2ss xmm1, edx
		movss [edi-1*4], xmm1
		add eax, 2*2
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movsx ecx, word ptr [esi]
		cvtsi2ss xmm0, ecx
		movss [edi], xmm0
	done:
	}
}

/*
============
idSIMD_SSE::UpSamplePCMTo44kHz

  Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
		} else {
			SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
		} else {
			SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
		}
	} else if ( kHz == 44100 ) {
		SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
	} else {
		assert( 0 );
	}
}

// DG: at least in the 22KHz Stereo OGG case with numSamples % 4 != 0 this is broken (writes 4 floats too much which can destroy the stack, see #303),
// so let's just not use it anymore; it's MSVC+32bit-only anyway, and I doubt it gets noticeable speedups, so I don't feel like trying to understand and fix it..
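// A bounds-exact scalar fallback for the disabled `#if 0` block below would
// look roughly like this sketch (illustrative only, not compiled, helper name
// made up; numFrames is taken here as input samples per channel, which may
// not match the convention the asm uses). With the SSE version disabled, the
// generic idSIMD implementation presumably handles these calls instead.
#if 0
static void Sketch_UpSampleOGG( float *dest, const float * const *ogg, const int numFrames, const int numChannels, const int repeat ) {
	for ( int i = 0; i < numFrames; i++ ) {
		for ( int j = 0; j < repeat; j++ ) {
			for ( int c = 0; c < numChannels; c++ ) {
				// vorbis decodes to floats; the mixer works in 16-bit sample range
				dest[(i*repeat+j)*numChannels+c] = ogg[c][i] * 32768.0f;
			}
		}
	}
}
#endif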
#if 0

/*
============
SSE_UpSample11kHzMonoOGGTo44kHz
============
*/
static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 2
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*16
		movss xmm0, [esi+eax+0]
		mulss xmm0, xmm7
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-32], xmm0
		movlps [edi-24], xmm0
		movss xmm1, [esi+eax+4]
		mulss xmm1, xmm7
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi-16], xmm1
		movlps [edi- 8], xmm1
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [esi]
		mulss xmm0, xmm7
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi+0], xmm0
		movlps [edi+8], xmm0
	done:
	}
}

/*
============
SSE_UpSample11kHzStereoOGGTo44kHz
============
*/
static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov ecx, [esi+0]
		mov edx, [esi+4]
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add ecx, eax
		add edx, eax
		neg eax
		align 16
	loop2:
		add edi, 4*16
		movlps xmm0, [ecx+eax]
		movlps xmm1, [edx+eax]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi-8*8], xmm0
		movlps [edi-7*8], xmm0
		movlps [edi-6*8], xmm0
		movlps [edi-5*8], xmm0
		movhps [edi-4*8], xmm0
		movhps [edi-3*8], xmm0
		movhps [edi-2*8], xmm0
		movhps [edi-1*8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [ecx]
		movss xmm1, [edx]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi+0*8], xmm0
		movlps [edi+1*8], xmm0
		movlps [edi+2*8], xmm0
		movlps [edi+3*8], xmm0
	done:
	}
}

/*
============
SSE_UpSample22kHzMonoOGGTo44kHz
============
*/
static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 2
		add esi, eax
		neg eax
		align 16
	loop2:
		add edi, 2*8
		movss xmm0, [esi+eax+0]
		movss xmm1, [esi+eax+4]
		shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm0, xmm7
		movlps [edi-16], xmm0
		movhps [edi- 8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [esi]
		mulss xmm0, xmm7
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		movlps [edi+0], xmm0
	done:
	}
}

/*
============
SSE_UpSample22kHzStereoOGGTo44kHz
============
*/
static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov ecx, [esi+0]
		mov edx, [esi+4]
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add ecx, eax
		add edx, eax
		neg eax
		align 16
	loop2:
		add edi, 2*16
		movlps xmm0, [ecx+eax]
		movlps xmm1, [edx+eax]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi-4*8], xmm0
		movlps [edi-3*8], xmm0
		movhps [edi-2*8], xmm0
		movhps [edi-1*8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [ecx]
		movss xmm1, [edx]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi+0*8], xmm0
		movlps [edi+1*8], xmm0
	done:
	}
}

/*
============
SSE_UpSample44kHzMonoOGGTo44kHz
============
*/
static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
	float constant = 32768.0f;
	KFLOAT_CA( mul, dest, src, constant, numSamples )
}

/*
============
SSE_UpSample44kHzStereoOGGTo44kHz
============
*/
static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
	float constant = 32768.0f;
	__asm {
		mov esi, src
		mov ecx, [esi+0]
		mov edx, [esi+4]
		mov edi, dest
		movss xmm7, constant
		shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		mov eax, numSamples
		and eax, ~1
		jz done2
		shl eax, 1
		add ecx, eax
		add edx, eax
		neg eax
		align 16
	loop2:
		add edi, 16
		movlps xmm0, [ecx+eax]
		movlps xmm1, [edx+eax]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi-2*8], xmm0
		movhps [edi-1*8], xmm0
		add eax, 2*4
		jl loop2
	done2:
		mov eax, numSamples
		and eax, 1
		jz done
		movss xmm0, [ecx]
		movss xmm1, [edx]
		unpcklps xmm0, xmm1
		mulps xmm0, xmm7
		movlps [edi+0*8], xmm0
	done:
	}
}

/*
============
idSIMD_SSE::UpSampleOGGTo44kHz

  Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else if ( kHz == 44100 ) {
		if ( numChannels == 1 ) {
			SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
		} else {
			SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
		}
	} else {
		assert( 0 );
	}
}

#endif // 0 (DG: commenting out all the OGG-related SSE code)

/*
============
idSIMD_SSE::MixSoundTwoSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 2
		add esi, eax
		neg eax
		mov ecx, lastV
		movlps xmm6, [ecx]
		xorps xmm7, xmm7
		movhps xmm7, incs
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps xmm6, xmm7
		shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps xmm7, xmm7
	loop16:
		add edi, 4*4*4
		movaps xmm0, [esi+eax+0*4*4]
		movaps xmm1, xmm0
		shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
		mulps xmm0, xmm6
		addps xmm0, [edi-4*4*4]
		addps xmm6, xmm7
		movaps [edi-4*4*4], xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
		mulps xmm1, xmm6
		addps xmm1, [edi-3*4*4]
		addps xmm6, xmm7
		movaps [edi-3*4*4], xmm1
		movaps xmm2, [esi+eax+1*4*4]
		movaps xmm3, xmm2
		shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
		mulps xmm2, xmm6
		addps xmm2, [edi-2*4*4]
		addps xmm6, xmm7
		movaps [edi-2*4*4], xmm2
		shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
		mulps xmm3, xmm6
		addps xmm3, [edi-1*4*4]
		addps xmm6, xmm7
		movaps [edi-1*4*4], xmm3
		add eax, 2*4*4
		jl loop16
	}
#else
	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i+0] * sL0;
		mixBuffer[i*2+1] += samples[i+0] * sR0;
		mixBuffer[i*2+2] += samples[i+1] * sL1;
		mixBuffer[i*2+3] += samples[i+1] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}
#endif
}

/*
============
idSIMD_SSE::MixSoundTwoSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
	ALIGN16( float incs[2] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 3
		add esi, eax
		neg eax
		mov ecx, lastV
		movlps xmm6, [ecx]
		xorps xmm7, xmm7
		movhps xmm7, incs
		shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
		addps xmm6, xmm7
		shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
		addps xmm7, xmm7
	loop16:
		add edi, 4*4*4
		movaps xmm0, [esi+eax+0*4*4]
		mulps xmm0, xmm6
		addps xmm0, [edi-4*4*4]
		addps xmm6, xmm7
		movaps [edi-4*4*4], xmm0
		movaps xmm2, [esi+eax+1*4*4]
		mulps xmm2, xmm6
		addps xmm2, [edi-3*4*4]
		addps xmm6, xmm7
		movaps [edi-3*4*4], xmm2
		movaps xmm3, [esi+eax+2*4*4]
		mulps xmm3, xmm6
		addps xmm3, [edi-2*4*4]
		addps xmm6, xmm7
		movaps [edi-2*4*4], xmm3
		movaps xmm4, [esi+eax+3*4*4]
		mulps xmm4, xmm6
		addps xmm4, [edi-1*4*4]
		addps xmm6, xmm7
		movaps [edi-1*4*4], xmm4
		add eax, 4*4*4
		jl loop16
	}
#else
	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i*2+0] * sL0;
		mixBuffer[i*2+1] += samples[i*2+1] * sR0;
		mixBuffer[i*2+2] += samples[i*2+2] * sL1;
		mixBuffer[i*2+3] += samples[i*2+3] * sR1;
		sL0 += incL;
		sR0 += incR;
		sL1 += incL;
		sR1 += incR;
	}
#endif
}

/*
============
idSIMD_SSE::MixSoundSixSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 2
		add esi, eax
		neg eax
		mov ecx, lastV
		movlps xmm2, [ecx+ 0]
		movhps xmm2, [ecx+ 8]
		movlps xmm3, [ecx+16]
		movaps xmm4, xmm2
		shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
		xorps xmm5, xmm5
		movhps xmm5, incs
		movlps xmm7, incs+8
		movhps xmm7, incs+16
		addps xmm3, xmm5
		addps xmm4, xmm7
		shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps xmm6, xmm7
		shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps xmm5, xmm5
		addps xmm6, xmm6
		addps xmm7, xmm7
	loop24:
		add edi, 6*16
		movaps xmm0, [esi+eax]
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps xmm1, xmm2
		addps xmm1, [edi-6*16]
		addps xmm2, xmm5
		movaps [edi-6*16], xmm1
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
		mulps xmm1, xmm3
		addps xmm1, [edi-5*16]
		addps xmm3, xmm6
		movaps [edi-5*16], xmm1
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
		mulps xmm1, xmm4
		addps xmm1, [edi-4*16]
		addps xmm4, xmm7
		movaps [edi-4*16], xmm1
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
		mulps xmm1, xmm2
		addps xmm1, [edi-3*16]
		addps xmm2, xmm5
		movaps [edi-3*16], xmm1
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
		mulps xmm1, xmm3
		addps xmm1, [edi-2*16]
		addps xmm3, xmm6
		movaps [edi-2*16], xmm1
		shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
		mulps xmm0, xmm4
		addps xmm0, [edi-1*16]
		addps xmm4, xmm7
		movaps [edi-1*16], xmm0
		add eax, 4*4
		jl loop24
	}
#else
	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];

	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i+0] * sL1;
		mixBuffer[i*6+ 2] += samples[i+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i+0] * sL3;
		mixBuffer[i*6+ 4] += samples[i+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i+0] * sL5;
		mixBuffer[i*6+ 6] += samples[i+1] * sL6;
		mixBuffer[i*6+ 7] += samples[i+1] * sL7;
		mixBuffer[i*6+ 8] += samples[i+1] * sL8;
		mixBuffer[i*6+ 9] += samples[i+1] * sL9;
		mixBuffer[i*6+10] += samples[i+1] * sL10;
		mixBuffer[i*6+11] += samples[i+1] * sL11;
		sL0 += incL0;
		sL1 += incL1;
		sL2 += incL2;
		sL3 += incL3;
		sL4 += incL4;
		sL5 += incL5;
		sL6 += incL0;
		sL7 += incL1;
		sL8 += incL2;
		sL9 += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}
#endif
}

/*
============
idSIMD_SSE::MixSoundSixSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
	ALIGN16( float incs[6] );

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov eax, MIXBUFFER_SAMPLES
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 3
		add esi, eax
		neg eax
		mov ecx, lastV
		movlps xmm2, [ecx+ 0]
		movhps xmm2, [ecx+ 8]
		movlps xmm3, [ecx+16]
		movaps xmm4, xmm2
		shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
		xorps xmm5, xmm5
		movhps xmm5, incs
		movlps xmm7, incs+ 8
		movhps xmm7, incs+16
		addps xmm3, xmm5
		addps xmm4, xmm7
		shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
		movaps xmm6, xmm7
		shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
		addps xmm5, xmm5
		addps xmm6, xmm6
		addps xmm7, xmm7
	loop12:
		add edi, 3*16
		movaps xmm0, [esi+eax+0]
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
		mulps xmm1, xmm2
		addps xmm1, [edi-3*16]
		addps xmm2, xmm5
		movaps [edi-3*16], xmm1
		movaps xmm1, xmm0
		shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
		mulps xmm1, xmm3
		addps xmm1, [edi-2*16]
		addps xmm3, xmm6
		movaps [edi-2*16], xmm1
		add eax, 4*4
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
		mulps xmm0, xmm4
		addps xmm0, [edi-1*16]
		addps xmm4, xmm7
		movaps [edi-1*16], xmm0
		jl loop12
		emms
	}
#else
	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );
	assert( SPEAKER_RIGHT == 1 );
	assert( SPEAKER_BACKRIGHT == 5 );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];

	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
		mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
		mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
		mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
		mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
		mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
		mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
		mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
		mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
		sL0 += incL0;
		sL1 += incL1;
		sL2 += incL2;
		sL3 += incL3;
		sL4 += incL4;
		sL5 += incL5;
		sL6 += incL0;
		sL7 += incL1;
		sL8 += incL2;
		sL9 += incL3;
		sL10 += incL4;
		sL11 += incL5;
	}
#endif
}

/*
============
idSIMD_SSE::MixedSoundToSamples
============
*/
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
#if 1
	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	__asm {
		mov eax, numSamples
		mov edi, mixBuffer
		mov esi, samples
		shl eax, 2
		add edi, eax
		neg eax
	loop16:
		movaps xmm0, [edi+eax+0*16]
		movaps xmm2, [edi+eax+1*16]
		movaps xmm4, [edi+eax+2*16]
		movaps xmm6, [edi+eax+3*16]
		add esi, 4*4*2
		movhlps xmm1, xmm0
		movhlps xmm3, xmm2
		movhlps xmm5, xmm4
		movhlps xmm7, xmm6
		prefetchnta [edi+eax+64]
		cvtps2pi mm0, xmm0
		cvtps2pi mm2, xmm2
		cvtps2pi mm4, xmm4
		cvtps2pi mm6, xmm6
		prefetchnta [edi+eax+128]
		cvtps2pi mm1, xmm1
		cvtps2pi mm3, xmm3
		cvtps2pi mm5, xmm5
		cvtps2pi mm7, xmm7
		add eax, 4*16
		// packssdw saturates each 32 bit value to [-32768, 32767], which is
		// the same clamping the C reference implementation below does explicitly
		packssdw mm0, mm1
		packssdw mm2, mm3
		packssdw mm4, mm5
		packssdw mm6, mm7
		movq [esi-4*4*2], mm0
		movq [esi-3*4*2], mm2
		movq [esi-2*4*2], mm4
		movq [esi-1*4*2], mm6
		jl loop16
		emms
	}
#else
	for ( int i = 0; i < numSamples; i++ ) {
		if ( mixBuffer[i] <= -32768.0f ) {
			samples[i] = -32768;
		} else if ( mixBuffer[i] >= 32767.0f ) {
			samples[i] = 32767;
		} else {
			samples[i] = (short) mixBuffer[i];
		}
	}
#endif
}

#endif /* _MSC_VER */