// Copyright (C) 2007 Id Software, Inc. // #include "../precompiled.h" #pragma hdrstop #include "Simd_Generic.h" #include "Simd_MMX.h" #include "Simd_SSE.h" //=============================================================== // M // SSE implementation of idSIMDProcessor MrE // E //=============================================================== #ifdef ID_WIN_X86_ASM #include "Simd_InstructionMacros.h" // with alignment #define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST ) #define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\ __asm mov ecx,DST \ __asm shr ecx,2 \ __asm mov ebx,COUNT \ __asm neg ecx \ __asm mov edx,SRC0 \ __asm and ecx,3 \ __asm mov esi,SRC1 \ __asm sub ebx,ecx \ __asm jge noUnderFlow \ __asm xor ebx,ebx \ __asm mov ecx,COUNT \ __asm noUnderFlow: \ __asm mov PRE,ecx \ __asm mov eax,ebx \ __asm mov edi,DST \ __asm and eax,8-1 \ __asm mov POST,eax \ __asm and ebx,0xfffffff8 \ __asm jle done \ __asm shl ebx,2 \ __asm lea ecx,[ecx*4+ebx] \ __asm neg ebx \ __asm add edx,ecx \ __asm add esi,ecx \ __asm add edi,ecx \ __asm mov eax,edx \ __asm or eax,esi // without alignment (pre==0) #define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST ) #define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST ) #define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\ __asm mov eax,COUNT \ __asm mov PRE,0 \ __asm and eax,8-1 \ __asm mov ebx,COUNT \ __asm mov POST,eax \ __asm and ebx,0xfffffff8 \ __asm je done \ __asm shl ebx,2 \ __asm mov edx,SRC0 \ __asm mov esi,SRC1 \ __asm mov edi,DST \ __asm add edx,ebx \ __asm add esi,ebx \ __asm add edi,ebx \ __asm mov eax,edx \ __asm or eax,esi \ __asm or eax,edi \ __asm neg ebx \ /* when OPER is called: edx = s0 esi = s1 edi = d ebx = index*4 xmm0 & xmm1 must not be trashed */ #define KMOVDS1( DST, SRC0 ) \ __asm movss xmm2,SRC0 \ __asm movss DST,xmm2 #define KMOVDS4( DST, SRC0 ) \ __asm movups xmm2,SRC0 \ __asm movups DST,xmm2 #define KMINDS1( DST, SRC0 ) \ __asm movss xmm2,SRC0 \ __asm minss DST,xmm2 #define KMAXDS1( DST, SRC0 ) \ __asm movss xmm2,SRC0 \ __asm maxss DST,xmm2 // general ALU operation #define KALUDSS1( OP, DST, SRC0, SRC1 ) \ __asm movss xmm2,SRC0 \ __asm OP##ss xmm2,SRC1 \ __asm movss DST,xmm2 #define KALUDSS4( OP, DST, SRC0, SRC1 ) \ __asm movups xmm2,SRC0 \ __asm movups xmm3,SRC1 \ __asm OP##ps xmm2,xmm3 \ __asm movups DST,xmm2 #define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST, SRC0, SRC1 ) #define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST, SRC0, SRC1 ) #define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST, SRC0, SRC1 ) #define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST, SRC0, SRC1 ) #define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST, SRC0, SRC1 ) #define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST, SRC0, SRC1 ) #define KDIVDSS1( DST, SRC0, SRC1 ) \ __asm movss xmm2,SRC1 \ __asm rcpss xmm3,xmm2 \ __asm mulss xmm2,xmm3 \ __asm mulss xmm2,xmm3 \ __asm addss xmm3,xmm3 \ __asm subss xmm3,xmm2 \ __asm mulss xmm3,SRC0 \ __asm movss DST,xmm3 #define KDIVDSS4( DST, SRC0, SRC1 ) \ __asm movups xmm2,SRC1 \ __asm rcpps xmm3,xmm2 \ __asm mulps xmm2,xmm3 \ __asm mulps xmm2,xmm3 \ __asm addps xmm3,xmm3 \ __asm subps 
xmm3,xmm2 \ __asm movups xmm2,SRC0 \ __asm mulps xmm3,xmm2 \ __asm movups DST,xmm3 #define KF2IDS1( SRC0 ) \ __asm movss xmm2,SRC0 \ __asm cvttps2pi mm2,xmm2 \ __asm movd [edi+ebx],mm2 #define KF2IDS4( SRC0 ) \ __asm movups xmm2,SRC0 \ __asm cvttps2pi mm2,xmm2 \ __asm movq [edi+ebx+0],mm2 \ __asm shufps xmm2,xmm2,SHUFFLE_PS(1,0,3,2) \ __asm cvttps2pi mm2,xmm2 \ __asm movq [edi+ebx+8],mm2 #define KISQRTDS1( DST,SRC0 ) \ __asm movss xmm2,SRC0 \ __asm rsqrtss xmm3,xmm2 \ __asm mulss xmm2,xmm3 \ __asm mulss xmm2,xmm3 \ __asm subss xmm2,xmm1 \ __asm mulss xmm3,xmm0 \ __asm mulss xmm3,xmm2 \ __asm movss DST,xmm3 #define KISQRTDS4( DST,SRC0 ) \ __asm movups xmm2,SRC0 \ __asm rsqrtps xmm3,xmm2 \ __asm mulps xmm2,xmm3 \ __asm mulps xmm2,xmm3 \ __asm subps xmm2,xmm1 \ __asm mulps xmm3,xmm0 \ __asm mulps xmm3,xmm2 \ __asm movups DST,xmm3 // this is used in vector4 implementation to shift constant V4 #define KANDREGDSV( DST, SRC0, VALUE ) \ __asm mov DST,SRC0 \ __asm and DST,VALUE // this is used in vector4 code to operate with float arrays as sources #define KEXPANDFLOAT( DST, SRC ) \ __asm movss DST,SRC \ __asm shufps DST,DST,0 #define KADDDS1( DST, SRC ) KADDDSS1( DST, DST, SRC ) #define KADDDS4( DST, SRC ) KADDDSS4( DST, DST, SRC ) #define KSUBDS1( DST, SRC ) KSUBDSS1( DST, DST, SRC ) #define KSUBDS4( DST, SRC ) KSUBDSS4( DST, DST, SRC ) #define KMULDS1( DST, SRC ) KMULDSS1( DST, DST, SRC ) #define KMULDS4( DST, SRC ) KMULDSS4( DST, DST, SRC ) #define KDIVDS1( DST, SRC ) KDIVDSS1( DST, DST, SRC ) #define KDIVDS4( DST, SRC ) KDIVDSS4( DST, DST, SRC ) // handles pre & post leftovers #define KFLOATOPER( OPER, OPER4, COUNT ) \ __asm mov ecx,pre \ __asm mov ebx,COUNT \ __asm cmp ebx,ecx \ __asm cmovl ecx,COUNT \ __asm test ecx,ecx \ __asm je preDone \ __asm xor ebx,ebx \ __asm lpPre: \ OPER \ __asm add ebx,4 \ __asm dec ecx \ __asm jg lpPre \ __asm preDone: \ __asm mov ecx,post \ __asm mov ebx,COUNT \ __asm sub ebx,ecx \ __asm shl ebx,2 \ __asm cmp ecx,4 \ __asm jl post4Done \ OPER4 \ __asm sub ecx,4 \ __asm add ebx,4*4 \ __asm post4Done: \ __asm test ecx,ecx \ __asm je postDone \ __asm lpPost: \ OPER \ __asm add ebx,4 \ __asm dec ecx \ __asm jg lpPost \ __asm postDone: // operate on a constant and a float array #define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \ int pre,post; \ __asm movss xmm0,CONSTANT \ __asm shufps xmm0,xmm0,0 \ KFLOATINITDS( DST, SRC, COUNT, pre, post ) \ __asm and eax,15 \ __asm jne lpNA \ __asm jmp lpA \ __asm align 16 \ __asm lpA: \ __asm prefetchnta [edx+ebx+64] \ __asm movaps xmm1,xmm0 \ __asm movaps xmm2,xmm0 \ __asm ALUOP##ps xmm1,[edx+ebx] \ __asm ALUOP##ps xmm2,[edx+ebx+16] \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpA \ __asm jmp done \ __asm align 16 \ __asm lpNA: \ __asm prefetchnta [edx+ebx+64] \ __asm movaps xmm1,xmm0 \ __asm movaps xmm2,xmm0 \ __asm movups xmm3,[edx+ebx] \ __asm movups xmm4,[edx+ebx+16] \ __asm ALUOP##ps xmm1,xmm3 \ __asm ALUOP##ps xmm2,xmm4 \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpNA \ __asm done: \ __asm mov edx,SRC \ __asm mov edi,DST \ __asm KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \ __asm KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT ) // operate on two float arrays #define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \ int pre,post; \ KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \ __asm and eax,15 \ __asm jne lpNA \ __asm jmp lpA \ __asm align 16 \ __asm lpA: \ __asm movaps xmm1,[edx+ebx] \ __asm movaps 
xmm2,[edx+ebx+16] \ __asm ALUOP##ps xmm1,[esi+ebx] \ __asm ALUOP##ps xmm2,[esi+ebx+16] \ __asm prefetchnta [edx+ebx+64] \ __asm prefetchnta [esi+ebx+64] \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpA \ __asm jmp done \ __asm align 16 \ __asm lpNA: \ __asm movups xmm1,[edx+ebx] \ __asm movups xmm2,[edx+ebx+16] \ __asm movups xmm3,[esi+ebx] \ __asm movups xmm4,[esi+ebx+16] \ __asm prefetchnta [edx+ebx+64] \ __asm prefetchnta [esi+ebx+64] \ __asm ALUOP##ps xmm1,xmm3 \ __asm ALUOP##ps xmm2,xmm4 \ __asm movaps [edi+ebx],xmm1 \ __asm movaps [edi+ebx+16],xmm2 \ __asm add ebx,16*2 \ __asm jl lpNA \ __asm done: \ __asm mov edx,SRC0 \ __asm mov esi,SRC1 \ __asm mov edi,DST \ KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \ KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT ) ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 ); ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 ); ALIGN4_INIT4( unsigned long SIMD_SP_firstSignBit, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_ZERO, IEEE_SP_ZERO ); ALIGN4_INIT1( unsigned long SIMD_SP_signBit, IEEE_SP_SIGN ); ALIGN4_INIT1( unsigned long SIMD_SP_absMask, ~IEEE_SP_SIGN ); ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, ~IEEE_SP_INF ); ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF ); ALIGN4_INIT4( unsigned long SIMD_SP_clearLast, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 ); ALIGN4_INIT4( unsigned long SIMD_SP_clearFirstThree, 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF ); ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) ); ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) ); ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) ); ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) ); ALIGN4_INIT4( unsigned long SIMD_DW_facing_mask, 1<<0, 1<<8, 1<<16, 1<<24 ); ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x0, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN ); ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x1, IEEE_SP_SIGN, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN ); ALIGN4_INIT4( unsigned long SIMD_SP_quat2mat_x2, IEEE_SP_ZERO, IEEE_SP_SIGN, IEEE_SP_SIGN, IEEE_SP_SIGN ); ALIGN4_INIT1( float SIMD_SP_zero, 0.0f ); ALIGN4_INIT1( float SIMD_SP_half, 0.5f ); ALIGN4_INIT1( float SIMD_SP_one, 1.0f ); ALIGN4_INIT1( float SIMD_SP_two, 2.0f ); ALIGN4_INIT1( float SIMD_SP_three, 3.0f ); ALIGN4_INIT1( float SIMD_SP_four, 4.0f ); ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) ); ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f ); ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI ); ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI ); ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI ); ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI ); ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY ); ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f ); ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f * 0.5f ); ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f ); ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f ); ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f ); ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f ); ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f ); ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f ); ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f ); ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f ); ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f ); ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f ); ALIGN4_INIT1( float SIMD_SP_cos_c3, 
4.16666418e-02f );
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );

ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );

ALIGN4_INIT1( float SIMD_SP_decofs, ( 1.0f / ( 32767.0f / idCompressedJointQuat::MAX_BONE_LENGTH ) ) );
ALIGN4_INIT1( float SIMD_SP_decquat, ( 1.0f / 32767.0f ) );

/*
============
SSE_Cross
============
*/
void SSE_Cross( const idVec4 &v1, const idVec4 &v2, idVec4 &result ) {
	__asm {
		mov			esi, v1
		mov			edi, v2
		mov			ecx, result
		movaps		xmm1, [edi]								// xmm1 = v2.x, v2.y, v2.z
		shufps		xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 0, 3 )	// xmm1 = v2.y, v2.z, v2.x
		mulps		xmm1, [esi]								// xmm1 = v1.x * v2.y, v1.y * v2.z, v1.z * v2.x
		movaps		xmm0, [esi]								// xmm0 = v1.x, v1.y, v1.z
		shufps		xmm0, xmm0, R_SHUFFLE_PS( 1, 2, 0, 3 )	// xmm0 = v1.y, v1.z, v1.x
		mulps		xmm0, [edi]								// xmm0 = v2.x * v1.y, v2.y * v1.z, v2.z * v1.x
		subps		xmm1, xmm0								// xmm1 = v1.x * v2.y - v2.x * v1.y, v1.y * v2.z - v2.y * v1.z, v1.z * v2.x - v2.z * v1.x
		shufps		xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 0, 3 )	// xmm1 = v1.y * v2.z - v2.y * v1.z, v1.z * v2.x - v2.z * v1.x, v1.x * v2.y - v2.x * v1.y
		movaps		[ecx], xmm1
	}
}

/*
============
SSE_Dot
============
*/
void SSE_Dot( const idVec4 &v1, const idVec4 &v2, float &result ) {
	__asm {
		mov			esi, v1
		mov			edi, v2
		mov			ecx, result
		movaps		xmm0, [esi]
		mulps		xmm0, [edi]
		movhlps		xmm1, xmm0
		addps		xmm0, xmm1
		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLE_PS( 1, 0, 0, 0 )
		addss		xmm0, xmm1
		movss		[ecx], xmm0
	}
}

/*
============
SSE_Dot4
============
*/
void SSE_Dot4( const idVec4 v1[4], const idVec4 v2[4], float result[4] ) {
	__asm {
		mov			esi, v1
		mov			edi, v2
		mov			ecx, result
		movaps		xmm0, [esi+0*16]
		mulps		xmm0, [edi+0*16]
		movaps		xmm1, [esi+1*16]
		mulps		xmm1, [edi+1*16]
		movaps		xmm2, [esi+2*16]
		mulps		xmm2, [edi+2*16]
		movaps		xmm3, [esi+3*16]
		mulps		xmm3, [edi+3*16]
		movaps		xmm4, xmm0								// xmm4 = x0, x1, x2, x3
		movlhps		xmm0, xmm1								// xmm0 = x0, x1, y0, y1
		movhlps		xmm1, xmm4								// xmm1 = x2, x3, y2, y3
		addps		xmm0, xmm1								// xmm0 = x0+x2, x1+x3, y0+y2, y1+y3
		movaps		xmm4, xmm2								// xmm4 = z0, z1, z2, z3
		movlhps		xmm2, xmm3								// xmm2 = z0, z1, w0, w1
		movhlps		xmm3, xmm4								// xmm3 = z2, z3, w2, w3
		addps		xmm2, xmm3								// xmm2 = z0+z2, z1+z3, w0+w2, w1+w3
		movaps		xmm1, xmm0								// xmm1 = x0+x2, x1+x3, y0+y2, y1+y3
		shufps		xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 0, 2 )	// xmm0 = x0+x2, y0+y2, z0+z2, w0+w2
		shufps		xmm1, xmm2, R_SHUFFLE_PS( 1, 3, 1, 3 )	// xmm1 = x1+x3, y1+y3, z1+z3, w1+w3
		addps		xmm0, xmm1								// xmm0 = x0+x1+x2+x3, y0+y1+y2+y3, z0+z1+z2+z3, w0+w1+w2+w3
		movaps		[ecx], xmm0
	}
}

/*
============
SSE_Reciprocal
============
*/
float SSE_Reciprocal( float x ) {
	float y;
	__asm {
		movss		xmm0, x
		rcpss		xmm1, xmm0
		mulss		xmm0, xmm1
		mulss		xmm0, xmm1
		addss		xmm1, xmm1
		subss		xmm1, xmm0
		movss		y, xmm1
	}
	return y;
}

/*
============
SSE_Reciprocal4
============
*/
void SSE_Reciprocal4( float x[4] ) {
	__asm {
		mov			edi, x
		movaps		xmm0, [edi]
		rcpps		xmm1, xmm0
		mulps		xmm0, xmm1
		mulps		xmm0, xmm1
		addps		xmm1, xmm1
		subps		xmm1, xmm0
		movaps		[edi], xmm1
	}
}

/*
============
SSE_ReciprocalSqrt
============
*/
float SSE_ReciprocalSqrt( float x ) {
	float y;
	__asm {
		movss		xmm0, x
		rsqrtss		xmm1, xmm0
		mulss		xmm0, xmm1
		mulss		xmm0, xmm1
		subss		xmm0, SIMD_SP_rsqrt_c0
		mulss		xmm1, SIMD_SP_rsqrt_c1
		mulss		xmm0, xmm1
		movss		y, xmm0
	}
	return y;
}
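/*
	A scalar sketch (not part of the shipped path) of the Newton-Raphson
	refinement performed by SSE_Reciprocal and SSE_ReciprocalSqrt above.
	rcpss/rsqrtss only return ~12-bit estimates; one iteration brings them
	close to full single precision. The helper name Reference_ReciprocalSqrt
	is hypothetical.
*/
#include <xmmintrin.h>

static float Reference_ReciprocalSqrt( float x ) {
	float y0;
	_mm_store_ss( &y0, _mm_rsqrt_ss( _mm_set_ss( x ) ) );	// ~12-bit estimate from rsqrtss
	// y1 = y0 * 0.5 * ( 3 - x * y0 * y0 ), written with the same constants
	// the asm uses: SIMD_SP_rsqrt_c0 = 3.0f and SIMD_SP_rsqrt_c1 = -0.5f
	return ( x * y0 * y0 - 3.0f ) * ( -0.5f * y0 );
}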
/*
============
SSE_ReciprocalSqrt4
============
*/
void SSE_ReciprocalSqrt4( float x[4] ) {
	__asm {
		mov			edi, x
		movaps		xmm0, [edi]
		rsqrtps		xmm1, xmm0
		mulps		xmm0, xmm1
		mulps		xmm0, xmm1
		subps		xmm0, SIMD_SP_rsqrt_c0
		mulps		xmm1, SIMD_SP_rsqrt_c1
		mulps		xmm0, xmm1
		movaps		[edi], xmm0
	}
}

/*
============
SSE_SinZeroHalfPI

  The angle must be between zero and half PI.
============
*/
float SSE_SinZeroHalfPI( float a ) {
#if 1

	float t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	__asm {
		movss		xmm0, a
		movss		xmm1, xmm0
		mulss		xmm1, xmm1
		movss		xmm2, SIMD_SP_sin_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0
		movss		t, xmm2
	}

	return t;

#else

	float s, t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	s = a * a;
	t = -2.39e-08f;
	t *= s;
	t += 2.7526e-06f;
	t *= s;
	t += -1.98409e-04f;
	t *= s;
	t += 8.3333315e-03f;
	t *= s;
	t += -1.666666664e-01f;
	t *= s;
	t += 1.0f;
	t *= a;

	return t;

#endif
}

/*
============
SSE_Sin4ZeroHalfPI

  The angle must be between zero and half PI.
============
*/
void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
	__asm {
		mov			edi, a
		mov			esi, s
		movaps		xmm0, [edi]
		movaps		xmm1, xmm0
		mulps		xmm1, xmm1
		movaps		xmm2, SIMD_SP_sin_c0
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c1
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c2
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c3
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c4
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_one
		mulps		xmm2, xmm0
		movaps		[esi], xmm2
	}
}

/*
============
SSE_Sin
============
*/
float SSE_Sin( float a ) {
#if 1

	float t;

	__asm {
		movss		xmm1, a
		movss		xmm2, xmm1
		movss		xmm3, xmm1
		mulss		xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2
		cmpltss		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		cvtsi2ss	xmm2, ecx
		subss		xmm2, xmm3
		mulss		xmm2, SIMD_SP_twoPI
		subss		xmm1, xmm2

		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
		subss		xmm0, xmm1					// xmm0 = PI - a
		movss		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBit		// xmm1 = signbit( PI - a )
		movss		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBit		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBit : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		movss		xmm1, xmm0
		mulss		xmm1, xmm1
		movss		xmm2, SIMD_SP_sin_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_sin_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0
		movss		t, xmm2
	}

	return t;

#else

	float s, t;

	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
	}

	a = idMath::PI - a;
	if ( fabs( a ) >= idMath::HALF_PI ) {
		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
	}

	s = a * a;
	t = -2.39e-08f;
	t *= s;
	t += 2.7526e-06f;
	t *= s;
	t += -1.98409e-04f;
	t *= s;
	t += 8.3333315e-03f;
	t *= s;
	t += -1.666666664e-01f;
	t *= s;
	t += 1.0f;
	t *= a;

	return t;

#endif
}
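/*
	A scalar sketch (hypothetical helper, not the shipped path) of the
	floor-free range reduction the asm above performs with cvttss2si and
	cmpltss: truncation rounds toward zero, so for negative angles one
	extra TWO_PI must be subtracted to match the floorf() of the C fallback.
*/
static float RangeReduceTwoPI_Ref( float a ) {
	int i = (int) ( a * ( 1.0f / idMath::TWO_PI ) );	// cvttss2si: truncate toward zero
	float k = (float) i;
	if ( a < 0.0f ) {									// the cmpltss/andps mask in the asm
		k -= 1.0f;
	}
	return a - k * idMath::TWO_PI;						// a - floorf( a / TWO_PI ) * TWO_PI
}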
/*
============
SSE_Sin4
============
*/
void SSE_Sin4( float a[4], float s[4] ) {
	__asm {
		mov			edi, a
		mov			esi, s
		movaps		xmm1, [edi]
		movaps		xmm2, xmm1
		mulps		xmm2, SIMD_SP_oneOverTwoPI
		movhlps		xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm2, R_SHUFFLE_PS( 1, 0, 0, 0 )
		shufps		xmm3, xmm3, R_SHUFFLE_PS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm3, R_SHUFFLE_PS( 1, 0, 1, 0 )
		movaps		xmm3, xmm1
		cmpltps		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		subps		xmm2, xmm3
		mulps		xmm2, SIMD_SP_twoPI
		subps		xmm1, xmm2

		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
		subps		xmm0, xmm1					// xmm0 = PI - a
		movaps		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBit		// xmm1 = signbit( PI - a )
		movaps		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBit		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBit : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		movaps		xmm1, xmm0
		mulps		xmm1, xmm1
		movaps		xmm2, SIMD_SP_sin_c0
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c1
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c2
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c3
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_sin_c4
		mulps		xmm2, xmm1
		addps		xmm2, SIMD_SP_one
		mulps		xmm2, xmm0
		movaps		[esi], xmm2
	}
}

/*
============
SSE_CosZeroHalfPI

  The angle must be between zero and half PI.
============
*/
float SSE_CosZeroHalfPI( float a ) {
#if 1

	float t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	__asm {
		movss		xmm0, a
		mulss		xmm0, xmm0
		movss		xmm1, SIMD_SP_cos_c0
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c1
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c2
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c3
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c4
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_one
		movss		t, xmm1
	}

	return t;

#else

	float s, t;

	assert( a >= 0.0f && a <= idMath::HALF_PI );

	s = a * a;
	t = -2.605e-07f;
	t *= s;
	t += 2.47609e-05f;
	t *= s;
	t += -1.3888397e-03f;
	t *= s;
	t += 4.16666418e-02f;
	t *= s;
	t += -4.999999963e-01f;
	t *= s;
	t += 1.0f;

	return t;

#endif
}
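/*
	The sin/cos routines above and below fold the angle into
	[-HALF_PI, HALF_PI] without branches: a full-lane compare mask from
	cmpnltss/cmpnltps selects values with andps/xorps. A scalar sketch of
	that select idiom (the helper name MaskSelect_Ref is hypothetical; the
	pointer casts follow the style this file already uses in SSE_ATan):
*/
static float MaskSelect_Ref( bool cond, float ifTrue, float ifFalse ) {
	unsigned long mask = cond ? 0xFFFFFFFFUL : 0x00000000UL;	// what the compare instruction produces
	unsigned long r = ( *(unsigned long *)&ifTrue & mask ) |
					  ( *(unsigned long *)&ifFalse & ~mask );	// andps / andnps + orps
	return *(float *)&r;
}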
/*
============
SSE_Cos4ZeroHalfPI

  The angle must be between zero and half PI.
============
*/
void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
	__asm {
		mov			edi, a
		mov			esi, c
		movaps		xmm0, [edi]
		mulps		xmm0, xmm0
		movaps		xmm1, SIMD_SP_cos_c0
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c1
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c2
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c3
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c4
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_one
		movaps		[esi], xmm1
	}
}

/*
============
SSE_Cos
============
*/
float SSE_Cos( float a ) {
#if 1

	float t;

	__asm {
		movss		xmm1, a
		movss		xmm2, xmm1
		movss		xmm3, xmm1
		mulss		xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2
		cmpltss		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		cvtsi2ss	xmm2, ecx
		subss		xmm2, xmm3
		mulss		xmm2, SIMD_SP_twoPI
		subss		xmm1, xmm2

		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
		subss		xmm0, xmm1					// xmm0 = PI - a
		movss		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBit		// xmm1 = signbit( PI - a )
		movss		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBit		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBit : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		mulss		xmm0, xmm0
		movss		xmm1, SIMD_SP_cos_c0
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c1
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c2
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c3
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_cos_c4
		mulss		xmm1, xmm0
		addss		xmm1, SIMD_SP_one
		xorps		xmm2, SIMD_SP_signBit
		xorps		xmm1, xmm2
		movss		t, xmm1
	}

	return t;

#else

	float s, t, d;

	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
	}

	a = idMath::PI - a;
	if ( fabs( a ) >= idMath::HALF_PI ) {
		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
		d = 1.0f;
	} else {
		d = -1.0f;
	}

	s = a * a;
	t = -2.605e-07f;
	t *= s;
	t += 2.47609e-05f;
	t *= s;
	t += -1.3888397e-03f;
	t *= s;
	t += 4.16666418e-02f;
	t *= s;
	t += -4.999999963e-01f;
	t *= s;
	t += 1.0f;
	t *= d;

	return t;

#endif
}
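/*
	SSE_Cos4 and SSE_SinCos4 below truncate all four lanes by rotating each
	lane into slot 0 for scalar cvttss2si, because SSE1 has no packed
	float-to-int conversion into XMM registers (cvttps2dq is SSE2). A scalar
	sketch of what the shuffle/convert dance computes per lane (hypothetical
	helper, not the shipped path):
*/
static void TruncateQuotient4_Ref( const float a[4], float k[4] ) {
	for ( int i = 0; i < 4; i++ ) {
		k[i] = (float) (int) ( a[i] * ( 1.0f / idMath::TWO_PI ) );	// truncate toward zero, like cvttss2si
	}
}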
/*
============
SSE_Cos4
============
*/
void SSE_Cos4( float a[4], float c[4] ) {
	__asm {
		mov			edi, a
		mov			esi, c
		movaps		xmm1, [edi]
		movaps		xmm2, xmm1
		mulps		xmm2, SIMD_SP_oneOverTwoPI
		movhlps		xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm2, R_SHUFFLE_PS( 1, 0, 0, 0 )
		shufps		xmm3, xmm3, R_SHUFFLE_PS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm3, R_SHUFFLE_PS( 1, 0, 1, 0 )
		movaps		xmm3, xmm1
		cmpltps		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		subps		xmm2, xmm3
		mulps		xmm2, SIMD_SP_twoPI
		subps		xmm1, xmm2

		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
		subps		xmm0, xmm1					// xmm0 = PI - a
		movaps		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBit		// xmm1 = signbit( PI - a )
		movaps		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBit		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBit : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		mulps		xmm0, xmm0
		movaps		xmm1, SIMD_SP_cos_c0
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c1
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c2
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c3
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_cos_c4
		mulps		xmm1, xmm0
		addps		xmm1, SIMD_SP_one
		xorps		xmm2, SIMD_SP_signBit
		xorps		xmm1, xmm2
		movaps		[esi], xmm1
	}
}

/*
============
SSE_SinCos
============
*/
void SSE_SinCos( float a, float &s, float &c ) {
	__asm {
		mov			edi, s
		mov			esi, c
		movss		xmm1, a
		movss		xmm2, xmm1
		movss		xmm3, xmm1
		mulss		xmm2, SIMD_SP_oneOverTwoPI
		cvttss2si	ecx, xmm2
		cmpltss		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		cvtsi2ss	xmm2, ecx
		subss		xmm2, xmm3
		mulss		xmm2, SIMD_SP_twoPI
		subss		xmm1, xmm2

		movss		xmm0, SIMD_SP_PI			// xmm0 = PI
		subss		xmm0, xmm1					// xmm0 = PI - a
		movss		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBit		// xmm1 = signbit( PI - a )
		movss		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltss	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movss		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBit		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBit : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		movss		xmm1, xmm0
		mulss		xmm1, xmm1
		movss		xmm3, SIMD_SP_sin_c0
		movss		xmm4, SIMD_SP_cos_c0
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c1
		addss		xmm4, SIMD_SP_cos_c1
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c2
		addss		xmm4, SIMD_SP_cos_c2
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c3
		addss		xmm4, SIMD_SP_cos_c3
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_sin_c4
		addss		xmm4, SIMD_SP_cos_c4
		mulss		xmm3, xmm1
		mulss		xmm4, xmm1
		addss		xmm3, SIMD_SP_one
		addss		xmm4, SIMD_SP_one
		mulss		xmm3, xmm0
		xorps		xmm2, SIMD_SP_signBit
		xorps		xmm4, xmm2
		movss		[edi], xmm3
		movss		[esi], xmm4
	}
}

/*
============
SSE_SinCos4
============
*/
void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
	__asm {
		mov			eax, a
		mov			edi, s
		mov			esi, c
		movaps		xmm1, [eax]
		movaps		xmm2, xmm1
		mulps		xmm2, SIMD_SP_oneOverTwoPI
		movhlps		xmm3, xmm2
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm2, R_SHUFFLE_PS( 1, 0, 0, 0 )
		shufps		xmm3, xmm3, R_SHUFFLE_PS( 1, 0, 0, 0 )
		cvttss2si	ecx, xmm2
		cvtsi2ss	xmm2, ecx
		cvttss2si	edx, xmm3
		cvtsi2ss	xmm3, edx
		shufps		xmm2, xmm3, R_SHUFFLE_PS( 1, 0, 1, 0 )
		movaps		xmm3, xmm1
		cmpltps		xmm3, SIMD_SP_zero
		andps		xmm3, SIMD_SP_one
		subps		xmm2, xmm3
		mulps		xmm2, SIMD_SP_twoPI
		subps		xmm1, xmm2

		movaps		xmm0, SIMD_SP_PI			// xmm0 = PI
		subps		xmm0, xmm1					// xmm0 = PI - a
		movaps		xmm1, xmm0					// xmm1 = PI - a
		andps		xmm1, SIMD_SP_signBit		// xmm1 = signbit( PI - a )
		movaps		xmm2, xmm0					// xmm2 = PI - a
		xorps		xmm2, xmm1					// xmm2 = fabs( PI - a )
		cmpnltps	xmm2, SIMD_SP_halfPI		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
		movaps		xmm3, SIMD_SP_PI			// xmm3 = PI
		xorps		xmm3, xmm1					// xmm3 = PI ^ signbit( PI - a )
		andps		xmm3, xmm2					// xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
		andps		xmm2, SIMD_SP_signBit		// xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBit : 0.0f
		xorps		xmm0, xmm2
		addps		xmm0, xmm3

		movaps		xmm1, xmm0
		mulps		xmm1, xmm1
		movaps		xmm3, SIMD_SP_sin_c0
		movaps		xmm4, SIMD_SP_cos_c0
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c1
		addps		xmm4, SIMD_SP_cos_c1
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c2
		addps		xmm4, SIMD_SP_cos_c2
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c3
		addps		xmm4, SIMD_SP_cos_c3
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_sin_c4
		addps		xmm4, SIMD_SP_cos_c4
		mulps		xmm3, xmm1
		mulps		xmm4, xmm1
		addps		xmm3, SIMD_SP_one
		addps		xmm4, SIMD_SP_one
		mulps		xmm3, xmm0
		xorps		xmm2, SIMD_SP_signBit
		xorps		xmm4, xmm2
		movaps		[edi], xmm3
		movaps		[esi], xmm4
	}
}
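/*
	A combined scalar reference for SSE_SinCos above (hypothetical helper,
	mirroring the #else fallbacks of SSE_Sin and SSE_Cos): one range
	reduction feeds both polynomials, and only the cosine result takes the
	quadrant sign flip.
*/
static void SinCos_Ref( float a, float &s, float &c ) {
	float d, t;

	if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
		a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
	}
	a = idMath::PI - a;
	if ( fabs( a ) >= idMath::HALF_PI ) {
		a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
		d = 1.0f;
	} else {
		d = -1.0f;
	}
	t = a * a;
	// same coefficients as SIMD_SP_sin_c* and SIMD_SP_cos_c*, in Horner form
	s = a * ( ( ( ( ( -2.39e-08f * t + 2.7526e-06f ) * t - 1.98409e-04f ) * t
			+ 8.3333315e-03f ) * t - 1.666666664e-01f ) * t + 1.0f );
	c = d * ( ( ( ( ( -2.605e-07f * t + 2.47609e-05f ) * t - 1.3888397e-03f ) * t
			+ 4.16666418e-02f ) * t - 4.999999963e-01f ) * t + 1.0f );
}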
/*
============
SSE_ATanPositive

  Both 'x' and 'y' must be positive.
============
*/
float SSE_ATanPositive( float y, float x ) {
#if 1

	float t;

	assert( y >= 0.0f && x >= 0.0f );

	__asm {
		movss		xmm0, x
		movss		xmm3, xmm0
		movss		xmm1, y
		minss		xmm0, xmm1
		maxss		xmm1, xmm3
		cmpeqss		xmm3, xmm0
		rcpss		xmm2, xmm1
		mulss		xmm1, xmm2
		mulss		xmm1, xmm2
		addss		xmm2, xmm2
		subss		xmm2, xmm1				// xmm2 = 1 / y or 1 / x
		mulss		xmm0, xmm2				// xmm0 = x / y or y / x
		movss		xmm1, xmm3
		andps		xmm1, SIMD_SP_signBit
		xorps		xmm0, xmm1				// xmm0 = -x / y or y / x
		andps		xmm3, SIMD_SP_halfPI	// xmm3 = HALF_PI or 0.0f
		movss		xmm1, xmm0
		mulss		xmm1, xmm1				// xmm1 = s
		movss		xmm2, SIMD_SP_atan_c0
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c1
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c2
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c3
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c4
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c5
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c6
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_atan_c7
		mulss		xmm2, xmm1
		addss		xmm2, SIMD_SP_one
		mulss		xmm2, xmm0
		addss		xmm2, xmm3
		movss		t, xmm2
	}

	return t;

#else

	float a, d, s, t;

	assert( y >= 0.0f && x >= 0.0f );

	if ( y > x ) {
		a = -x / y;
		d = idMath::HALF_PI;
	} else {
		a = y / x;
		d = 0.0f;
	}
	s = a * a;
	t = 0.0028662257f;
	t *= s;
	t += -0.0161657367f;
	t *= s;
	t += 0.0429096138f;
	t *= s;
	t += -0.0752896400f;
	t *= s;
	t += 0.1065626393f;
	t *= s;
	t += -0.1420889944f;
	t *= s;
	t += 0.1999355085f;
	t *= s;
	t += -0.3333314528f;
	t *= s;
	t += 1.0f;
	t *= a;
	t += d;

	return t;

#endif
}

/*
============
SSE_ATan4Positive

  Both 'x' and 'y' must be positive.
============ */ void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) { __asm { mov esi, x mov edi, y mov edx, at movaps xmm0, [esi] movaps xmm3, xmm0 movaps xmm1, [edi] minps xmm0, xmm1 maxps xmm1, xmm3 cmpeqps xmm3, xmm0 rcpps xmm2, xmm1 mulps xmm1, xmm2 mulps xmm1, xmm2 addps xmm2, xmm2 subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x mulps xmm0, xmm2 // xmm0 = x / y or y / x movaps xmm1, xmm3 andps xmm1, SIMD_SP_signBit xorps xmm0, xmm1 // xmm0 = -x / y or y / x andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f movaps xmm1, xmm0 mulps xmm1, xmm1 // xmm1 = s movaps xmm2, SIMD_SP_atan_c0 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c1 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c2 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c3 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c4 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c5 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c6 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c7 mulps xmm2, xmm1 addps xmm2, SIMD_SP_one mulps xmm2, xmm0 addps xmm2, xmm3 movaps [edx], xmm2 } } /* ============ SSE_ATan ============ */ float SSE_ATan( float y, float x ) { #if 1 float t; __asm { movss xmm0, x movss xmm3, xmm0 movss xmm4, xmm0 andps xmm0, SIMD_SP_absMask movss xmm1, y xorps xmm4, xmm1 andps xmm1, SIMD_SP_absMask andps xmm4, SIMD_SP_signBit minss xmm0, xmm1 maxss xmm1, xmm3 cmpeqss xmm3, xmm0 rcpss xmm2, xmm1 mulss xmm1, xmm2 mulss xmm1, xmm2 addss xmm2, xmm2 subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x mulss xmm0, xmm2 // xmm0 = x / y or y / x xorps xmm0, xmm4 movss xmm1, xmm3 andps xmm1, SIMD_SP_signBit xorps xmm0, xmm1 // xmm0 = -x / y or y / x orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f movss xmm1, xmm0 mulss xmm1, xmm1 // xmm1 = s movss xmm2, SIMD_SP_atan_c0 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c1 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c2 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c3 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c4 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c5 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c6 mulss xmm2, xmm1 addss xmm2, SIMD_SP_atan_c7 mulss xmm2, xmm1 addss xmm2, SIMD_SP_one mulss xmm2, xmm0 addss xmm2, xmm3 movss t, xmm2 } return t; #else float a, d, s, t; if ( fabs( y ) > fabs( x ) ) { a = -x / y; d = idMath::HALF_PI; *((unsigned long *)&d) ^= ( *((unsigned long *)&x) ^ *((unsigned long *)&y) ) & (1<<31); } else { a = y / x; d = 0.0f; } s = a * a; t = 0.0028662257f; t *= s; t += -0.0161657367f; t *= s; t += 0.0429096138f; t *= s; t += -0.0752896400f; t *= s; t += 0.1065626393f; t *= s; t += -0.1420889944f; t *= s; t += 0.1999355085f; t *= s; t += -0.3333314528f; t *= s; t += 1.0f; t *= a; t += d; return t; #endif } /* ============ SSE_ATan4 ============ */ void SSE_ATan4( float y[4], float x[4], float at[4] ) { __asm { mov esi, x mov edi, y mov edx, at movaps xmm0, [esi] movaps xmm3, xmm0 movaps xmm4, xmm0 andps xmm0, SIMD_SP_absMask movaps xmm1, [edi] xorps xmm4, xmm1 andps xmm1, SIMD_SP_absMask andps xmm4, SIMD_SP_signBit minps xmm0, xmm1 maxps xmm1, xmm3 cmpeqps xmm3, xmm0 rcpps xmm2, xmm1 mulps xmm1, xmm2 mulps xmm1, xmm2 addps xmm2, xmm2 subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x mulps xmm0, xmm2 // xmm0 = x / y or y / x xorps xmm0, xmm4 movaps xmm1, xmm3 andps xmm1, SIMD_SP_signBit xorps xmm0, xmm1 // xmm0 = -x / y or y / x orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f movaps xmm1, xmm0 mulps xmm1, xmm1 // xmm1 = s movaps xmm2, SIMD_SP_atan_c0 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c1 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c2 
mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c3 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c4 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c5 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c6 mulps xmm2, xmm1 addps xmm2, SIMD_SP_atan_c7 mulps xmm2, xmm1 addps xmm2, SIMD_SP_one mulps xmm2, xmm0 addps xmm2, xmm3 movaps [edx], xmm2 } } /* ============ SSE_TestTrigonometry ============ */ void SSE_TestTrigonometry( void ) { int i; float a, s1, s2, c1, c2; for ( i = 0; i < 100; i++ ) { a = i * idMath::HALF_PI / 100.0f; s1 = sin( a ); s2 = SSE_SinZeroHalfPI( a ); if ( fabs( s1 - s2 ) > 1e-7f ) { assert( 0 ); } c1 = cos( a ); c2 = SSE_CosZeroHalfPI( a ); if ( fabs( c1 - c2 ) > 1e-7f ) { assert( 0 ); } } for ( i = -200; i < 200; i++ ) { a = i * idMath::TWO_PI / 100.0f; s1 = sin( a ); s2 = SSE_Sin( a ); if ( fabs( s1 - s2 ) > 1e-6f ) { assert( 0 ); } c1 = cos( a ); c2 = SSE_Cos( a ); if ( fabs( c1 - c2 ) > 1e-6f ) { assert( 0 ); } SSE_SinCos( a, s2, c2 ); if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) { assert( 0 ); } } } /* ============ idSIMD_SSE::GetName ============ */ const char * idSIMD_SSE::GetName( void ) const { return "MMX & SSE"; } /* ============ idSIMD_SSE::Add dst[i] = constant + src[i]; ============ */ void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) { KFLOAT_CA( add, dst, src, constant, count ) } /* ============ idSIMD_SSE::Add dst[i] = src0[i] + src1[i]; ============ */ void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) { KFLOAT_AA( add, dst, src0, src1, count ) } /* ============ idSIMD_SSE::Sub dst[i] = constant - src[i]; ============ */ void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) { KFLOAT_CA( sub, dst, src, constant, count ) } /* ============ idSIMD_SSE::Sub dst[i] = src0[i] - src1[i]; ============ */ void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) { KFLOAT_AA( sub, dst, src0, src1, count ) } /* ============ idSIMD_SSE::Mul dst[i] = constant * src[i]; ============ */ void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) { KFLOAT_CA( mul, dst, src, constant, count ) } /* ============ idSIMD_SSE::Mul dst[i] = src0[i] * src1[i]; ============ */ void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) { KFLOAT_AA( mul, dst, src0, src1, count ) } /* ============ idSIMD_SSE::Div dst[i] = constant / src[i]; ============ */ void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) { int pre, post; // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x)); __asm { movss xmm1,constant shufps xmm1,xmm1,0 KFLOATINITDS( dst, src, count, pre, post ) and eax,15 jne lpNA jmp lpA align 16 lpA: movaps xmm2,[edx+ebx] movaps xmm3,[edx+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [edx+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 mulps xmm4,xmm1 mulps xmm5,xmm1 movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpA jmp done align 16 lpNA: movups xmm2,[edx+ebx] movups xmm3,[edx+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [edx+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 mulps xmm4,xmm1 mulps xmm5,xmm1 movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpNA done: mov 
edx,src mov edi,dst KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ), KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count ) } } /* ============ idSIMD_SSE::Div dst[i] = src0[i] / src1[i]; ============ */ void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) { int pre,post; // 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x)); __asm { KFLOATINITDSS( dst, src0, src1, count, pre, post ) and eax,15 jne lpNA jmp lpA align 16 lpA: movaps xmm2,[esi+ebx] movaps xmm3,[esi+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [esi+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 mulps xmm4,[edx+ebx] mulps xmm5,[edx+ebx+16] movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpA jmp done align 16 lpNA: movups xmm2,[esi+ebx] movups xmm3,[esi+ebx+16] rcpps xmm4,xmm2 rcpps xmm5,xmm3 prefetchnta [esi+ebx+64] mulps xmm2,xmm4 mulps xmm2,xmm4 mulps xmm3,xmm5 mulps xmm3,xmm5 addps xmm4,xmm4 addps xmm5,xmm5 subps xmm4,xmm2 subps xmm5,xmm3 movups xmm2,[edx+ebx] movups xmm3,[edx+ebx+16] mulps xmm4,xmm2 mulps xmm5,xmm3 movaps [edi+ebx],xmm4 movaps [edi+ebx+16],xmm5 add ebx,16*2 jl lpNA done: mov edx,src0 mov esi,src1 mov edi,dst KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ), KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count ) } } /* ============ Simd_MulAdd assumes count >= 7 ============ */ static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) { __asm mov esi, dst __asm mov edi, src __asm mov eax, count __asm shl eax, 2 __asm mov ecx, esi __asm mov edx, eax __asm or ecx, edi __asm fld constant __asm and ecx, 15 __asm jz SimdMulAdd16 __asm and ecx, 3 __asm jnz SimdMulAdd8 __asm mov ecx, esi __asm xor ecx, edi __asm and ecx, 15 __asm jnz MulAdd8 __asm mov ecx, esi __asm and ecx, 15 __asm neg ecx __asm add ecx, 16 __asm sub eax, ecx __asm add edi, ecx __asm add esi, ecx __asm neg ecx __asm mov edx, eax __asm loopPreMulAdd16: __asm fld st __asm fmul dword ptr [edi+ecx] __asm fadd dword ptr [esi+ecx] __asm fstp dword ptr [esi+ecx] __asm add ecx, 4 __asm jl loopPreMulAdd16 __asm SimdMulAdd16: __asm and eax, ~15 __asm movss xmm1, constant __asm shufps xmm1, xmm1, 0x00 __asm add esi, eax __asm add edi, eax __asm neg eax __asm align 16 __asm loopMulAdd16: __asm movaps xmm0, [edi+eax] __asm mulps xmm0, xmm1 __asm addps xmm0, [esi+eax] __asm movaps [esi+eax], xmm0 __asm add eax, 16 __asm jl loopMulAdd16 __asm jmp postMulAdd __asm MulAdd8: __asm mov ecx, esi __asm and ecx, 7 __asm jz SimdMulAdd8 __asm sub eax, ecx __asm add esi, ecx __asm add edi, ecx __asm neg ecx __asm mov edx, eax __asm loopPreMulAdd8: __asm fld st __asm fmul dword ptr [edi+ecx] __asm fadd dword ptr [esi+ecx] __asm fstp dword ptr [esi+ecx] __asm add ecx, 4 __asm jl loopPreMulAdd8 __asm SimdMulAdd8: __asm and eax, ~15 __asm movss xmm1, constant __asm shufps xmm1, xmm1, 0x00 __asm add esi, eax __asm add edi, eax __asm neg eax __asm align 16 __asm loopMulAdd8: __asm movlps xmm0, [edi+eax] __asm movhps xmm0, [edi+eax+8] __asm mulps xmm0, xmm1 __asm movlps xmm2, [esi+eax] __asm movhps xmm2, [esi+eax+8] __asm addps xmm0, xmm2 __asm movlps [esi+eax], xmm0 __asm movhps [esi+eax+8], xmm0 __asm add eax, 16 __asm jl loopMulAdd8 __asm jmp postMulAdd __asm postMulAdd: __asm and edx, 15 __asm jz MulAddDone __asm add esi, edx __asm add edi, edx __asm neg edx __asm loopPostMulAdd: __asm fld st __asm fmul dword ptr [edi+edx] __asm fadd dword ptr [esi+edx] __asm fstp dword ptr [esi+edx] __asm add 
edx, 4
	__asm jl loopPostMulAdd
	__asm MulAddDone:
	__asm fstp st
}

#define MULADD_FEW( OPER ) \
	switch( count ) { \
		case 0: \
			return; \
		case 1: \
			dst[0] OPER c * src[0]; \
			return; \
		case 2: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
			return; \
		case 3: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
			return; \
		case 4: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			return; \
		case 5: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; \
			return; \
		case 6: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
			return; \
		case 7: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
			return; \
		case 8: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			return; \
		case 9: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; \
			return; \
		case 10: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
			return; \
		case 11: \
			dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
			dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
			dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
			return; \
	}

/*
============
idSIMD_SSE::MulAdd

  dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	float c = constant;
	MULADD_FEW( += )
	Simd_MulAdd( dst, constant, src, count );
}

/*
============
idSIMD_SSE::MulAdd

  dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] += src0[i] * src1[i];
	}
}

/*
============
idSIMD_SSE::MulSub

  dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
	float c = constant;
	MULADD_FEW( -= )
	Simd_MulAdd( dst, -constant, src, count );
}

/*
============
idSIMD_SSE::MulSub

  dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] -= src0[i] * src1[i];
	}
}

/*
============
idSIMD_SSE::Dot

  dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 )
		jz			done4
		imul		eax, 12
		add			esi, eax
		neg			eax

	loop4:
		movlps		xmm1,
[esi+eax+ 0] movlps xmm2, [esi+eax+ 8] movlps xmm3, [esi+eax+16] movhps xmm1, [esi+eax+24] movhps xmm2, [esi+eax+32] movhps xmm3, [esi+eax+40] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLE_PS( 1, 3, 0, 2 ) shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 2, 1, 3 ) add ecx, 16 add eax, 4*12 mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 2, 1, 3 ) movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loop4 done4: and edx, 3 jz done1 loop1: movss xmm0, [esi+eax+0] movss xmm1, [esi+eax+4] movss xmm2, [esi+eax+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 add ecx, 4 addss xmm0, xmm1 add eax, 12 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loop1 done1: } } /* ============ idSIMD_SSE::Dot dst[i] = constant * src[i].Normal() + src[i][3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) { __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm5, [edi+0] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [edi+4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm7, [edi+8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) jz startVert1 imul eax, 16 add esi, eax neg eax loopVert4: movlps xmm1, [esi+eax+ 0] movlps xmm3, [esi+eax+ 8] movhps xmm1, [esi+eax+16] movhps xmm3, [esi+eax+24] movlps xmm2, [esi+eax+32] movlps xmm4, [esi+eax+40] movhps xmm2, [esi+eax+48] movhps xmm4, [esi+eax+56] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm1, xmm2, R_SHUFFLE_PS( 1, 3, 1, 3 ) movaps xmm2, xmm3 shufps xmm2, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm3, xmm4, R_SHUFFLE_PS( 1, 3, 1, 3 ) add ecx, 16 add eax, 4*16 mulps xmm0, xmm5 mulps xmm1, xmm6 mulps xmm2, xmm7 addps xmm0, xmm3 addps xmm0, xmm1 addps xmm0, xmm2 movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loopVert4 startVert1: and edx, 3 jz done loopVert1: movss xmm0, [esi+eax+0] movss xmm1, [esi+eax+4] movss xmm2, [esi+eax+8] mulss xmm0, xmm5 mulss xmm1, xmm6 mulss xmm2, xmm7 addss xmm0, [esi+eax+12] add ecx, 4 addss xmm0, xmm1 add eax, 16 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loopVert1 done: } } /* ============ idSIMD_SSE::Dot dst[i] = constant * src[i].xyz; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) { assert_16_byte_aligned( src ); __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm5, [edi+4] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [edi+8] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) jz startVert1 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert4: movlps xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, X, X */ movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, 3, X, X */ movhps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, 4, 5 */ movhps xmm2, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, 3, 6, 7 */ movlps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm3 = 8, 9, X, X */ movss xmm7, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm3 = 10, 11, X, X */ movhps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm3 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm3 = 10, 11, 14, 15 */ movaps xmm0, xmm1 /* xmm0 = 0, 1, 4, 5 */ 
shufps xmm0, xmm3, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm0 = 0, 4, 8, 12 */ shufps xmm1, xmm3, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm1 = 1, 5, 9, 13 */ shufps xmm2, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm2 = 2, 6, 10, 14 */ add ecx, 16 add eax, 4*DRAWVERT_SIZE mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loopVert4 startVert1: and edx, 3 jz done loopVert1: movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 add ecx, 4 addss xmm0, xmm1 add eax, DRAWVERT_SIZE addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loopVert1 done: } } /* ============ idSIMD_SSE::Dot dst[i] = constant.Normal() * src[i] + constant[3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) { __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm5, [edi+4] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [edi+8] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm7, [edi+12] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) jz done4 imul eax, 12 add esi, eax neg eax loop4: movlps xmm1, [esi+eax+ 0] movlps xmm2, [esi+eax+ 8] movlps xmm3, [esi+eax+16] movhps xmm1, [esi+eax+24] movhps xmm2, [esi+eax+32] movhps xmm3, [esi+eax+40] movaps xmm0, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLE_PS( 1, 3, 0, 2 ) shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 2, 1, 3 ) add ecx, 16 add eax, 4*12 mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm7 addps xmm0, xmm1 addps xmm0, xmm2 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 2, 1, 3 ) movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loop4 done4: and edx, 3 jz done1 loop1: movss xmm0, [esi+eax+0] movss xmm1, [esi+eax+4] movss xmm2, [esi+eax+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 addss xmm0, xmm7 add ecx, 4 addss xmm0, xmm1 add eax, 12 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loop1 done1: } } /* ============ idSIMD_SSE::Dot dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) { #define SINGLE_OP(SRC, DEST) \ __asm movlps xmm0,[SRC] \ __asm movlps xmm1,[SRC+8] \ __asm mulps xmm0,xmm4 \ __asm mulps xmm1,xmm5 \ __asm addps xmm0,xmm1 \ __asm movaps xmm1,xmm0 \ __asm shufps xmm1,xmm1,SHUFFLE_PS(1,1,1,1) \ __asm addss xmm0,xmm1 \ __asm movss [DEST],xmm0 \ __asm add SRC,16 \ __asm add DEST,4 #define DUAL_OP(SRC, DEST) \ __asm movlps xmm0,[SRC] \ __asm movlps xmm1,[SRC+8] \ __asm movhps xmm0,[SRC+16] \ __asm movhps xmm1,[SRC+24] \ __asm mulps xmm0,xmm4 \ __asm mulps xmm1,xmm5 \ __asm addps xmm0,xmm1 \ __asm shufps xmm1,xmm0,SHUFFLE_PS(2,0,1,0) \ __asm shufps xmm0,xmm0,SHUFFLE_PS(3,1,2,0) \ __asm addps xmm0,xmm1 \ __asm movhps [DEST],xmm0 \ __asm add SRC,32 \ __asm add DEST,8 __asm { mov edx, dst mov eax, src mov ebx, constant mov ecx, count movlps xmm4, [ebx] shufps xmm4, xmm4, SHUFFLE_PS(1,0,1,0) movlps xmm5, [ebx+8] shufps xmm5, xmm5, SHUFFLE_PS(1,0,1,0) xorps xmm0, xmm0 xorps xmm1, xmm1 _lpAlignDest: test edx, 0x0f jz _destAligned SINGLE_OP(eax,edx) dec ecx jnz _lpAlignDest jmp _vpExit _destAligned: push ecx cmp ecx, 4 jl _post and ecx, ~3 shl ecx, 2 lea eax, [eax+ecx*4] add edx, ecx neg ecx movlps xmm0, 
[eax+ecx*4] movhps xmm0, [eax+ecx*4+16] movlps xmm2, [eax+ecx*4+32] movhps xmm2, [eax+ecx*4+48] jmp _lpStart align 16 _lp: prefetchnta [eax+ecx*4+128] addps xmm1, xmm0 movlps xmm0, [eax+ecx*4] movhps xmm0, [eax+ecx*4+16] movlps xmm2, [eax+ecx*4+32] movhps xmm2, [eax+ecx*4+48] movaps [edx+ecx-16],xmm1 _lpStart: movlps xmm1, [eax+ecx*4+8] movhps xmm1, [eax+ecx*4+24] movlps xmm3, [eax+ecx*4+40] movhps xmm3, [eax+ecx*4+56] add ecx, 16 mulps xmm1, xmm5 mulps xmm2, xmm4 mulps xmm3, xmm5 addps xmm2, xmm3 // y3+w3 x3+z3 y2+w2 x2+z2 mulps xmm0, xmm4 addps xmm0, xmm1 // y1+w1 x1+z1 y0+w0 x0+z0 movaps xmm1, xmm0 shufps xmm0, xmm2, SHUFFLE_PS(2,0,2,0) // x3+z3 x2+z2 x1+z1 x0+z0 shufps xmm1, xmm2, SHUFFLE_PS(3,1,3,1) // y3+w3 y2+w2 y1+w1 y0+w0 js _lp addps xmm1, xmm0 movaps [edx+ecx-16], xmm1 _post: pop ecx and ecx, 0x3 cmp ecx, 2 jl _post1 DUAL_OP(eax,edx) sub ecx, 2 _post1: cmp ecx, 1 jne _vpExit SINGLE_OP(eax,edx) _vpExit: } #undef DUAL_OP #undef SINGLE_OP } /* ============ idSIMD_SSE::Dot dst[i] = constant.Normal() * src[i].xyz + constant[3]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) { assert_16_byte_aligned( src ); __asm { mov eax, count mov edi, constant mov edx, eax mov esi, src mov ecx, dst and eax, ~3 movss xmm4, [edi+0] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm5, [edi+4] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [edi+8] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm7, [edi+12] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) jz startVert1 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert4: movlps xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, X, X */ movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, 3, X, X */ movhps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, 4, 5 */ movhps xmm2, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, 3, 6, 7 */ movlps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm3 = 8, 9, X, X */ movhps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] /* xmm3 = 8, 9, 12, 13 */ movaps xmm0, xmm1 /* xmm0 = 0, 1, 4, 5 */ shufps xmm0, xmm3, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm0 = 0, 4, 8, 12 */ shufps xmm1, xmm3, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm1 = 1, 5, 9, 13 */ movss xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm3 = 10, 11, X, X */ movhps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] /* xmm3 = 10, 11, 14, 15 */ shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm2 = 2, 6, 10, 14 */ add ecx, 16 add eax, 4*DRAWVERT_SIZE mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 addps xmm0, xmm7 addps xmm0, xmm1 addps xmm0, xmm2 movlps [ecx-16+0], xmm0 movhps [ecx-16+8], xmm0 jl loopVert4 startVert1: and edx, 3 jz done loopVert1: movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulss xmm0, xmm4 mulss xmm1, xmm5 mulss xmm2, xmm6 addss xmm0, xmm7 add ecx, 4 addss xmm0, xmm1 add eax, DRAWVERT_SIZE addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loopVert1 done: } } /* ============ idSIMD_SSE::Dot dst[i] = src0[i] * src1[i]; ============ */ void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) { __asm { mov eax, count mov edi, src0 mov edx, eax mov esi, src1 mov ecx, dst and eax, ~3 jz done4 imul eax, 12 add edi, eax add esi, eax neg eax loop4: movlps xmm0, [esi+eax] // 0, 1, X, X movlps xmm3, [edi+eax] // 0, 1, X, X movlps xmm1, 
[esi+eax+8] // 2, 3, X, X movlps xmm4, [edi+eax+8] // 2, 3, X, X movhps xmm0, [esi+eax+24] // 0, 1, 6, 7 movhps xmm3, [edi+eax+24] // 0, 1, 6, 7 movhps xmm1, [esi+eax+32] // 2, 3, 8, 9 movhps xmm4, [edi+eax+32] // 2, 3, 8, 9 movlps xmm2, [esi+eax+16] // 4, 5, X, X movlps xmm5, [edi+eax+16] // 4, 5, X, X movhps xmm2, [esi+eax+40] // 4, 5, 10, 11 movhps xmm5, [edi+eax+40] // 4, 5, 10, 11 add ecx, 16 add eax, 48 mulps xmm0, xmm3 mulps xmm1, xmm4 mulps xmm2, xmm5 movaps xmm7, xmm0 shufps xmm7, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) // 0, 6, 3, 9 shufps xmm0, xmm2, R_SHUFFLE_PS( 1, 3, 0, 2 ) // 1, 7, 4, 10 shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 2, 1, 3 ) // 2, 8, 5, 11 addps xmm7, xmm0 addps xmm7, xmm1 shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 2, 1, 3 ) movlps [ecx-16+0], xmm7 movhps [ecx-16+8], xmm7 jl loop4 done4: and edx, 3 jz done1 loop1: movss xmm0, [esi+eax+0] movss xmm3, [edi+eax+0] movss xmm1, [esi+eax+4] movss xmm4, [edi+eax+4] movss xmm2, [esi+eax+8] movss xmm5, [edi+eax+8] mulss xmm0, xmm3 mulss xmm1, xmm4 mulss xmm2, xmm5 add ecx, 4 addss xmm0, xmm1 add eax, 12 addss xmm0, xmm2 dec edx movss [ecx-4], xmm0 jnz loop1 done1: } } /* ============ idSIMD_SSE::Dot dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ... ============ */ void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) { switch( count ) { case 0: dot = 0.0f; return; case 1: dot = src1[0] * src2[0]; return; case 2: dot = src1[0] * src2[0] + src1[1] * src2[1]; return; case 3: dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2]; return; default: __asm { mov ecx, src1 mov edx, src2 mov eax, ecx or eax, edx and eax, 15 jz alignedDot // unaligned mov eax, count shr eax, 2 shl eax, 4 add ecx, eax add edx, eax neg eax movups xmm0, [ecx+eax] movups xmm1, [edx+eax] mulps xmm0, xmm1 add eax, 16 jz doneDot loopUnalignedDot: movups xmm1, [ecx+eax] movups xmm2, [edx+eax] mulps xmm1, xmm2 addps xmm0, xmm1 add eax, 16 jl loopUnalignedDot jmp doneDot // aligned alignedDot: mov eax, count shr eax, 2 shl eax, 4 add ecx, eax add edx, eax neg eax movaps xmm0, [ecx+eax] movaps xmm1, [edx+eax] mulps xmm0, xmm1 add eax, 16 jz doneDot loopAlignedDot: movaps xmm1, [ecx+eax] movaps xmm2, [edx+eax] mulps xmm1, xmm2 addps xmm0, xmm1 add eax, 16 jl loopAlignedDot doneDot: } switch( count & 3 ) { case 1: __asm { movss xmm1, [ecx] movss xmm2, [edx] mulss xmm1, xmm2 addss xmm0, xmm1 } break; case 2: __asm { xorps xmm2, xmm2 movlps xmm1, [ecx] movlps xmm2, [edx] mulps xmm1, xmm2 addps xmm0, xmm1 } break; case 3: __asm { movss xmm1, [ecx] movhps xmm1, [ecx+4] movss xmm2, [edx] movhps xmm2, [edx+4] mulps xmm1, xmm2 addps xmm0, xmm1 } break; } __asm { movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm1 mov eax, dot movss [eax], xmm0 } return; } } // // cmpeqps == Equal // cmpneqps != Not Equal // cmpltps < Less Than // cmpnltps >= Not Less Than // cmpnleps > Not Less Or Equal // #define FLIP not al #define NOFLIP #define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \ int i, cnt, pre, post; \ float *aligned; \ \ /* if the float array is not aligned on a 4 byte boundary */ \ if ( ((UINT_PTR) SRC0) & 3 ) { \ /* unaligned memory access */ \ pre = 0; \ cnt = COUNT >> 2; \ post = COUNT - (cnt<<2); \ __asm mov edx, cnt \ __asm test edx, edx \ __asm je doneCmp \ __asm push ebx \ __asm neg edx \ __asm mov esi, SRC0 \ __asm prefetchnta [esi+64] \ __asm movss xmm1, CONSTANT \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ 
//
//	cmpeqps		==		Equal
//	cmpneqps	!=		Not Equal
//	cmpltps		<		Less Than
//	cmpnltps	>=		Not Less Than
//	cmpnleps	>		Not Less Or Equal
//
#define FLIP	not al
#define NOFLIP

#define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP )	\
	int i, cnt, pre, post;	\
	float *aligned;	\
	\
	/* if the float array is not aligned on a 4 byte boundary */	\
	if ( ((UINT_PTR) SRC0) & 3 ) {	\
		/* unaligned memory access */	\
		pre = 0;	\
		cnt = COUNT >> 2;	\
		post = COUNT - (cnt<<2);	\
		__asm	mov			edx, cnt	\
		__asm	test		edx, edx	\
		__asm	je			doneCmp	\
		__asm	push		ebx	\
		__asm	neg			edx	\
		__asm	mov			esi, SRC0	\
		__asm	prefetchnta	[esi+64]	\
		__asm	movss		xmm1, CONSTANT	\
		__asm	shufps		xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )	\
		__asm	mov			edi, DST	\
		__asm	mov			ecx, 0x01010101	\
		__asm	loopNA:	\
		__asm	movups		xmm0, [esi]	\
		__asm	prefetchnta	[esi+128]	\
		__asm	CMPSIMD		xmm0, xmm1	\
		__asm	movmskps	eax, xmm0	\
		__asm	DOFLIP	\
		__asm	mov			ah, al	\
		__asm	shr			ah, 1	\
		__asm	mov			bx, ax	\
		__asm	shl			ebx, 14	\
		__asm	mov			bx, ax	\
		__asm	and			ebx, ecx	\
		__asm	mov			dword ptr [edi], ebx	\
		__asm	add			esi, 16	\
		__asm	add			edi, 4	\
		__asm	inc			edx	\
		__asm	jl			loopNA	\
		__asm	pop			ebx	\
	}	\
	else {	\
		/* aligned memory access */	\
		aligned = (float *) ((((UINT_PTR) SRC0) + 15) & ~15);	\
		if ( (UINT_PTR)aligned > ((UINT_PTR)src0) + COUNT ) {	\
			pre = COUNT;	\
			post = 0;	\
		}	\
		else {	\
			pre = aligned - SRC0;	\
			cnt = (COUNT - pre) >> 2;	\
			post = COUNT - pre - (cnt<<2);	\
			__asm	mov			edx, cnt	\
			__asm	test		edx, edx	\
			__asm	je			doneCmp	\
			__asm	push		ebx	\
			__asm	neg			edx	\
			__asm	mov			esi, aligned	\
			__asm	prefetchnta	[esi+64]	\
			__asm	movss		xmm1, CONSTANT	\
			__asm	shufps		xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )	\
			__asm	mov			edi, DST	\
			__asm	add			edi, pre	\
			__asm	mov			ecx, 0x01010101	\
			__asm	loopA:	\
			__asm	movaps		xmm0, [esi]	\
			__asm	prefetchnta	[esi+128]	\
			__asm	CMPSIMD		xmm0, xmm1	\
			__asm	movmskps	eax, xmm0	\
			__asm	DOFLIP	\
			__asm	mov			ah, al	\
			__asm	shr			ah, 1	\
			__asm	mov			bx, ax	\
			__asm	shl			ebx, 14	\
			__asm	mov			bx, ax	\
			__asm	and			ebx, ecx	\
			__asm	mov			dword ptr [edi], ebx	\
			__asm	add			esi, 16	\
			__asm	add			edi, 4	\
			__asm	inc			edx	\
			__asm	jl			loopA	\
			__asm	pop			ebx	\
		}	\
	}	\
	doneCmp:	\
	double c = constant;	\
	for ( i = 0; i < pre; i++ ) {	\
		dst[i] = src0[i] CMP c;	\
	}	\
	for ( i = count - post; i < count; i++ ) {	\
		dst[i] = src0[i] CMP c;	\
	}
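/*
	Illustrative sketch, not in the original source: what the mask expansion in
	the macro above computes. movmskps leaves one comparison bit per lane in
	the low 4 bits of eax; the mov/shr/shl sequence spreads bit i into byte i
	so a single dword store writes four 0/1 byte results. The helper name and
	the use of unsigned int instead of the engine's dword typedef are
	assumptions.
*/
static inline unsigned int SpreadMask4To4Bytes( int mask ) {
	unsigned int ax = ( mask & 15 ) | ( ( ( mask >> 1 ) & 7 ) << 8 );	// mov ah, al; shr ah, 1
	unsigned int ebx = ( ( ax << 14 ) & 0xFFFF0000 ) | ax;				// shl ebx, 14; mov bx, ax
	return ebx & 0x01010101;	// and ebx, 0x01010101: byte i = lane i result
}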
#define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP )	\
	int i, cnt, pre, post;	\
	float *aligned;	\
	\
	/* if the float array is not aligned on a 4 byte boundary */	\
	if ( ((UINT_PTR) SRC0) & 3 ) {	\
		/* unaligned memory access */	\
		pre = 0;	\
		cnt = COUNT >> 2;	\
		post = COUNT - (cnt<<2);	\
		__asm	mov			edx, cnt	\
		__asm	test		edx, edx	\
		__asm	je			doneCmp	\
		__asm	push		ebx	\
		__asm	neg			edx	\
		__asm	mov			esi, SRC0	\
		__asm	prefetchnta	[esi+64]	\
		__asm	movss		xmm1, CONSTANT	\
		__asm	shufps		xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )	\
		__asm	mov			edi, DST	\
		__asm	mov			cl, bitNum	\
		__asm	loopNA:	\
		__asm	movups		xmm0, [esi]	\
		__asm	prefetchnta	[esi+128]	\
		__asm	CMPSIMD		xmm0, xmm1	\
		__asm	movmskps	eax, xmm0	\
		__asm	DOFLIP	\
		__asm	mov			ah, al	\
		__asm	shr			ah, 1	\
		__asm	mov			bx, ax	\
		__asm	shl			ebx, 14	\
		__asm	mov			bx, ax	\
		__asm	and			ebx, 0x01010101	\
		__asm	shl			ebx, cl	\
		__asm	or			ebx, dword ptr [edi]	\
		__asm	mov			dword ptr [edi], ebx	\
		__asm	add			esi, 16	\
		__asm	add			edi, 4	\
		__asm	inc			edx	\
		__asm	jl			loopNA	\
		__asm	pop			ebx	\
	}	\
	else {	\
		/* aligned memory access */	\
		aligned = (float *) ((((UINT_PTR) SRC0) + 15) & ~15);	\
		if ( (UINT_PTR)aligned > ((UINT_PTR)src0) + COUNT ) {	\
			pre = COUNT;	\
			post = 0;	\
		}	\
		else {	\
			pre = aligned - SRC0;	\
			cnt = (COUNT - pre) >> 2;	\
			post = COUNT - pre - (cnt<<2);	\
			__asm	mov			edx, cnt	\
			__asm	test		edx, edx	\
			__asm	je			doneCmp	\
			__asm	push		ebx	\
			__asm	neg			edx	\
			__asm	mov			esi, aligned	\
			__asm	prefetchnta	[esi+64]	\
			__asm	movss		xmm1, CONSTANT	\
			__asm	shufps		xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )	\
			__asm	mov			edi, DST	\
			__asm	add			edi, pre	\
			__asm	mov			cl, bitNum	\
			__asm	loopA:	\
			__asm	movaps		xmm0, [esi]	\
			__asm	prefetchnta	[esi+128]	\
			__asm	CMPSIMD		xmm0, xmm1	\
			__asm	movmskps	eax, xmm0	\
			__asm	DOFLIP	\
			__asm	mov			ah, al	\
			__asm	shr			ah, 1	\
			__asm	mov			bx, ax	\
			__asm	shl			ebx, 14	\
			__asm	mov			bx, ax	\
			__asm	and			ebx, 0x01010101	\
			__asm	shl			ebx, cl	\
			__asm	or			ebx, dword ptr [edi]	\
			__asm	mov			dword ptr [edi], ebx	\
			__asm	add			esi, 16	\
			__asm	add			edi, 4	\
			__asm	inc			edx	\
			__asm	jl			loopA	\
			__asm	pop			ebx	\
		}	\
	}	\
	doneCmp:	\
	float c = constant;	\
	for ( i = 0; i < pre; i++ ) {	\
		dst[i] |= ( src0[i] CMP c ) << BITNUM;	\
	}	\
	for ( i = count - post; i < count; i++ ) {	\
		dst[i] |= ( src0[i] CMP c ) << BITNUM;	\
	}

#define COMPARESETBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP )	\
	int i, cnt, pre, post;	\
	float *aligned;	\
	\
	/* if the float array is not aligned on a 4 byte boundary */	\
	if ( ((UINT_PTR) SRC0) & 3 ) {	\
		/* unaligned memory access */	\
		pre = 0;	\
		cnt = COUNT >> 2;	\
		post = COUNT - (cnt<<2);	\
		__asm	mov			edx, cnt	\
		__asm	test		edx, edx	\
		__asm	je			doneCmp	\
		__asm	push		ebx	\
		__asm	neg			edx	\
		__asm	mov			esi, SRC0	\
		__asm	prefetchnta	[esi+64]	\
		__asm	movss		xmm1, CONSTANT	\
		__asm	shufps		xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )	\
		__asm	mov			edi, DST	\
		__asm	mov			cl, bitNum	\
		__asm	loopNA:	\
		__asm	movups		xmm0, [esi]	\
		__asm	prefetchnta	[esi+128]	\
		__asm	CMPSIMD		xmm0, xmm1	\
		__asm	movmskps	eax, xmm0	\
		__asm	DOFLIP	\
		__asm	mov			ah, al	\
		__asm	shr			ah, 1	\
		__asm	mov			bx, ax	\
		__asm	shl			ebx, 14	\
		__asm	mov			bx, ax	\
		__asm	and			ebx, 0x01010101	\
		__asm	shl			ebx, cl	\
		__asm	mov			dword ptr [edi], ebx	\
		__asm	add			esi, 16	\
		__asm	add			edi, 4	\
		__asm	inc			edx	\
		__asm	jl			loopNA	\
		__asm	pop			ebx	\
	}	\
	else {	\
		/* aligned memory access */	\
		aligned = (float *) ((((UINT_PTR) SRC0) + 15) & ~15);	\
		if ( (UINT_PTR)aligned > ((UINT_PTR)src0) + COUNT ) {	\
			pre = COUNT;	\
			post = 0;	\
		}	\
		else {	\
			pre = aligned - SRC0;	\
			cnt = (COUNT - pre) >> 2;	\
			post = COUNT - pre - (cnt<<2);	\
			__asm	mov			edx, cnt	\
			__asm	test		edx, edx	\
			__asm	je			doneCmp	\
			__asm	push		ebx	\
			__asm	neg			edx	\
			__asm	mov			esi, aligned	\
			__asm	prefetchnta	[esi+64]	\
			__asm	movss		xmm1, CONSTANT	\
			__asm	shufps		xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )	\
			__asm	mov			edi, DST	\
			__asm	add			edi, pre	\
			__asm	mov			cl, bitNum	\
			__asm	loopA:	\
			__asm	movaps		xmm0, [esi]	\
			__asm	prefetchnta	[esi+128]	\
			__asm	CMPSIMD		xmm0, xmm1	\
			__asm	movmskps	eax, xmm0	\
			__asm	DOFLIP	\
			__asm	mov			ah, al	\
			__asm	shr			ah, 1	\
			__asm	mov			bx, ax	\
			__asm	shl			ebx, 14	\
			__asm	mov			bx, ax	\
			__asm	and			ebx, 0x01010101	\
			__asm	shl			ebx, cl	\
			__asm	mov			dword ptr [edi], ebx	\
			__asm	add			esi, 16	\
			__asm	add			edi, 4	\
			__asm	inc			edx	\
			__asm	jl			loopA	\
			__asm	pop			ebx	\
		}	\
	}	\
	doneCmp:	\
	float c = constant;	\
	for ( i = 0; i < pre; i++ ) {	\
		dst[i] = ( src0[i] CMP c ) << BITNUM;	\
	}	\
	for ( i = count - post; i < count; i++ ) {	\
		dst[i] = ( src0[i] CMP c ) << BITNUM;	\
	}
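/*
	Semantics sketch, not part of the original file: the three compare macros
	above only differ in how the per-element result reaches dst. For example,
	the bitNum overload of CmpGT defined below reduces to this scalar loop
	(hypothetical helper name): COMPARECONSTANT stores a 0/1 byte instead, and
	COMPARESETBITCONSTANT assigns the shifted bit rather than OR-ing it.
*/
static inline void CmpGT_ScalarReference( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] |= (byte)( ( src0[i] > constant ) << bitNum );
	}
}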
/*
============
idSIMD_SSE::CmpGT

  dst[i] = src0[i] > constant;
============
*/
#pragma warning( push )
#pragma warning( disable: 4740 )
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
	COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
}

/*
============
idSIMD_SSE::CmpGT

  dst[i] |= ( src0[i] > constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
}

/*
============
idSIMD_SSE::CmpGE

  dst[i] = src0[i] >= constant;
============
*/
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
	COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
}

/*
============
idSIMD_SSE::CmpGE

  dst[i] |= ( src0[i] >= constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
}

/*
============
idSIMD_SSE::CmpLT

  dst[i] = src0[i] < constant;
============
*/
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
	COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
}

/*
============
idSIMD_SSE::SetCmpLT

  dst[i] = ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::SetCmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	COMPARESETBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
}

/*
============
idSIMD_SSE::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
}

/*
============
idSIMD_SSE::CmpLE

  dst[i] = src0[i] <= constant;
============
*/
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
	COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
}

/*
============
idSIMD_SSE::CmpLE

  dst[i] |= ( src0[i] <= constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
}
#pragma warning( pop )

/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
	int i, pre, post;

	min = idMath::INFINITY;
	max = -idMath::INFINITY;

	__asm {
		push	ebx
		mov		eax, min
		mov		ebx, max
		movss	xmm0, [eax]
		movss	xmm1, [ebx]
		shufps	xmm0, xmm0, 0
		shufps	xmm1, xmm1, 0

		KFLOATINITS( src, count, pre, post )
		and		eax, 15
		jz		lpA
		jmp		lpNA
		align	16
	lpNA:
		movups	xmm2, [edx+ebx]
		movups	xmm3, [edx+ebx+16]
		minps	xmm0, xmm2
		maxps	xmm1, xmm2
		prefetchnta	[edx+ebx+64]
		minps	xmm0, xmm3
		maxps	xmm1, xmm3
		add		ebx, 16*2
		jl		lpNA
		jmp		done2
	lpA:
		movaps	xmm2, [edx+ebx]
		movaps	xmm3, [edx+ebx+16]
		minps	xmm0, xmm2
		maxps	xmm1, xmm2
		prefetchnta	[edx+ebx+64]
		minps	xmm0, xmm3
		maxps	xmm1, xmm3
		add		ebx, 16*2
		jl		lpA
		jmp		done2
		align	16
	done2:
		movaps	xmm2, xmm0
		movaps	xmm3, xmm1
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 0 )
		shufps	xmm3, xmm3, R_SHUFFLE_PS( 1, 2, 3, 0 )
		minss	xmm0, xmm2
		maxss	xmm1, xmm3
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 0 )
		shufps	xmm3, xmm3, R_SHUFFLE_PS( 1, 2, 3, 0 )
		minss	xmm0, xmm2
		maxss	xmm1, xmm3
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 0 )
		shufps	xmm3, xmm3, R_SHUFFLE_PS( 1, 2, 3, 0 )
		minss	xmm0, xmm2
		maxss	xmm1, xmm3
		mov		eax, min
		mov		ebx, max
		movss	[eax], xmm0
		movss	[ebx], xmm1
	done:
		pop		ebx
	}

	for ( i = 0; i < pre; i++ ) {
		float tmp = src[i];
		if ( tmp > max ) {
			max = tmp;
		}
		if ( tmp < min ) {
			min = tmp;
		}
	}
	for ( i = count - post; i < count; i++ ) {
		float tmp = src[i];
		if ( tmp > max ) {
			max = tmp;
		}
		if ( tmp < min ) {
			min = tmp;
		}
	}
}
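/*
	Reference sketch, not part of the original file: scalar MinMax under a
	hypothetical name. The SSE version above computes the same result; it
	keeps the running min/max in all four lanes of xmm0/xmm1 and reduces them
	with three shuffle/minss/maxss steps after the loop.
*/
static inline void MinMax_ScalarReference( float &min, float &max, const float *src, const int count ) {
	min = idMath::INFINITY;
	max = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		if ( src[i] < min ) { min = src[i]; }
		if ( src[i] > max ) { max = src[i]; }
	}
}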
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
	__asm {
		mov		eax, count
		test	eax, eax
		movss	xmm0, SIMD_SP_infinity
		xorps	xmm1, xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
		subps	xmm1, xmm0
		jz		done
		mov		esi, src
		test	eax, 1
		jz		startLoop
		movlps	xmm2, [esi]
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 )
		add		esi, 2*4
		dec		eax
		minps	xmm0, xmm2
		maxps	xmm1, xmm2
		jz		done
	startLoop:
		shl		eax, 3
		add		esi, eax
		neg		eax
	loopVert:
		movlps	xmm2, [esi+eax]
		movhps	xmm2, [esi+eax+8]
		add		eax, 4*4
		minps	xmm0, xmm2
		maxps	xmm1, xmm2
		jl		loopVert
	done:
		movaps	xmm2, xmm0
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 2, 3, 0, 1 )
		minps	xmm0, xmm2
		mov		esi, min
		movlps	[esi], xmm0
		movaps	xmm3, xmm1
		shufps	xmm3, xmm3, R_SHUFFLE_PS( 2, 3, 0, 1 )
		maxps	xmm1, xmm3
		mov		edi, max
		movlps	[edi], xmm1
	}
}

/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
	__asm {
		movss	xmm0, SIMD_SP_infinity
		xorps	xmm1, xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
		subps	xmm1, xmm0
		movaps	xmm2, xmm0
		movaps	xmm3, xmm1
		mov		esi, src
		mov		eax, count
		and		eax, ~3
		jz		done4
		imul	eax, 12
		add		esi, eax
		neg		eax
	loop4:
		// even vertices load as ( z, -, x, y ), odd ones as ( x, -, y, z );
		// the two accumulator pairs are realigned and merged at done1
		movss	xmm4, [esi+eax+0*12+8]
		movhps	xmm4, [esi+eax+0*12+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		movss	xmm5, [esi+eax+1*12+0]
		movhps	xmm5, [esi+eax+1*12+4]
		minps	xmm2, xmm5
		maxps	xmm3, xmm5
		movss	xmm6, [esi+eax+2*12+8]
		movhps	xmm6, [esi+eax+2*12+0]
		minps	xmm0, xmm6
		maxps	xmm1, xmm6
		movss	xmm7, [esi+eax+3*12+0]
		movhps	xmm7, [esi+eax+3*12+4]
		minps	xmm2, xmm7
		maxps	xmm3, xmm7
		add		eax, 4*12
		jl		loop4
	done4:
		mov		eax, count
		and		eax, 3
		jz		done1
		imul	eax, 12
		add		esi, eax
		neg		eax
	loop1:
		movss	xmm4, [esi+eax+0*12+8]
		movhps	xmm4, [esi+eax+0*12+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		add		eax, 12
		jl		loop1
	done1:
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 3, 1, 0, 2 )
		shufps	xmm3, xmm3, R_SHUFFLE_PS( 3, 1, 0, 2 )
		minps	xmm0, xmm2
		maxps	xmm1, xmm3
		mov		esi, min
		movhps	[esi], xmm0
		movss	[esi+8], xmm0
		mov		edi, max
		movhps	[edi], xmm1
		movss	[edi+8], xmm1
	}
}

/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
	__asm {
		movss	xmm0, SIMD_SP_infinity
		xorps	xmm1, xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
		subps	xmm1, xmm0
		movaps	xmm2, xmm0
		movaps	xmm3, xmm1
		mov		esi, src
		mov		eax, count
		and		eax, ~3
		jz		done4
		imul	eax, DRAWVERT_SIZE
		add		esi, eax
		neg		eax
	loop4:
		movss	xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		movss	xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm2, xmm5
		maxps	xmm3, xmm5
		movss	xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm6
		maxps	xmm1, xmm6
		movss	xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm2, xmm7
		maxps	xmm3, xmm7
		add		eax, 4*DRAWVERT_SIZE
		jl		loop4
	done4:
		mov		eax, count
		and		eax, 3
		jz		done1
		imul	eax, DRAWVERT_SIZE
		add		esi, eax
		neg		eax
	loop1:
		movss	xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		add		eax, DRAWVERT_SIZE
		jl		loop1
	done1:
		minps	xmm0, xmm2
		maxps	xmm1, xmm3
		mov		esi, min
		movhps	[esi], xmm0
		movss	[esi+8], xmm0
		mov		edi, max
		movhps	[edi], xmm1
		movss	[edi+8], xmm1
	}
}
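/*
	Reference sketch, not part of the original file: the scalar bounds
	computation the vector MinMax overloads above implement, under a
	hypothetical name. The SSE versions run two independent min/max
	accumulator pairs (xmm0/xmm1 and xmm2/xmm3) to hide latency and merge
	them after the loop; min/max are order-independent, so the split does not
	change the result.
*/
static inline void MinMax3_ScalarReference( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY;
	max[0] = max[1] = max[2] = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		for ( int j = 0; j < 3; j++ ) {
			if ( src[i][j] < min[j] ) { min[j] = src[i][j]; }
			if ( src[i][j] > max[j] ) { max[j] = src[i][j]; }
		}
	}
}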
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const vertIndex_t *indexes, const int count ) {
#if defined( GL_INDEX_SHORT )
	assert_sizeof( vertIndex_t, 2 );

	__asm {
		movss	xmm0, SIMD_SP_infinity
		xorps	xmm1, xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
		subps	xmm1, xmm0
		movaps	xmm2, xmm0
		movaps	xmm3, xmm1
		mov		edi, indexes
		mov		esi, src
		mov		eax, count
		and		eax, ~3
		jz		done4
		shl		eax, 1
		add		edi, eax
		neg		eax
	loop4:
		movzx	edx, word ptr [edi+eax+0]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		movzx	edx, word ptr [edi+eax+2]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm2, xmm5
		maxps	xmm3, xmm5
		movzx	edx, word ptr [edi+eax+4]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm6
		maxps	xmm1, xmm6
		movzx	edx, word ptr [edi+eax+6]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm2, xmm7
		maxps	xmm3, xmm7
		add		eax, 4*2
		jl		loop4
	done4:
		mov		eax, count
		and		eax, 3
		jz		done1
		shl		eax, 1
		add		edi, eax
		neg		eax
	loop1:
		movzx	edx, word ptr [edi+eax+0]
		imul	edx, DRAWVERT_SIZE
		movss	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		add		eax, 2
		jl		loop1
	done1:
		minps	xmm0, xmm2
		maxps	xmm1, xmm3
		mov		esi, min
		movhps	[esi], xmm0
		movss	[esi+8], xmm0
		mov		edi, max
		movhps	[edi], xmm1
		movss	[edi+8], xmm1
	}
#elif defined( GL_INDEX_INT )
	assert_sizeof( vertIndex_t, 4 );

	__asm {
		movss	xmm0, SIMD_SP_infinity
		xorps	xmm1, xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
		subps	xmm1, xmm0
		movaps	xmm2, xmm0
		movaps	xmm3, xmm1
		mov		edi, indexes
		mov		esi, src
		mov		eax, count
		and		eax, ~3
		jz		done4
		shl		eax, 2
		add		edi, eax
		neg		eax
	loop4:
		mov		edx, dword ptr [edi+eax+0]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		mov		edx, dword ptr [edi+eax+4]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm2, xmm5
		maxps	xmm3, xmm5
		mov		edx, dword ptr [edi+eax+8]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm6
		maxps	xmm1, xmm6
		mov		edx, dword ptr [edi+eax+12]
		shl		edx, DRAWVERT_SIZE_SHIFT
		movss	xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm2, xmm7
		maxps	xmm3, xmm7
		add		eax, 4*4
		jl		loop4
	done4:
		mov		eax, count
		and		eax, 3
		jz		done1
		shl		eax, 2
		add		edi, eax
		neg		eax
	loop1:
		mov		edx, [edi+eax+0]
		imul	edx, DRAWVERT_SIZE
		movss	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps	xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		add		eax, 4
		jl		loop1
	done1:
		minps	xmm0, xmm2
		maxps	xmm1, xmm3
		mov		esi, min
		movhps	[esi], xmm0
		movss	[esi+8], xmm0
		mov		edi, max
		movhps	[edi], xmm1
		movss	[edi+8], xmm1
	}
#else
	idSIMD_Generic::MinMax( min, max, src, indexes, count );
#endif
}
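/*
	Note on the indexed paths above: the unrolled loops turn a vertex index
	into a byte offset with "shl edx, DRAWVERT_SIZE_SHIFT", which is only
	valid while sizeof( idDrawVert ) is exactly 1 << DRAWVERT_SIZE_SHIFT; the
	remainder loops use imul and do not depend on that. A compile-time guard
	in the style already used above would catch a stride change (sketch only,
	not in the original):

		assert_sizeof( idDrawVert, 1 << DRAWVERT_SIZE_SHIFT );
*/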
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const shadowCache_t *src, const int count ) {
	__asm {
		movss	xmm0, SIMD_SP_infinity
		xorps	xmm1, xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
		subps	xmm1, xmm0
		movaps	xmm2, xmm0
		movaps	xmm3, xmm1
		mov		esi, src
		mov		eax, count
		and		eax, ~3
		jz		done4
		imul	eax, SHADOWVERT_SIZE
		add		esi, eax
		neg		eax
	loop4:
		movss	xmm4, [esi+eax+0*SHADOWVERT_SIZE+8]
		movhps	xmm4, [esi+eax+0*SHADOWVERT_SIZE+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		movss	xmm5, [esi+eax+1*SHADOWVERT_SIZE+8]
		movhps	xmm5, [esi+eax+1*SHADOWVERT_SIZE+0]
		minps	xmm2, xmm5
		maxps	xmm3, xmm5
		movss	xmm6, [esi+eax+2*SHADOWVERT_SIZE+8]
		movhps	xmm6, [esi+eax+2*SHADOWVERT_SIZE+0]
		minps	xmm0, xmm6
		maxps	xmm1, xmm6
		movss	xmm7, [esi+eax+3*SHADOWVERT_SIZE+8]
		movhps	xmm7, [esi+eax+3*SHADOWVERT_SIZE+0]
		minps	xmm2, xmm7
		maxps	xmm3, xmm7
		add		eax, 4*SHADOWVERT_SIZE
		jl		loop4
	done4:
		mov		eax, count
		and		eax, 3
		jz		done1
		imul	eax, SHADOWVERT_SIZE
		add		esi, eax
		neg		eax
	loop1:
		movss	xmm4, [esi+eax+0*SHADOWVERT_SIZE+8]
		movhps	xmm4, [esi+eax+0*SHADOWVERT_SIZE+0]
		minps	xmm0, xmm4
		maxps	xmm1, xmm4
		add		eax, SHADOWVERT_SIZE
		jl		loop1
	done1:
		minps	xmm0, xmm2
		maxps	xmm1, xmm3
		mov		esi, min
		movhps	[esi], xmm0
		movss	[esi+8], xmm0
		mov		edi, max
		movhps	[edi], xmm1
		movss	[edi+8], xmm1
	}
}

/*
============
idSIMD_SSE::Clamp
============
*/
void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
	int i, pre, post;

	__asm {
		movss	xmm0, min
		movss	xmm1, max
		shufps	xmm0, xmm0, 0
		shufps	xmm1, xmm1, 0

		KFLOATINITDS( dst, src, count, pre, post )
		and		eax, 15
		jne		lpNA
		jmp		lpA
		align	16
	lpA:
		movaps	xmm2, [edx+ebx]
		movaps	xmm3, [edx+ebx+16]
		maxps	xmm2, xmm0
		maxps	xmm3, xmm0
		prefetchnta	[edx+ebx+64]
		minps	xmm2, xmm1
		minps	xmm3, xmm1
		movaps	[edi+ebx], xmm2
		movaps	[edi+ebx+16], xmm3
		add		ebx, 16*2
		jl		lpA
		jmp		done
		align	16
	lpNA:
		movups	xmm2, [edx+ebx]
		movups	xmm3, [edx+ebx+16]
		maxps	xmm2, xmm0
		maxps	xmm3, xmm0
		prefetchnta	[edx+ebx+64]
		minps	xmm2, xmm1
		minps	xmm3, xmm1
		movaps	[edi+ebx], xmm2
		movaps	[edi+ebx+16], xmm3
		add		ebx, 16*2
		jl		lpNA
	done:
	}

	for ( i = 0; i < pre; i++ ) {
		if ( src[i] < min )
			dst[i] = min;
		else if ( src[i] > max )
			dst[i] = max;
		else
			dst[i] = src[i];
	}
	for ( i = count - post; i < count; i++ ) {
		if ( src[i] < min )
			dst[i] = min;
		else if ( src[i] > max )
			dst[i] = max;
		else
			dst[i] = src[i];
	}
}

/*
============
idSIMD_SSE::ClampMin
============
*/
void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
	int i, pre, post;

	__asm {
		movss	xmm0, min
		shufps	xmm0, xmm0, 0

		KFLOATINITDS( dst, src, count, pre, post )
		and		eax, 15
		jne		lpNA
		jmp		lpA
		align	16
	lpA:
		movaps	xmm2, [edx+ebx]
		movaps	xmm3, [edx+ebx+16]
		maxps	xmm2, xmm0
		prefetchnta	[edx+ebx+64]
		maxps	xmm3, xmm0
		movaps	[edi+ebx], xmm2
		movaps	[edi+ebx+16], xmm3
		add		ebx, 16*2
		jl		lpA
		jmp		done
		align	16
	lpNA:
		movups	xmm2, [edx+ebx]
		movups	xmm3, [edx+ebx+16]
		maxps	xmm2, xmm0
		prefetchnta	[edx+ebx+64]
		maxps	xmm3, xmm0
		movaps	[edi+ebx], xmm2
		movaps	[edi+ebx+16], xmm3
		add		ebx, 16*2
		jl		lpNA
	done:
	}

	for ( i = 0; i < pre; i++ ) {
		if ( src[i] < min )
			dst[i] = min;
		else
			dst[i] = src[i];
	}
	for ( i = count - post; i < count; i++ ) {
		if ( src[i] < min )
			dst[i] = min;
		else
			dst[i] = src[i];
	}
}

/*
============
idSIMD_SSE::ClampMax
============
*/
void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
	int i, pre, post;

	__asm {
		movss	xmm1, max
		shufps	xmm1, xmm1, 0

		KFLOATINITDS( dst, src, count, pre, post )
		and		eax, 15
		jne		lpNA
		jmp		lpA
		align	16
	lpA:
		movaps	xmm2, [edx+ebx]
		movaps	xmm3, [edx+ebx+16]
		minps	xmm2, xmm1
		prefetchnta	[edx+ebx+64]
		minps	xmm3, xmm1
		movaps	[edi+ebx], xmm2
		movaps	[edi+ebx+16], xmm3
		add		ebx, 16*2
		jl		lpA
		jmp		done
		align	16
	lpNA:
		movups	xmm2, [edx+ebx]
		movups	xmm3, [edx+ebx+16]
		minps	xmm2, xmm1
		prefetchnta	[edx+ebx+64]
		minps	xmm3, xmm1
		movaps	[edi+ebx], xmm2
		movaps	[edi+ebx+16], xmm3
		add		ebx, 16*2
		jl		lpNA
	done:
	}

	for ( i = 0; i < pre; i++ ) {
		if ( src[i] > max )
			dst[i] = max;
		else
			dst[i] = src[i];
	}
	for ( i = count - post; i < count; i++ ) {
		if ( src[i] > max )
			dst[i] = max;
		else
			dst[i] = src[i];
	}
}
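/*
	Reference sketch, not part of the original file: the scalar clamp the
	functions above implement, under a hypothetical name. The SSE loops apply
	maxps against the lower bound first and then minps against the upper
	bound, i.e. dst[i] = min( max( src[i], lo ), hi ), which matches this for
	lo <= hi.
*/
static inline void Clamp_ScalarReference( float *dst, const float *src, const float lo, const float hi, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		float f = src[i];
		dst[i] = ( f < lo ) ? lo : ( f > hi ) ? hi : f;
	}
}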
/*
============
idSIMD_SSE::Zero16
============
*/
void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
	__asm {
		mov		edx, dst
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneZero16
		shl		eax, 4
		add		edx, eax
		neg		eax
		xorps	xmm0, xmm0
	loopZero16:
		movaps	[edx+eax], xmm0
		add		eax, 16
		jl		loopZero16
	doneZero16:
	}
}

/*
============
idSIMD_SSE::Negate16
============
*/
void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
	__asm {
		mov		edx, dst
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneNegate16
		shl		eax, 4
		add		edx, eax
		neg		eax
		movss	xmm0, SIMD_SP_signBit
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
	loopNegate16:
		movaps	xmm1, [edx+eax]
		xorps	xmm1, xmm0
		movaps	[edx+eax], xmm1
		add		eax, 16
		jl		loopNegate16
	doneNegate16:
	}
}

/*
============
idSIMD_SSE::Copy16
============
*/
void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
	__asm {
		mov		ecx, src
		mov		edx, dst
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneCopy16
		shl		eax, 4
		add		ecx, eax
		add		edx, eax
		neg		eax
	loopCopy16:
		movaps	xmm0, [ecx+eax]
		movaps	[edx+eax], xmm0
		add		eax, 16
		jl		loopCopy16
	doneCopy16:
	}
}

/*
============
idSIMD_SSE::Add16
============
*/
void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
	__asm {
		mov		ecx, src1
		mov		edx, src2
		mov		esi, dst
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneAdd16
		shl		eax, 4
		add		esi, eax
		add		ecx, eax
		add		edx, eax
		neg		eax
	loopAdd16:
		movaps	xmm0, [ecx+eax]
		addps	xmm0, [edx+eax]
		movaps	[esi+eax], xmm0
		add		eax, 16
		jl		loopAdd16
	doneAdd16:
	}
}

/*
============
idSIMD_SSE::Sub16
============
*/
void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
	__asm {
		mov		ecx, src1
		mov		edx, src2
		mov		esi, dst
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneSub16
		shl		eax, 4
		add		esi, eax
		add		ecx, eax
		add		edx, eax
		neg		eax
	loopSub16:
		movaps	xmm0, [ecx+eax]
		subps	xmm0, [edx+eax]
		movaps	[esi+eax], xmm0
		add		eax, 16
		jl		loopSub16
	doneSub16:
	}
}

/*
============
idSIMD_SSE::Mul16
============
*/
void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
	__asm {
		mov		ecx, dst
		mov		edx, src1
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneMulScalar16
		movss	xmm1, constant
		shl		eax, 4
		add		ecx, eax
		add		edx, eax
		neg		eax
		shufps	xmm1, xmm1, 0x00
	loopMulScalar16:
		movaps	xmm0, [edx+eax]
		mulps	xmm0, xmm1
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopMulScalar16
	doneMulScalar16:
	}
}

/*
============
idSIMD_SSE::AddAssign16
============
*/
void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
	__asm {
		mov		ecx, dst
		mov		edx, src
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneAddAssign16
		shl		eax, 4
		add		ecx, eax
		add		edx, eax
		neg		eax
	loopAddAssign16:
		movaps	xmm0, [ecx+eax]
		addps	xmm0, [edx+eax]
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopAddAssign16
	doneAddAssign16:
	}
}

/*
============
idSIMD_SSE::SubAssign16
============
*/
void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
	__asm {
		mov		ecx, dst
		mov		edx, src
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneSubAssign16
		shl		eax, 4
		add		ecx, eax
		add		edx, eax
		neg		eax
	loopSubAssign16:
		movaps	xmm0, [ecx+eax]
		subps	xmm0, [edx+eax]
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopSubAssign16
	doneSubAssign16:
	}
}

/*
============
idSIMD_SSE::MulAssign16
============
*/
void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
	__asm {
		mov		ecx, dst
		mov		eax, count
		add		eax, 3
		shr		eax, 2
		jz		doneMulAssign16
		movss	xmm1, constant
		shl		eax, 4
		add		ecx, eax
		neg		eax
		shufps	xmm1, xmm1, 0x00
	loopMulAssign16:
		movaps	xmm0, [ecx+eax]
		mulps	xmm0, xmm1
		movaps	[ecx+eax], xmm0
		add		eax, 16
		jl		loopMulAssign16
	doneMulAssign16:
	}
}

/*
============
idSIMD_SSE::MatX_MultiplyVecX

	optimizes the following matrix multiplications:

	NxN * Nx1
	Nx6 * 6x1
	6xN * Nx1

	with N in the range [1-6]
============ */ void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss [eax+offset], reg1 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps [eax+offset], reg1 #define STORE4( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 \ __asm movhps [eax+offset+8], reg1 #define STOREC = int numRows; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumColumns() ); assert( dst.GetSize() >= mat.GetNumRows() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); switch( mat.GetNumColumns() ) { case 1: { switch( numRows ) { case 1: { // 1x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] STORE1( 0, xmm0, xmm1 ) } return; } case 6: { // 6x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0]; mPtr++; } return; } } break; } case 2: { switch( numRows ) { case 2: { // 2x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm1, [esi+4] movss xmm2, [edi] mulss xmm2, xmm0 movss xmm3, [edi+4] mulss xmm3, xmm1 addss xmm2, xmm3 STORE1( 0, xmm2, xmm4 ) mulss xmm0, [edi+8] mulss xmm1, [edi+8+4] addss xmm0, xmm1 STORE1( 4, xmm0, xmm4 ) } return; } case 6: { // 6x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, [esi] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movaps xmm0, [edi] mulps xmm0, xmm7 movaps xmm1, [edi+16] mulps xmm1, xmm7 movaps xmm2, xmm0 shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm2, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) movaps xmm3, [edi+32] addps xmm0, xmm2 mulps xmm3, xmm7 STORE4( 0, xmm0, xmm4 ) shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm1, xmm3 addps xmm3, xmm1 STORE2LO( 16, xmm3, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; mPtr += 2; } return; } } break; } case 3: { switch( numRows ) { case 3: { // 3x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm4, [edi] mulss xmm4, xmm0 movss xmm1, [esi+4] movss xmm5, [edi+4] mulss xmm5, xmm1 addss xmm4, xmm5 movss xmm2, [esi+8] movss xmm6, [edi+8] mulss xmm6, xmm2 addss xmm4, xmm6 movss xmm3, [edi+12] mulss xmm3, xmm0 STORE1( 0, xmm4, xmm7 ); movss xmm5, [edi+12+4] mulss xmm5, xmm1 addss xmm3, xmm5 movss xmm6, [edi+12+8] mulss xmm6, xmm2 addss xmm3, xmm6 mulss xmm0, [edi+24] mulss xmm1, [edi+24+4] STORE1( 4, xmm3, xmm7 ); addss xmm0, xmm1 mulss xmm2, [edi+24+8] addss xmm0, xmm2 STORE1( 8, xmm0, xmm7 ); } return; } case 6: { // 6x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm5, [esi] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [esi+4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 movlps xmm1, [edi+4*4] shufps xmm1, xmm0, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 movlps xmm2, [edi+6*4] movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 mulps xmm0, xmm5 movlps xmm3, [edi+10*4] shufps xmm2, xmm3, 
R_SHUFFLE_PS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 movaps xmm3, xmm1 shufps xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 mulps xmm1, xmm6 shufps xmm3, xmm2, R_SHUFFLE_PS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 mulps xmm3, xmm7 addps xmm0, xmm1 addps xmm0, xmm3 STORE4( 0, xmm0, xmm4 ) movss xmm1, [edi+12*4] mulss xmm1, xmm5 movss xmm2, [edi+13*4] mulss xmm2, xmm6 movss xmm3, [edi+14*4] mulss xmm3, xmm7 addss xmm1, xmm2 addss xmm1, xmm3 STORE1( 16, xmm1, xmm4 ) mulss xmm5, [edi+15*4] mulss xmm6, [edi+16*4] mulss xmm7, [edi+17*4] addss xmm5, xmm6 addss xmm5, xmm7 STORE1( 20, xmm5, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; mPtr += 3; } return; } } break; } case 4: { switch( numRows ) { case 4: { // 4x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi ] movlps xmm0, qword ptr [edi ] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm4 STORE4( 0, xmm0, xmm2 ) } return; } case 6: { // 6x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi+ 0] movlps xmm0, qword ptr [edi+ 0] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm4 movlps xmm1, qword ptr [edi+64] movhps xmm1, qword ptr [edi+80] STORE4( 0, xmm0, xmm4 ) mulps xmm1, xmm6 movlps xmm2, qword ptr [edi+72] movhps xmm2, qword ptr [edi+88] mulps xmm2, xmm7 addps xmm1, xmm2 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm3, xmm1 addps xmm1, xmm3 STORE2LO( 16, xmm1, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; mPtr += 4; } return; } } break; } case 5: { switch( numRows ) { case 5: { // 5x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 shufps xmm0, xmm5, R_SHUFFLE_PS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1 movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11 movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 shufps xmm1, xmm5, R_SHUFFLE_PS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3 movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13 movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 shufps xmm2, xmm5, R_SHUFFLE_PS( 2, 1, 2, 1 ) // xmm2 = 
2, 7, 12, 17 movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3 movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13 movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 shufps xmm3, xmm5, R_SHUFFLE_PS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15 shufps xmm4, xmm5, R_SHUFFLE_PS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 movss xmm7, [esi+0*4] shufps xmm7, xmm7, 0 mulps xmm0, xmm7 movss xmm5, [esi+1*4] shufps xmm5, xmm5, 0 mulps xmm1, xmm5 addps xmm0, xmm1 movss xmm6, [esi+2*4] shufps xmm6, xmm6, 0 mulps xmm2, xmm6 addps xmm0, xmm2 movss xmm1, [esi+3*4] shufps xmm1, xmm1, 0 mulps xmm3, xmm1 addps xmm0, xmm3 movss xmm2, [esi+4*4] shufps xmm2, xmm2, 0 mulps xmm4, xmm2 addps xmm0, xmm4 mulss xmm7, [edi+20*4] mulss xmm5, [edi+21*4] addps xmm7, xmm5 mulss xmm6, [edi+22*4] addps xmm7, xmm6 mulss xmm1, [edi+23*4] addps xmm7, xmm1 mulss xmm2, [edi+24*4] addps xmm7, xmm2 STORE4( 0, xmm0, xmm3 ) STORE1( 16, xmm7, xmm4 ) } return; } case 6: { // 6x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [esi] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movlps xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movlps xmm0, [edi] movhps xmm3, [edi+8] movaps xmm1, [edi+16] movlps xmm2, [edi+32] shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 shufps xmm3, xmm1, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 mulps xmm0, xmm6 mulps xmm3, xmm7 movlps xmm2, [edi+40] addps xmm0, xmm3 // xmm0 + xmm1 movhps xmm5, [edi+40+8] movlps xmm3, [edi+40+16] movhps xmm3, [edi+40+24] movlps xmm4, [edi+40+32] shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 shufps xmm3, xmm4, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 shufps xmm5, xmm3, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 mulps xmm2, xmm6 mulps xmm5, xmm7 addps xmm2, xmm5 // xmm2 + xmm3 movss xmm5, [esi+16] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm4, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm4, xmm2, R_SHUFFLE_PS( 1, 3, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLE_PS( 0, 3, 0, 3 ) addps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 STORE4( 0, xmm0, xmm2 ) movlps xmm4, [edi+80] movhps xmm3, [edi+80+8] movaps xmm1, [edi+80+16] movlps xmm2, [edi+80+32] shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 shufps xmm3, xmm1, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 mulps xmm4, xmm6 mulps xmm3, xmm7 mulps xmm1, xmm5 addps xmm4, xmm3 // xmm4 + xmm1 shufps xmm1, xmm4, R_SHUFFLE_PS( 0, 3, 0, 2 ) shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 3, 0, 0 ) addps xmm4, xmm1 shufps xmm1, xmm1, R_SHUFFLE_PS( 2, 3, 0, 1 ) addps xmm4, xmm1 STORE2LO( 16, xmm4, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; mPtr += 5; } return; } } break; } case 6: { switch( numRows ) { case 1: { // 1x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] movss xmm1, [esi+4] mulss xmm1, [edi+4] movss xmm2, [esi+8] addss xmm0, xmm1 mulss xmm2, [edi+8] movss xmm3, [esi+12] addss xmm0, xmm2 mulss xmm3, [edi+12] movss xmm4, [esi+16] addss xmm0, xmm3 mulss xmm4, [edi+16] movss xmm5, [esi+20] addss xmm0, xmm4 mulss xmm5, [edi+20] movss xmm6, [esi+24] addss xmm0, xmm5 mulss xmm6, [edi+24] addss xmm0, xmm6 STORE1( 0, xmm0, 
xmm7 ) } return; } case 2: { // 2x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) } return; } case 3: { // 3x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) // row 2 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm1 STORE1( 8, xmm0, xmm3 ) } return; } case 4: { // 4x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm4 ) } return; } case 5: { // 5x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm3 ) // row 5 movaps xmm0, [edi+96] movaps xmm1, [edi+96+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, 0x01 addss xmm0, xmm1 STORE1( 16, xmm0, xmm3 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, 
vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, qword ptr [esi] movlps xmm6, qword ptr [esi+8] shufps xmm7, xmm7, 0x44 shufps xmm6, xmm6, 0x44 movlps xmm0, qword ptr [edi ] movhps xmm0, qword ptr [edi+ 24] mulps xmm0, xmm7 movlps xmm3, qword ptr [edi+ 8] movhps xmm3, qword ptr [edi+ 32] mulps xmm3, xmm6 movlps xmm1, qword ptr [edi+ 48] movhps xmm1, qword ptr [edi+ 72] mulps xmm1, xmm7 movlps xmm2, qword ptr [edi+ 96] movhps xmm2, qword ptr [edi+120] mulps xmm2, xmm7 movlps xmm4, qword ptr [edi+ 56] movhps xmm4, qword ptr [edi+ 80] movlps xmm5, qword ptr [edi+104] movhps xmm5, qword ptr [edi+128] mulps xmm4, xmm6 movlps xmm7, qword ptr [esi+16] addps xmm0, xmm3 shufps xmm7, xmm7, 0x44 mulps xmm5, xmm6 addps xmm1, xmm4 movlps xmm3, qword ptr [edi+ 16] movhps xmm3, qword ptr [edi+ 40] addps xmm2, xmm5 movlps xmm4, qword ptr [edi+ 64] movhps xmm4, qword ptr [edi+ 88] mulps xmm3, xmm7 movlps xmm5, qword ptr [edi+112] movhps xmm5, qword ptr [edi+136] addps xmm0, xmm3 mulps xmm4, xmm7 mulps xmm5, xmm7 addps xmm1, xmm4 addps xmm2, xmm5 movaps xmm6, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm6, xmm1, 0xDD movaps xmm7, xmm2 shufps xmm7, xmm2, 0x88 shufps xmm2, xmm2, 0xDD addps xmm0, xmm6 addps xmm2, xmm7 STORE4( 0, xmm0, xmm3 ) STORE2LO( 16, xmm2, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; mPtr += 6; } return; } } break; } default: { int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numColumns; j++ ) { sum += mPtr[j] * vPtr[j]; } dstPtr[i] STOREC sum; mPtr += numColumns; } break; } } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_MultiplyAddVecX optimizes the following matrix multiplications: NxN * Nx1 Nx6 * 6x1 6xN * Nx1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ __asm addss reg2, reg1 \ __asm movss [eax+offset], reg2 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movhps [eax+offset], reg2 #define STORE4( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm movhps reg2, [eax+offset+8] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 \ __asm movhps [eax+offset+8], reg2 #define STOREC += int numRows; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumColumns() ); assert( dst.GetSize() >= mat.GetNumRows() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); switch( mat.GetNumColumns() ) { case 1: { switch( numRows ) { case 1: { // 1x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] STORE1( 0, xmm0, xmm1 ) } return; } case 6: { // 6x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0]; mPtr++; } return; } } break; } case 2: { switch( numRows ) 
{ case 2: { // 2x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm1, [esi+4] movss xmm2, [edi] mulss xmm2, xmm0 movss xmm3, [edi+4] mulss xmm3, xmm1 addss xmm2, xmm3 STORE1( 0, xmm2, xmm4 ) mulss xmm0, [edi+8] mulss xmm1, [edi+8+4] addss xmm0, xmm1 STORE1( 4, xmm0, xmm4 ) } return; } case 6: { // 6x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, [esi] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movaps xmm0, [edi] mulps xmm0, xmm7 movaps xmm1, [edi+16] mulps xmm1, xmm7 movaps xmm2, xmm0 shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm2, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) movaps xmm3, [edi+32] addps xmm0, xmm2 mulps xmm3, xmm7 STORE4( 0, xmm0, xmm4 ) shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm1, xmm3 addps xmm3, xmm1 STORE2LO( 16, xmm3, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; mPtr += 2; } return; } } break; } case 3: { switch( numRows ) { case 3: { // 3x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm4, [edi] mulss xmm4, xmm0 movss xmm1, [esi+4] movss xmm5, [edi+4] mulss xmm5, xmm1 addss xmm4, xmm5 movss xmm2, [esi+8] movss xmm6, [edi+8] mulss xmm6, xmm2 addss xmm4, xmm6 movss xmm3, [edi+12] mulss xmm3, xmm0 STORE1( 0, xmm4, xmm7 ); movss xmm5, [edi+12+4] mulss xmm5, xmm1 addss xmm3, xmm5 movss xmm6, [edi+12+8] mulss xmm6, xmm2 addss xmm3, xmm6 mulss xmm0, [edi+24] mulss xmm1, [edi+24+4] STORE1( 4, xmm3, xmm7 ); addss xmm0, xmm1 mulss xmm2, [edi+24+8] addss xmm0, xmm2 STORE1( 8, xmm0, xmm7 ); } return; } case 6: { // 6x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm5, [esi] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [esi+4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 movlps xmm1, [edi+4*4] shufps xmm1, xmm0, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 movlps xmm2, [edi+6*4] movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 mulps xmm0, xmm5 movlps xmm3, [edi+10*4] shufps xmm2, xmm3, R_SHUFFLE_PS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 movaps xmm3, xmm1 shufps xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 mulps xmm1, xmm6 shufps xmm3, xmm2, R_SHUFFLE_PS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 mulps xmm3, xmm7 addps xmm0, xmm1 addps xmm0, xmm3 STORE4( 0, xmm0, xmm4 ) movss xmm1, [edi+12*4] mulss xmm1, xmm5 movss xmm2, [edi+13*4] mulss xmm2, xmm6 movss xmm3, [edi+14*4] mulss xmm3, xmm7 addss xmm1, xmm2 addss xmm1, xmm3 STORE1( 16, xmm1, xmm4 ) mulss xmm5, [edi+15*4] mulss xmm6, [edi+16*4] mulss xmm7, [edi+17*4] addss xmm5, xmm6 addss xmm5, xmm7 STORE1( 20, xmm5, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; mPtr += 3; } return; } } break; } case 4: { switch( numRows ) { case 4: { // 4x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi ] movlps xmm0, qword ptr [edi ] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 
movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm4 STORE4( 0, xmm0, xmm2 ) } return; } case 6: { // 6x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi+ 0] movlps xmm0, qword ptr [edi+ 0] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm4 movlps xmm1, qword ptr [edi+64] movhps xmm1, qword ptr [edi+80] STORE4( 0, xmm0, xmm4 ) mulps xmm1, xmm6 movlps xmm2, qword ptr [edi+72] movhps xmm2, qword ptr [edi+88] mulps xmm2, xmm7 addps xmm1, xmm2 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm3, xmm1 addps xmm1, xmm3 STORE2LO( 16, xmm1, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; mPtr += 4; } return; } } break; } case 5: { switch( numRows ) { case 5: { // 5x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 shufps xmm0, xmm5, R_SHUFFLE_PS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1 movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11 movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 shufps xmm1, xmm5, R_SHUFFLE_PS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3 movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13 movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 shufps xmm2, xmm5, R_SHUFFLE_PS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3 movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13 movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 shufps xmm3, xmm5, R_SHUFFLE_PS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15 shufps xmm4, xmm5, R_SHUFFLE_PS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 movss xmm7, [esi+0*4] shufps xmm7, xmm7, 0 mulps xmm0, xmm7 movss xmm5, [esi+1*4] shufps xmm5, xmm5, 0 mulps xmm1, xmm5 addps xmm0, xmm1 movss xmm6, [esi+2*4] shufps xmm6, xmm6, 0 mulps xmm2, xmm6 addps xmm0, xmm2 movss xmm1, [esi+3*4] shufps xmm1, xmm1, 0 mulps xmm3, xmm1 addps xmm0, xmm3 movss xmm2, [esi+4*4] shufps xmm2, xmm2, 0 mulps xmm4, xmm2 addps xmm0, xmm4 mulss xmm7, [edi+20*4] mulss xmm5, [edi+21*4] addps xmm7, xmm5 mulss xmm6, [edi+22*4] addps xmm7, xmm6 mulss xmm1, [edi+23*4] addps xmm7, xmm1 mulss xmm2, [edi+24*4] addps xmm7, xmm2 STORE4( 0, xmm0, xmm3 ) STORE1( 16, xmm7, xmm4 ) } return; } case 6: { // 6x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [esi] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movlps xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movlps xmm0, [edi] movhps xmm3, [edi+8] movaps xmm1, [edi+16] movlps xmm2, [edi+32] shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 1, 1, 2 ) 
// xmm0 = 0, 1, 5, 6 shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 shufps xmm3, xmm1, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 mulps xmm0, xmm6 mulps xmm3, xmm7 movlps xmm2, [edi+40] addps xmm0, xmm3 // xmm0 + xmm1 movhps xmm5, [edi+40+8] movlps xmm3, [edi+40+16] movhps xmm3, [edi+40+24] movlps xmm4, [edi+40+32] shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 shufps xmm3, xmm4, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 shufps xmm5, xmm3, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 mulps xmm2, xmm6 mulps xmm5, xmm7 addps xmm2, xmm5 // xmm2 + xmm3 movss xmm5, [esi+16] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm4, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm4, xmm2, R_SHUFFLE_PS( 1, 3, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLE_PS( 0, 3, 0, 3 ) addps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 STORE4( 0, xmm0, xmm2 ) movlps xmm4, [edi+80] movhps xmm3, [edi+80+8] movaps xmm1, [edi+80+16] movlps xmm2, [edi+80+32] shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 shufps xmm3, xmm1, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 mulps xmm4, xmm6 mulps xmm3, xmm7 mulps xmm1, xmm5 addps xmm4, xmm3 // xmm4 + xmm1 shufps xmm1, xmm4, R_SHUFFLE_PS( 0, 3, 0, 2 ) shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 3, 0, 0 ) addps xmm4, xmm1 shufps xmm1, xmm1, R_SHUFFLE_PS( 2, 3, 0, 1 ) addps xmm4, xmm1 STORE2LO( 16, xmm4, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; mPtr += 5; } return; } } break; } case 6: { switch( numRows ) { case 1: { // 1x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] movss xmm1, [esi+4] mulss xmm1, [edi+4] movss xmm2, [esi+8] addss xmm0, xmm1 mulss xmm2, [edi+8] movss xmm3, [esi+12] addss xmm0, xmm2 mulss xmm3, [edi+12] movss xmm4, [esi+16] addss xmm0, xmm3 mulss xmm4, [edi+16] movss xmm5, [esi+20] addss xmm0, xmm4 mulss xmm5, [edi+20] movss xmm6, [esi+24] addss xmm0, xmm5 mulss xmm6, [edi+24] addss xmm0, xmm6 STORE1( 0, xmm0, xmm7 ) } return; } case 2: { // 2x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) } return; } case 3: { // 3x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) // row 2 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 
shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm1 STORE1( 8, xmm0, xmm3 ) } return; } case 4: { // 4x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm4 ) } return; } case 5: { // 5x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm3 ) // row 5 movaps xmm0, [edi+96] movaps xmm1, [edi+96+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, 0x01 addss xmm0, xmm1 STORE1( 16, xmm0, xmm3 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, qword ptr [esi] movlps xmm6, qword ptr [esi+8] shufps xmm7, xmm7, 0x44 shufps xmm6, xmm6, 0x44 movlps xmm0, qword ptr [edi ] movhps xmm0, qword ptr [edi+ 24] mulps xmm0, xmm7 movlps xmm3, qword ptr [edi+ 8] movhps xmm3, qword ptr [edi+ 32] mulps xmm3, xmm6 movlps xmm1, qword ptr [edi+ 48] movhps xmm1, qword ptr [edi+ 72] mulps xmm1, xmm7 movlps xmm2, qword ptr [edi+ 96] movhps xmm2, qword ptr [edi+120] mulps xmm2, xmm7 movlps xmm4, qword ptr [edi+ 56] movhps xmm4, qword ptr [edi+ 80] movlps xmm5, qword ptr [edi+104] movhps xmm5, qword ptr [edi+128] mulps xmm4, xmm6 movlps xmm7, qword ptr [esi+16] addps xmm0, xmm3 shufps xmm7, xmm7, 0x44 mulps xmm5, xmm6 addps xmm1, xmm4 movlps xmm3, qword ptr [edi+ 16] movhps xmm3, qword ptr [edi+ 40] addps xmm2, xmm5 movlps xmm4, qword ptr [edi+ 64] movhps xmm4, qword ptr [edi+ 88] mulps xmm3, xmm7 movlps xmm5, qword ptr [edi+112] movhps xmm5, qword ptr [edi+136] addps xmm0, xmm3 mulps xmm4, xmm7 mulps xmm5, xmm7 addps xmm1, xmm4 addps xmm2, xmm5 movaps xmm6, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm6, xmm1, 0xDD movaps xmm7, xmm2 shufps xmm7, xmm2, 0x88 shufps xmm2, xmm2, 0xDD addps xmm0, xmm6 addps xmm2, xmm7 STORE4( 0, xmm0, xmm3 ) STORE2LO( 16, xmm2, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { 
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; mPtr += 6; } return; } } break; } default: { int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numColumns; j++ ) { sum += mPtr[j] * vPtr[j]; } dstPtr[i] STOREC sum; mPtr += numColumns; } break; } } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_MultiplySubVecX optimizes the following matrix multiplications: NxN * Nx1 Nx6 * 6x1 6xN * Nx1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ __asm subss reg2, reg1 \ __asm movss [eax+offset], reg2 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm subps reg2, reg1 \ __asm movlps [eax+offset], reg2 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps reg2, [eax+offset] \ __asm subps reg2, reg1 \ __asm movhps [eax+offset], reg2 #define STORE4( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm movhps reg2, [eax+offset+8] \ __asm subps reg2, reg1 \ __asm movlps [eax+offset], reg2 \ __asm movhps [eax+offset+8], reg2 #define STOREC -= int numRows; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumColumns() ); assert( dst.GetSize() >= mat.GetNumRows() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numRows = mat.GetNumRows(); switch( mat.GetNumColumns() ) { case 1: { switch( numRows ) { case 1: { // 1x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] STORE1( 0, xmm0, xmm1 ) } return; } case 6: { // 6x1 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0]; mPtr++; } return; } } break; } case 2: { switch( numRows ) { case 2: { // 2x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm1, [esi+4] movss xmm2, [edi] mulss xmm2, xmm0 movss xmm3, [edi+4] mulss xmm3, xmm1 addss xmm2, xmm3 STORE1( 0, xmm2, xmm4 ) mulss xmm0, [edi+8] mulss xmm1, [edi+8+4] addss xmm0, xmm1 STORE1( 4, xmm0, xmm4 ) } return; } case 6: { // 6x2 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, [esi] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movaps xmm0, [edi] mulps xmm0, xmm7 movaps xmm1, [edi+16] mulps xmm1, xmm7 movaps xmm2, xmm0 shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm2, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) movaps xmm3, [edi+32] addps xmm0, xmm2 mulps xmm3, xmm7 STORE4( 0, xmm0, xmm4 ) shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm1, xmm3 addps xmm3, xmm1 STORE2LO( 16, xmm3, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1]; mPtr += 2; } return; } } break; } case 3: { switch( numRows ) { case 3: { // 3x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] movss xmm4, [edi] mulss xmm4, xmm0 movss xmm1, [esi+4] movss xmm5, [edi+4] mulss xmm5, xmm1 addss xmm4, xmm5 movss xmm2, [esi+8] movss xmm6, [edi+8] mulss xmm6, xmm2 addss xmm4, xmm6 movss xmm3, [edi+12] mulss 
xmm3, xmm0 STORE1( 0, xmm4, xmm7 ); movss xmm5, [edi+12+4] mulss xmm5, xmm1 addss xmm3, xmm5 movss xmm6, [edi+12+8] mulss xmm6, xmm2 addss xmm3, xmm6 mulss xmm0, [edi+24] mulss xmm1, [edi+24+4] STORE1( 4, xmm3, xmm7 ); addss xmm0, xmm1 mulss xmm2, [edi+24+8] addss xmm0, xmm2 STORE1( 8, xmm0, xmm7 ); } return; } case 6: { // 6x3 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm5, [esi] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [esi+4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3 movlps xmm1, [edi+4*4] shufps xmm1, xmm0, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2 movlps xmm2, [edi+6*4] movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9 mulps xmm0, xmm5 movlps xmm3, [edi+10*4] shufps xmm2, xmm3, R_SHUFFLE_PS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11 movaps xmm3, xmm1 shufps xmm1, xmm2, R_SHUFFLE_PS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10 mulps xmm1, xmm6 shufps xmm3, xmm2, R_SHUFFLE_PS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11 mulps xmm3, xmm7 addps xmm0, xmm1 addps xmm0, xmm3 STORE4( 0, xmm0, xmm4 ) movss xmm1, [edi+12*4] mulss xmm1, xmm5 movss xmm2, [edi+13*4] mulss xmm2, xmm6 movss xmm3, [edi+14*4] mulss xmm3, xmm7 addss xmm1, xmm2 addss xmm1, xmm3 STORE1( 16, xmm1, xmm4 ) mulss xmm5, [edi+15*4] mulss xmm6, [edi+16*4] mulss xmm7, [edi+17*4] addss xmm5, xmm6 addss xmm5, xmm7 STORE1( 20, xmm5, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2]; mPtr += 3; } return; } } break; } case 4: { switch( numRows ) { case 4: { // 4x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi ] movlps xmm0, qword ptr [edi ] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm4 STORE4( 0, xmm0, xmm2 ) } return; } case 6: { // 6x4 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, qword ptr [esi+ 0] movlps xmm0, qword ptr [edi+ 0] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm0, qword ptr [edi+16] mulps xmm0, xmm6 movlps xmm7, qword ptr [esi+ 8] movlps xmm2, qword ptr [edi+ 8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movhps xmm2, qword ptr [edi+24] mulps xmm2, xmm7 movlps xmm1, qword ptr [edi+32] movhps xmm1, qword ptr [edi+48] mulps xmm1, xmm6 movlps xmm3, qword ptr [edi+40] addps xmm0, xmm2 movhps xmm3, qword ptr [edi+56] mulps xmm3, xmm7 movaps xmm4, xmm0 addps xmm1, xmm3 shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm4 movlps xmm1, qword ptr [edi+64] movhps xmm1, qword ptr [edi+80] STORE4( 0, xmm0, xmm4 ) mulps xmm1, xmm6 movlps xmm2, qword ptr [edi+72] movhps xmm2, qword ptr [edi+88] mulps xmm2, xmm7 addps xmm1, xmm2 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm3, xmm1 addps xmm1, xmm3 STORE2LO( 16, xmm1, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { 
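/*
	Generic 4-column fallback: STOREC expands to '-=' in this function
	(and to '=' or '+=' in the multiply / multiply-add variants), so the
	statement below amounts to the scalar update (m, v standing in for
	mPtr, vPtr):

		dst[i] -= m[i*4+0] * v[0] + m[i*4+1] * v[1] + m[i*4+2] * v[2] + m[i*4+3] * v[3];
*/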
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3]; mPtr += 4; } return; } } break; } case 5: { switch( numRows ) { case 5: { // 5x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1 movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11 movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1 shufps xmm0, xmm5, R_SHUFFLE_PS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15 movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1 movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11 movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1 shufps xmm1, xmm5, R_SHUFFLE_PS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16 movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3 movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13 movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3 shufps xmm2, xmm5, R_SHUFFLE_PS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17 movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3 movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13 movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9 shufps xmm3, xmm5, R_SHUFFLE_PS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18 movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15 shufps xmm4, xmm5, R_SHUFFLE_PS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19 movss xmm7, [esi+0*4] shufps xmm7, xmm7, 0 mulps xmm0, xmm7 movss xmm5, [esi+1*4] shufps xmm5, xmm5, 0 mulps xmm1, xmm5 addps xmm0, xmm1 movss xmm6, [esi+2*4] shufps xmm6, xmm6, 0 mulps xmm2, xmm6 addps xmm0, xmm2 movss xmm1, [esi+3*4] shufps xmm1, xmm1, 0 mulps xmm3, xmm1 addps xmm0, xmm3 movss xmm2, [esi+4*4] shufps xmm2, xmm2, 0 mulps xmm4, xmm2 addps xmm0, xmm4 mulss xmm7, [edi+20*4] mulss xmm5, [edi+21*4] addps xmm7, xmm5 mulss xmm6, [edi+22*4] addps xmm7, xmm6 mulss xmm1, [edi+23*4] addps xmm7, xmm1 mulss xmm2, [edi+24*4] addps xmm7, xmm2 STORE4( 0, xmm0, xmm3 ) STORE1( 16, xmm7, xmm4 ) } return; } case 6: { // 6x5 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [esi] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) movlps xmm7, [esi+8] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 1, 0, 1 ) movlps xmm0, [edi] movhps xmm3, [edi+8] movaps xmm1, [edi+16] movlps xmm2, [edi+32] shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6 shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9 shufps xmm3, xmm1, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8 mulps xmm0, xmm6 mulps xmm3, xmm7 movlps xmm2, [edi+40] addps xmm0, xmm3 // xmm0 + xmm1 movhps xmm5, [edi+40+8] movlps xmm3, [edi+40+16] movhps xmm3, [edi+40+24] movlps xmm4, [edi+40+32] shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16 shufps xmm3, xmm4, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19 shufps xmm5, xmm3, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18 mulps xmm2, xmm6 mulps xmm5, xmm7 addps xmm2, xmm5 // xmm2 + xmm3 movss xmm5, [esi+16] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm4, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm4, xmm2, R_SHUFFLE_PS( 1, 3, 1, 3 ) shufps xmm1, xmm3, R_SHUFFLE_PS( 0, 3, 0, 3 ) addps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 STORE4( 0, xmm0, xmm2 ) movlps xmm4, [edi+80] movhps xmm3, [edi+80+8] movaps xmm1, [edi+80+16] movlps xmm2, [edi+80+32] shufps xmm4, xmm1, R_SHUFFLE_PS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26 shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29 shufps xmm3, xmm1, R_SHUFFLE_PS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28 mulps xmm4, xmm6 mulps xmm3, xmm7 mulps xmm1, xmm5 addps xmm4, xmm3 // xmm4 + xmm1 shufps
xmm1, xmm4, R_SHUFFLE_PS( 0, 3, 0, 2 ) shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 3, 0, 0 ) addps xmm4, xmm1 shufps xmm1, xmm1, R_SHUFFLE_PS( 2, 3, 0, 1 ) addps xmm4, xmm1 STORE2LO( 16, xmm4, xmm2 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4]; mPtr += 5; } return; } } break; } case 6: { switch( numRows ) { case 1: { // 1x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] mulss xmm0, [edi] movss xmm1, [esi+4] mulss xmm1, [edi+4] movss xmm2, [esi+8] addss xmm0, xmm1 mulss xmm2, [edi+8] movss xmm3, [esi+12] addss xmm0, xmm2 mulss xmm3, [edi+12] movss xmm4, [esi+16] addss xmm0, xmm3 mulss xmm4, [edi+16] movss xmm5, [esi+20] addss xmm0, xmm4 mulss xmm5, [edi+20] movss xmm6, [esi+24] addss xmm0, xmm5 mulss xmm6, [edi+24] addss xmm0, xmm6 STORE1( 0, xmm0, xmm7 ) } return; } case 2: { // 2x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) } return; } case 3: { // 3x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 1, 3 ) movhlps xmm0, xmm1 addps xmm0, xmm1 STORE2LO( 0, xmm0, xmm3 ) // row 2 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm1 STORE1( 8, xmm0, xmm3 ) } return; } case 4: { // 4x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm4 ) } return; } case 5: { // 5x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr // load idVecX movlps xmm4, [esi] movhps xmm4, [esi+8] movlps xmm5, [esi+16] movlhps xmm5, xmm4 movhlps xmm6, xmm4 movlhps xmm6, xmm5 // row 0 and 1 movaps xmm0, [edi] movaps xmm1, [edi+16] movaps xmm2, [edi+32] 
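// The idVecX load above left three rotated copies of the 6-vector in
// registers: xmm4 = v0 v1 v2 v3, xmm5 = v4 v5 v0 v1, xmm6 = v2 v3 v4 v5.
// Multiplied against the 12 floats at [edi], [edi+16] and [edi+32] they
// produce every product needed for two full 6-element rows, which the
// shuffles and adds below reduce horizontally.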
mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm7, xmm0 movlhps xmm7, xmm2 addps xmm7, xmm1 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm7, xmm0 // row 2 and 3 movaps xmm0, [edi+48] movaps xmm1, [edi+48+16] movaps xmm2, [edi+48+32] mulps xmm0, xmm4 mulps xmm1, xmm5 mulps xmm2, xmm6 movhlps xmm3, xmm0 movlhps xmm3, xmm2 addps xmm1, xmm3 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 2, 3 ) addps xmm1, xmm0 // last 4 additions for the first 4 rows and store result movaps xmm0, xmm7 shufps xmm7, xmm1, R_SHUFFLE_PS( 0, 2, 0, 2 ) shufps xmm0, xmm1, R_SHUFFLE_PS( 1, 3, 1, 3 ) addps xmm0, xmm7 STORE4( 0, xmm0, xmm3 ) // row 4 movaps xmm0, [edi+96] movaps xmm1, [edi+96+16] mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, 0x01 addss xmm0, xmm1 STORE1( 16, xmm0, xmm3 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm7, qword ptr [esi] movlps xmm6, qword ptr [esi+8] shufps xmm7, xmm7, 0x44 shufps xmm6, xmm6, 0x44 movlps xmm0, qword ptr [edi ] movhps xmm0, qword ptr [edi+ 24] mulps xmm0, xmm7 movlps xmm3, qword ptr [edi+ 8] movhps xmm3, qword ptr [edi+ 32] mulps xmm3, xmm6 movlps xmm1, qword ptr [edi+ 48] movhps xmm1, qword ptr [edi+ 72] mulps xmm1, xmm7 movlps xmm2, qword ptr [edi+ 96] movhps xmm2, qword ptr [edi+120] mulps xmm2, xmm7 movlps xmm4, qword ptr [edi+ 56] movhps xmm4, qword ptr [edi+ 80] movlps xmm5, qword ptr [edi+104] movhps xmm5, qword ptr [edi+128] mulps xmm4, xmm6 movlps xmm7, qword ptr [esi+16] addps xmm0, xmm3 shufps xmm7, xmm7, 0x44 mulps xmm5, xmm6 addps xmm1, xmm4 movlps xmm3, qword ptr [edi+ 16] movhps xmm3, qword ptr [edi+ 40] addps xmm2, xmm5 movlps xmm4, qword ptr [edi+ 64] movhps xmm4, qword ptr [edi+ 88] mulps xmm3, xmm7 movlps xmm5, qword ptr [edi+112] movhps xmm5, qword ptr [edi+136] addps xmm0, xmm3 mulps xmm4, xmm7 mulps xmm5, xmm7 addps xmm1, xmm4 addps xmm2, xmm5 movaps xmm6, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm6, xmm1, 0xDD movaps xmm7, xmm2 shufps xmm7, xmm2, 0x88 shufps xmm2, xmm2, 0xDD addps xmm0, xmm6 addps xmm2, xmm7 STORE4( 0, xmm0, xmm3 ) STORE2LO( 16, xmm2, xmm4 ) } return; } default: { for ( int i = 0; i < numRows; i++ ) { dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5]; mPtr += 6; } return; } } break; } default: { int numColumns = mat.GetNumColumns(); for ( int i = 0; i < numRows; i++ ) { float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numColumns; j++ ) { sum += mPtr[j] * vPtr[j]; } dstPtr[i] STOREC sum; mPtr += numColumns; } break; } } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_TransposeMultiplyVecX optimizes the following matrix multiplications: Nx6 * Nx1 6xN * 6x1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss [eax+offset], reg1 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps [eax+offset], reg1 #define STORE4( offset, reg1, reg2 ) \ __asm movlps [eax+offset], reg1 \ __asm movhps [eax+offset+8], reg1 #define STOREC = int numColumns; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumRows() ); assert( dst.GetSize() >= mat.GetNumColumns() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr =
dst.ToFloatPtr(); numColumns = mat.GetNumColumns(); switch( mat.GetNumRows() ) { case 1: switch( numColumns ) { case 6: { // 1x6 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm3 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0]; mPtr++; } return; } } break; case 2: switch( numColumns ) { case 6: { // 2x6 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movaps xmm1, xmm0 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 ) movaps xmm2, [edi] mulps xmm2, xmm0 movlps xmm3, [edi+24] movhps xmm3, [edi+32] mulps xmm3, xmm1 addps xmm2, xmm3 shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm4, [edi+16] movhps xmm4, [edi+40] mulps xmm4, xmm0 movhlps xmm3, xmm4 addps xmm3, xmm4 STORE4( 0, xmm2, xmm5 ) STORE2LO( 16, xmm3, xmm6 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; mPtr++; } return; } } break; case 3: switch( numColumns ) { case 6: { // 3x6 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movss xmm1, [esi+2*4] movlps xmm3, [edi+(0*6+0)*4] movhps xmm3, [edi+(0*6+2)*4] movaps xmm4, xmm0 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(2*6+4)*4] mulps xmm5, xmm1 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; mPtr++; } return; } } break; case 4: switch( numColumns ) { case 6: { // 4x6 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3]; mPtr++; } return; } } break; case 5: switch( numColumns ) { 
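/*
	Each default branch in this function walks one matrix column with
	stride numColumns, i.e. the transpose product reduces to this scalar
	sketch (rows standing in for the hardcoded row count of the branch):

		for ( int i = 0; i < numColumns; i++ ) {
			float sum = 0.0f;
			for ( int j = 0; j < rows; j++ ) {
				sum += mPtr[j * numColumns + i] * vPtr[j];
			}
			dstPtr[i] = sum;	// STOREC is '=' here
		}
*/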
case 6: { // 5x6 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movss xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm4, xmm2 mulps xmm4, [edi+(4*6+0)*4] addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; mPtr++; } return; } } break; case 6: switch( numColumns ) { case 1: { // 6x1 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movhps xmm0, [esi+8] movlps xmm1, [esi+16] mulps xmm0, [edi] mulps xmm1, [edi+16] shufps xmm1, xmm0, R_SHUFFLE_PS( 0, 1, 3, 2 ) addps xmm0, xmm1 movhlps xmm2, xmm0 addss xmm2, xmm0 shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm2, xmm0 STORE1( 0, xmm2, xmm3 ) } return; } case 2: { // 6x2 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm6, [edi+0*4] mulps xmm6, xmm0 movlps xmm1, [esi+2*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm7, [edi+4*4] mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm2, [esi+4*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm7, [edi+8*4] mulps xmm7, xmm2 addps xmm6, xmm7 movhlps xmm3, xmm6 addps xmm3, xmm6 STORE2LO( 0, xmm3, xmm7 ) } return; } case 3: { // 6x3 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+(0*3+2)*4] movhps xmm0, [edi+(0*3+0)*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm6, [esi+0*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm1, [edi+(1*3+0)*4] movhps xmm1, [edi+(1*3+1)*4] movss xmm7, [esi+1*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm2, [edi+(2*3+2)*4] movhps xmm2, [edi+(2*3+0)*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm7, [esi+2*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movss xmm3, [edi+(3*3+0)*4] movhps xmm3, [edi+(3*3+1)*4] movss xmm7, [esi+3*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movss xmm4, [edi+(4*3+2)*4] movhps xmm4, [edi+(4*3+0)*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm7, [esi+4*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movss xmm5, [edi+(5*3+0)*4] movhps xmm5, [edi+(5*3+1)*4] movss xmm7, [esi+5*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE1( 0, xmm6, xmm7 ) STORE2HI( 4, xmm6, xmm7 ) } 
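// In the 6x3 case above the three results accumulate in lanes 0, 2 and 3
// of xmm6 (lane 1 is a don't-care left by the shuffles), which is why the
// result is written with the STORE1 + STORE2HI pair rather than a STORE4.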
return; } case 4: { // 6x4 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm3, [edi+(0*4+0)*4] movhps xmm3, [edi+(0*4+2)*4] movss xmm4, [esi+0*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*4+0)*4] movhps xmm5, [edi+(1*4+2)*4] movss xmm6, [esi+1*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*4+0)*4] movhps xmm4, [edi+(2*4+2)*4] movss xmm6, [esi+2*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*4+0)*4] movhps xmm5, [edi+(3*4+2)*4] movss xmm6, [esi+3*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(4*4+0)*4] movhps xmm4, [edi+(4*4+2)*4] movss xmm6, [esi+4*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(5*4+0)*4] movhps xmm5, [edi+(5*4+2)*4] movss xmm6, [esi+5*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) } return; } case 5: { // 6x5 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [edi+(0*5+0)*4] movhps xmm6, [edi+(0*5+2)*4] movss xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movlps xmm7, [edi+(1*5+0)*4] movhps xmm7, [edi+(1*5+2)*4] movss xmm1, [esi+1*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm7, [edi+(2*5+0)*4] movhps xmm7, [edi+(2*5+2)*4] movss xmm2, [esi+2*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movlps xmm7, [edi+(3*5+0)*4] movhps xmm7, [edi+(3*5+2)*4] movss xmm3, [esi+3*4] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movlps xmm7, [edi+(4*5+0)*4] movhps xmm7, [edi+(4*5+2)*4] movss xmm4, [esi+4*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movlps xmm7, [edi+(5*5+0)*4] movhps xmm7, [edi+(5*5+2)*4] movss xmm5, [esi+5*4] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE4( 0, xmm6, xmm7 ) movss xmm6, [edi+(0*5+4)*4] mulss xmm6, xmm0 movss xmm7, [edi+(1*5+4)*4] mulss xmm7, xmm1 addss xmm6, xmm7 movss xmm7, [edi+(2*5+4)*4] mulss xmm7, xmm2 addss xmm6, xmm7 movss xmm7, [edi+(3*5+4)*4] mulss xmm7, xmm3 addss xmm6, xmm7 movss xmm7, [edi+(4*5+4)*4] mulss xmm7, xmm4 addss xmm6, xmm7 movss xmm7, [edi+(5*5+4)*4] mulss xmm7, xmm5 addss xmm6, xmm7 STORE1( 16, xmm6, xmm7 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movlps xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(4*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) movlps xmm5, [edi+(5*6+0)*4] movhps xmm5, [edi+(5*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm2, xmm2, 
R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] movhps xmm5, [edi+(5*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; mPtr++; } return; } } break; default: int numRows = mat.GetNumRows(); for ( int i = 0; i < numColumns; i++ ) { mPtr = mat.ToFloatPtr() + i; float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numRows; j++ ) { mPtr += numColumns; sum += mPtr[0] * vPtr[j]; } dstPtr[i] STOREC sum; } break; } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_TransposeMultiplyAddVecX optimizes the following matrix multiplications: Nx6 * Nx1 6xN * 6x1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ __asm addss reg2, reg1 \ __asm movss [eax+offset], reg2 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps reg2, [eax+offset] \ __asm addps reg2, reg1 \ __asm movhps [eax+offset], reg2 #define STORE4( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm movhps reg2, [eax+offset+8] \ __asm addps reg2, reg1 \ __asm movlps [eax+offset], reg2 \ __asm movhps [eax+offset+8], reg2 #define STOREC += int numColumns; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumRows() ); assert( dst.GetSize() >= mat.GetNumColumns() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numColumns = mat.GetNumColumns(); switch( mat.GetNumRows() ) { case 1: switch( numColumns ) { case 6: { // 1x6 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm3 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0]; mPtr++; } return; } } break; case 2: switch( numColumns ) { case 6: { // 2x6 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movaps xmm1, xmm0 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 ) movaps xmm2, [edi] mulps xmm2, xmm0 movlps xmm3, [edi+24] movhps xmm3, [edi+32] mulps xmm3, xmm1 addps xmm2, xmm3 shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm4, [edi+16] movhps xmm4, [edi+40] mulps xmm4, xmm0 movhlps xmm3, xmm4 addps xmm3, xmm4 STORE4( 0, xmm2, xmm5 ) STORE2LO( 16, xmm3, xmm6 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; mPtr++; } return; } } break; case 3: switch( numColumns ) { case 6: { // 3x6 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movss xmm1, [esi+2*4] movlps xmm3, [edi+(0*6+0)*4] movhps xmm3, [edi+(0*6+2)*4] movaps xmm4, xmm0 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, 
xmm4 movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(2*6+4)*4] mulps xmm5, xmm1 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; mPtr++; } return; } } break; case 4: switch( numColumns ) { case 6: { // 4x6 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3]; mPtr++; } return; } } break; case 5: switch( numColumns ) { case 6: { // 5x6 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movss xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm4, xmm2 mulps xmm4, [edi+(4*6+0)*4] addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; mPtr++; } return; } } break; case 6: switch( numColumns ) { case 1: { // 
6x1 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movhps xmm0, [esi+8] movlps xmm1, [esi+16] mulps xmm0, [edi] mulps xmm1, [edi+16] shufps xmm1, xmm0, R_SHUFFLE_PS( 0, 1, 3, 2 ) addps xmm0, xmm1 movhlps xmm2, xmm0 addss xmm2, xmm0 shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm2, xmm0 STORE1( 0, xmm2, xmm3 ) } return; } case 2: { // 6x2 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm6, [edi+0*4] mulps xmm6, xmm0 movlps xmm1, [esi+2*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm7, [edi+4*4] mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm2, [esi+4*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm7, [edi+8*4] mulps xmm7, xmm2 addps xmm6, xmm7 movhlps xmm3, xmm6 addps xmm3, xmm6 STORE2LO( 0, xmm3, xmm7 ) } return; } case 3: { // 6x3 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+(0*3+2)*4] movhps xmm0, [edi+(0*3+0)*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm6, [esi+0*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm1, [edi+(1*3+0)*4] movhps xmm1, [edi+(1*3+1)*4] movss xmm7, [esi+1*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm2, [edi+(2*3+2)*4] movhps xmm2, [edi+(2*3+0)*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm7, [esi+2*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movss xmm3, [edi+(3*3+0)*4] movhps xmm3, [edi+(3*3+1)*4] movss xmm7, [esi+3*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movss xmm4, [edi+(4*3+2)*4] movhps xmm4, [edi+(4*3+0)*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm7, [esi+4*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movss xmm5, [edi+(5*3+0)*4] movhps xmm5, [edi+(5*3+1)*4] movss xmm7, [esi+5*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE1( 0, xmm6, xmm7 ) STORE2HI( 4, xmm6, xmm7 ) } return; } case 4: { // 6x4 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm3, [edi+(0*4+0)*4] movhps xmm3, [edi+(0*4+2)*4] movss xmm4, [esi+0*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*4+0)*4] movhps xmm5, [edi+(1*4+2)*4] movss xmm6, [esi+1*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*4+0)*4] movhps xmm4, [edi+(2*4+2)*4] movss xmm6, [esi+2*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*4+0)*4] movhps xmm5, [edi+(3*4+2)*4] movss xmm6, [esi+3*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(4*4+0)*4] movhps xmm4, [edi+(4*4+2)*4] movss xmm6, [esi+4*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(5*4+0)*4] movhps xmm5, [edi+(5*4+2)*4] movss xmm6, [esi+5*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) } return; } case 5: { // 6x5 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [edi+(0*5+0)*4] movhps xmm6, [edi+(0*5+2)*4] movss xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movlps xmm7, [edi+(1*5+0)*4] movhps xmm7, [edi+(1*5+2)*4] movss xmm1, [esi+1*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm7, [edi+(2*5+0)*4] movhps 
xmm7, [edi+(2*5+2)*4] movss xmm2, [esi+2*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movlps xmm7, [edi+(3*5+0)*4] movhps xmm7, [edi+(3*5+2)*4] movss xmm3, [esi+3*4] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movlps xmm7, [edi+(4*5+0)*4] movhps xmm7, [edi+(4*5+2)*4] movss xmm4, [esi+4*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movlps xmm7, [edi+(5*5+0)*4] movhps xmm7, [edi+(5*5+2)*4] movss xmm5, [esi+5*4] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE4( 0, xmm6, xmm7 ) movss xmm6, [edi+(0*5+4)*4] mulss xmm6, xmm0 movss xmm7, [edi+(1*5+4)*4] mulss xmm7, xmm1 addss xmm6, xmm7 movss xmm7, [edi+(2*5+4)*4] mulss xmm7, xmm2 addss xmm6, xmm7 movss xmm7, [edi+(3*5+4)*4] mulss xmm7, xmm3 addss xmm6, xmm7 movss xmm7, [edi+(4*5+4)*4] mulss xmm7, xmm4 addss xmm6, xmm7 movss xmm7, [edi+(5*5+4)*4] mulss xmm7, xmm5 addss xmm6, xmm7 STORE1( 16, xmm6, xmm7 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movlps xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(4*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) movlps xmm5, [edi+(5*6+0)*4] movhps xmm5, [edi+(5*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] movhps xmm5, [edi+(5*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; mPtr++; } return; } } break; default: int numRows = mat.GetNumRows(); for ( int i = 0; i < numColumns; i++ ) { mPtr = mat.ToFloatPtr() + i; float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numRows; j++ ) { mPtr += numColumns; sum += mPtr[0] * vPtr[j]; } dstPtr[i] STOREC sum; } break; } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_TransposeMultiplySubVecX optimizes the following matrix multiplications: Nx6 * Nx1 6xN * 6x1 with N in the range [1-6] ============ */ void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) { #define STORE1( offset, reg1, reg2 ) \ __asm movss reg2, [eax+offset] \ __asm subss reg2, reg1 \ __asm movss [eax+offset], reg2 #define STORE2LO( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm subps
reg2, reg1 \ __asm movlps [eax+offset], reg2 #define STORE2HI( offset, reg1, reg2 ) \ __asm movhps reg2, [eax+offset] \ __asm subps reg2, reg1 \ __asm movhps [eax+offset], reg2 #define STORE4( offset, reg1, reg2 ) \ __asm movlps reg2, [eax+offset] \ __asm movhps reg2, [eax+offset+8] \ __asm subps reg2, reg1 \ __asm movlps [eax+offset], reg2 \ __asm movhps [eax+offset+8], reg2 #define STOREC -= int numColumns; const float *mPtr, *vPtr; float *dstPtr; assert( vec.GetSize() >= mat.GetNumRows() ); assert( dst.GetSize() >= mat.GetNumColumns() ); mPtr = mat.ToFloatPtr(); vPtr = vec.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); numColumns = mat.GetNumColumns(); switch( mat.GetNumRows() ) { case 1: switch( numColumns ) { case 6: { // 1x6 * 1x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] STORE4( 0, xmm0, xmm2 ) STORE2LO( 16, xmm1, xmm3 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0]; mPtr++; } return; } } break; case 2: switch( numColumns ) { case 6: { // 2x6 * 2x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movaps xmm1, xmm0 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 ) movaps xmm2, [edi] mulps xmm2, xmm0 movlps xmm3, [edi+24] movhps xmm3, [edi+32] mulps xmm3, xmm1 addps xmm2, xmm3 shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm4, [edi+16] movhps xmm4, [edi+40] mulps xmm4, xmm0 movhlps xmm3, xmm4 addps xmm3, xmm4 STORE4( 0, xmm2, xmm5 ) STORE2LO( 16, xmm3, xmm6 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1]; mPtr++; } return; } } break; case 3: switch( numColumns ) { case 6: { // 3x6 * 3x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movss xmm1, [esi+2*4] movlps xmm3, [edi+(0*6+0)*4] movhps xmm3, [edi+(0*6+2)*4] movaps xmm4, xmm0 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(2*6+4)*4] mulps xmm5, xmm1 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2]; mPtr++; } return; } } break; case 4: switch( numColumns ) { case 6: { // 4x6 * 4x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*6+0)*4] movhps xmm4, [edi+(2*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, 
xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3]; mPtr++; } return; } } break; case 5: switch( numColumns ) { case 6: { // 5x6 * 5x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movss xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm4, xmm2 mulps xmm4, [edi+(4*6+0)*4] addps xmm3, xmm4 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movhlps xmm4, xmm3 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4]; mPtr++; } return; } } break; case 6: switch( numColumns ) { case 1: { // 6x1 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi] movhps xmm0, [esi+8] movlps xmm1, [esi+16] mulps xmm0, [edi] mulps xmm1, [edi+16] shufps xmm1, xmm0, R_SHUFFLE_PS( 0, 1, 3, 2 ) addps xmm0, xmm1 movhlps xmm2, xmm0 addss xmm2, xmm0 shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm2, xmm0 STORE1( 0, xmm2, xmm3 ) } return; } case 2: { // 6x2 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm6, [edi+0*4] mulps xmm6, xmm0 movlps xmm1, [esi+2*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm7, [edi+4*4] mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm2, [esi+4*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm7, [edi+8*4] mulps xmm7, xmm2 addps xmm6, xmm7 movhlps xmm3, xmm6 addps xmm3, xmm6 STORE2LO( 0, xmm3, xmm7 ) } return; } case 3: { // 6x3 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movss xmm0, [edi+(0*3+2)*4] movhps xmm0, [edi+(0*3+0)*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm6, [esi+0*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm1, [edi+(1*3+0)*4] movhps xmm1, [edi+(1*3+1)*4] movss xmm7, [esi+1*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm2, [edi+(2*3+2)*4] movhps xmm2, [edi+(2*3+0)*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm7, [esi+2*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps 
xmm7, xmm2 addps xmm6, xmm7 movss xmm3, [edi+(3*3+0)*4] movhps xmm3, [edi+(3*3+1)*4] movss xmm7, [esi+3*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movss xmm4, [edi+(4*3+2)*4] movhps xmm4, [edi+(4*3+0)*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 1, 3, 0 ) movss xmm7, [esi+4*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movss xmm5, [edi+(5*3+0)*4] movhps xmm5, [edi+(5*3+1)*4] movss xmm7, [esi+5*4] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE1( 0, xmm6, xmm7 ) STORE2HI( 4, xmm6, xmm7 ) } return; } case 4: { // 6x4 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm3, [edi+(0*4+0)*4] movhps xmm3, [edi+(0*4+2)*4] movss xmm4, [esi+0*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm4 movlps xmm5, [edi+(1*4+0)*4] movhps xmm5, [edi+(1*4+2)*4] movss xmm6, [esi+1*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(2*4+0)*4] movhps xmm4, [edi+(2*4+2)*4] movss xmm6, [esi+2*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(3*4+0)*4] movhps xmm5, [edi+(3*4+2)*4] movss xmm6, [esi+3*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 movlps xmm4, [edi+(4*4+0)*4] movhps xmm4, [edi+(4*4+2)*4] movss xmm6, [esi+4*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm6 addps xmm3, xmm4 movlps xmm5, [edi+(5*4+0)*4] movhps xmm5, [edi+(5*4+2)*4] movss xmm6, [esi+5*4] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) } return; } case 5: { // 6x5 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm6, [edi+(0*5+0)*4] movhps xmm6, [edi+(0*5+2)*4] movss xmm0, [esi+0*4] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movlps xmm7, [edi+(1*5+0)*4] movhps xmm7, [edi+(1*5+2)*4] movss xmm1, [esi+1*4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movlps xmm7, [edi+(2*5+0)*4] movhps xmm7, [edi+(2*5+2)*4] movss xmm2, [esi+2*4] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 movlps xmm7, [edi+(3*5+0)*4] movhps xmm7, [edi+(3*5+2)*4] movss xmm3, [esi+3*4] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm3 addps xmm6, xmm7 movlps xmm7, [edi+(4*5+0)*4] movhps xmm7, [edi+(4*5+2)*4] movss xmm4, [esi+4*4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movlps xmm7, [edi+(5*5+0)*4] movhps xmm7, [edi+(5*5+2)*4] movss xmm5, [esi+5*4] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 STORE4( 0, xmm6, xmm7 ) movss xmm6, [edi+(0*5+4)*4] mulss xmm6, xmm0 movss xmm7, [edi+(1*5+4)*4] mulss xmm7, xmm1 addss xmm6, xmm7 movss xmm7, [edi+(2*5+4)*4] mulss xmm7, xmm2 addss xmm6, xmm7 movss xmm7, [edi+(3*5+4)*4] mulss xmm7, xmm3 addss xmm6, xmm7 movss xmm7, [edi+(4*5+4)*4] mulss xmm7, xmm4 addss xmm6, xmm7 movss xmm7, [edi+(5*5+4)*4] mulss xmm7, xmm5 addss xmm6, xmm7 STORE1( 16, xmm6, xmm7 ) } return; } case 6: { // 6x6 * 6x1 __asm { mov esi, vPtr mov edi, mPtr mov eax, dstPtr movlps xmm0, [esi+0*4] movlps xmm1, [esi+2*4] movlps xmm2, [esi+4*4] movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, [edi+(0*6+0)*4] movlps xmm5, [edi+(1*6+0)*4] movhps xmm5, [edi+(1*6+2)*4] movaps xmm6, xmm0 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 
) mulps xmm6, [edi+(2*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) movlps xmm5, [edi+(3*6+0)*4] movhps xmm5, [edi+(3*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, [edi+(4*6+0)*4] addps xmm3, xmm6 movaps xmm6, xmm2 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) movlps xmm5, [edi+(5*6+0)*4] movhps xmm5, [edi+(5*6+2)*4] mulps xmm5, xmm6 addps xmm3, xmm5 STORE4( 0, xmm3, xmm7 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 1, 1 ) movlps xmm3, [edi+(0*6+4)*4] movhps xmm3, [edi+(1*6+4)*4] mulps xmm3, xmm0 movlps xmm4, [edi+(2*6+4)*4] movhps xmm4, [edi+(3*6+4)*4] mulps xmm4, xmm1 addps xmm3, xmm4 movlps xmm5, [edi+(4*6+4)*4] movhps xmm5, [edi+(5*6+4)*4] mulps xmm5, xmm2 addps xmm3, xmm5 movhlps xmm4, xmm3 addps xmm3, xmm4 STORE2LO( 16, xmm3, xmm7 ) } return; } default: { for ( int i = 0; i < numColumns; i++ ) { dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] + *(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5]; mPtr++; } return; } } break; default: int numRows = mat.GetNumRows(); for ( int i = 0; i < numColumns; i++ ) { mPtr = mat.ToFloatPtr() + i; float sum = mPtr[0] * vPtr[0]; for ( int j = 1; j < numRows; j++ ) { mPtr += numColumns; sum += mPtr[0] * vPtr[j]; } dstPtr[i] STOREC sum; } break; } #undef STOREC #undef STORE4 #undef STORE2HI #undef STORE2LO #undef STORE1 } /* ============ idSIMD_SSE::MatX_MultiplyMatX optimizes the following matrix multiplications: NxN * Nx6 6xN * Nx6 Nx6 * 6xN 6x6 * 6xN with N in the range [1-6]. The hot cache clock cycle counts are generally better for the SIMD version than the FPU version. At times up to 40% fewer clock cycles on a P3. In practice, however, the results are poor, probably due to memory access.
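
	All of the specialized kernels below compute the same result as this
	scalar reference (a sketch: k = m1 rows, l = m2 columns, n = the
	shared dimension, row-major storage):

		for ( int i = 0; i < k; i++ ) {
			for ( int j = 0; j < l; j++ ) {
				float s = 0.0f;
				for ( int r = 0; r < n; r++ ) {
					s += m1Ptr[i*n+r] * m2Ptr[r*l+j];
				}
				dstPtr[i*l+j] = s;
			}
		}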
============ */ void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) { int i, j, k, l, n; float *dstPtr; const float *m1Ptr, *m2Ptr; double sum; assert( m1.GetNumColumns() == m2.GetNumRows() ); dstPtr = dst.ToFloatPtr(); m1Ptr = m1.ToFloatPtr(); m2Ptr = m2.ToFloatPtr(); k = m1.GetNumRows(); l = m2.GetNumColumns(); n = m1.GetNumColumns(); switch( n ) { case 1: { if ( !(l^6) ) { switch( k ) { case 1: { // 1x1 * 1x6, no precision loss compared to FPU version __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movss xmm0, [edi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, [esi] mulps xmm1, xmm0 movaps [eax], xmm1 movlps xmm2, [esi+16] mulps xmm2, xmm0 movlps [eax+16], xmm2 } return; } case 6: { // 6x1 * 1x6, no precision loss compared to FPU version __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr xorps xmm1, xmm1 movaps xmm0, [edi] movlps xmm1, [edi+16] movlhps xmm1, xmm0 movhlps xmm2, xmm0 movlhps xmm2, xmm1 // row 0 and 1 movaps xmm3, [esi] movaps xmm4, xmm3 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm5, xmm3 shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 1, 1 ) movaps xmm6, xmm3 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 movaps [eax], xmm4 movaps [eax+16], xmm5 movaps [eax+32], xmm6 // row 2 and 3 movaps xmm4, xmm3 shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 2, 2, 2 ) movaps xmm5, xmm3 shufps xmm5, xmm5, R_SHUFFLE_PS( 2, 2, 3, 3 ) shufps xmm3, xmm3, R_SHUFFLE_PS( 3, 3, 3, 3 ) mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm3, xmm2 movaps [eax+48], xmm4 movaps [eax+64], xmm5 movaps [eax+80], xmm3 // row 4 and 5 movlps xmm3, [esi+16] movaps xmm4, xmm3 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm5, xmm3 shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 1, 1 ) shufps xmm3, xmm3, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm3, xmm2 movaps [eax+96], xmm4 movaps [eax+112], xmm5 movaps [eax+128], xmm3 } return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0]; m2Ptr++; } m1Ptr++; } break; } case 2: { if ( !(l^6) ) { switch( k ) { case 2: { // 2x2 * 2x6 #define MUL_Nx2_2x6_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movlps xmm1, [esi+16] \ __asm movhps xmm1, [esi+40] \ __asm movlps xmm2, [esi+24] \ __asm movhps xmm2, [esi+32] #define MUL_Nx2_2x6_ROW2( row ) \ __asm movaps xmm3, [edi+row*16] \ __asm movaps xmm5, xmm0 \ __asm movaps xmm4, xmm3 \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm4 \ __asm movaps xmm4, xmm3 \ __asm movaps xmm6, xmm2 \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 1, 1, 1 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm5, xmm6 \ __asm movaps [eax+row*48], xmm5 \ __asm movaps xmm4, xmm3 \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm movaps xmm7, xmm1 \ __asm mulps xmm7, xmm4 \ __asm movaps xmm4, xmm3 \ __asm movaps xmm5, xmm0 \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 2, 2, 2 ) \ __asm mulps xmm5, xmm4 \ __asm movaps xmm4, xmm3 \ __asm movaps xmm6, xmm2 \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 3, 3, 3, 3 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm5, xmm6 \ __asm shufps xmm3, xmm3, R_SHUFFLE_PS( 2, 2, 3, 3 ) \ __asm movaps xmm6, xmm1 \ __asm mulps xmm6, xmm3 \ __asm movaps xmm4, xmm7 \ __asm movlhps xmm7, xmm6 \ __asm movhlps xmm6, xmm4 \ __asm addps xmm6, xmm7 \ __asm movlps [eax+row*48+16], xmm6 \ __asm movlps [eax+row*48+24], xmm5 \ __asm movhps [eax+row*48+32], xmm5 \ 
__asm movhps [eax+row*48+40], xmm6 MUL_Nx2_2x6_INIT MUL_Nx2_2x6_ROW2( 0 ) return; } case 6: { // 6x2 * 2x6 MUL_Nx2_2x6_INIT MUL_Nx2_2x6_ROW2( 0 ) MUL_Nx2_2x6_ROW2( 1 ) MUL_Nx2_2x6_ROW2( 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l]; m2Ptr++; } m1Ptr += 2; } break; } case 3: { if ( !(l^6) ) { switch( k ) { case 3: { // 3x3 * 3x6 __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movaps xmm5, xmmword ptr [esi] movlps xmm6, qword ptr [esi+24] movhps xmm6, qword ptr [esi+32] movaps xmm7, xmmword ptr [esi+48] movss xmm0, dword ptr [edi] shufps xmm0, xmm0, 0 mulps xmm0, xmm5 movss xmm1, dword ptr [edi+4] shufps xmm1, xmm1, 0 mulps xmm1, xmm6 movss xmm2, dword ptr [edi+8] shufps xmm2, xmm2, 0 mulps xmm2, xmm7 addps xmm0, xmm1 addps xmm0, xmm2 movaps xmmword ptr [eax], xmm0 movss xmm3, dword ptr [edi+12] shufps xmm3, xmm3, 0 mulps xmm3, xmm5 movss xmm4, dword ptr [edi+16] shufps xmm4, xmm4, 0 mulps xmm4, xmm6 movss xmm0, dword ptr [edi+20] shufps xmm0, xmm0, 0 mulps xmm0, xmm7 addps xmm3, xmm4 addps xmm0, xmm3 movlps qword ptr [eax+24], xmm0 movhps qword ptr [eax+32], xmm0 movss xmm1, dword ptr [edi+24] shufps xmm1, xmm1, 0 mulps xmm1, xmm5 movss xmm2, dword ptr [edi+28] shufps xmm2, xmm2, 0 mulps xmm2, xmm6 movss xmm3, dword ptr [edi+32] shufps xmm3, xmm3, 0 mulps xmm3, xmm7 addps xmm1, xmm2 addps xmm1, xmm3 movaps xmmword ptr [eax+48], xmm1 movlps xmm5, qword ptr [esi+16] movlps xmm6, qword ptr [esi+40] movlps xmm7, qword ptr [esi+64] shufps xmm5, xmm5, 0x44 shufps xmm6, xmm6, 0x44 shufps xmm7, xmm7, 0x44 movaps xmm3, xmmword ptr [edi] movlps xmm4, qword ptr [edi+16] movaps xmm0, xmm3 shufps xmm0, xmm0, 0xF0 mulps xmm0, xmm5 movaps xmm1, xmm3 shufps xmm1, xmm4, 0x05 mulps xmm1, xmm6 shufps xmm3, xmm4, 0x5A mulps xmm3, xmm7 addps xmm1, xmm0 addps xmm1, xmm3 movlps qword ptr [eax+16], xmm1 movhps qword ptr [eax+40], xmm1 movss xmm0, dword ptr [edi+24] shufps xmm0, xmm0, 0 mulps xmm0, xmm5 movss xmm2, dword ptr [edi+28] shufps xmm2, xmm2, 0 mulps xmm2, xmm6 movss xmm4, dword ptr [edi+32] shufps xmm4, xmm4, 0 mulps xmm4, xmm7 addps xmm0, xmm2 addps xmm0, xmm4 movlps qword ptr [eax+64], xmm0 } return; } case 6: { // 6x3 * 3x6 #define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] #define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \ __asm movss xmm3, [edi+(row*3+0)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm3, xmm0 \ __asm movss xmm4, [edi+(row*3+1)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm4, xmm1 \ __asm addps xmm3, xmm4 \ __asm movss xmm5, [edi+(row*3+2)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm5 \ __asm movlps [eax+(row*6+0)*4], xmm3 \ __asm movhps [eax+(row*6+2)*4], xmm3 #define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm movlps xmm2, [esi+16*4] \ __asm shufps xmm0, xmm0, 0x44 \ __asm shufps xmm1, xmm1, 0x44 \ __asm shufps xmm2, xmm2, 0x44 \ __asm movlps xmm3, [edi+0*4] \ __asm movhps xmm3, [edi+2*4] \ __asm movaps xmm4, xmm3 \ __asm movaps xmm5, xmm3 \ __asm shufps xmm3, xmm3, 0xF0 \ __asm mulps xmm3, xmm0 \ __asm movlps xmm6, [edi+4*4] \ __asm movhps xmm6, [edi+6*4] \ 
__asm shufps xmm4, xmm6, 0x05 \ __asm mulps xmm4, xmm1 \ __asm addps xmm3, xmm4 \ __asm shufps xmm5, xmm6, 0x5A \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm5 \ __asm movlps [eax+4*4], xmm3 \ __asm movhps [eax+10*4], xmm3 \ __asm movaps xmm5, xmm6 \ __asm movlps xmm3, [edi+8*4] \ __asm movhps xmm3, [edi+10*4] \ __asm movaps xmm4, xmm3 \ __asm shufps xmm5, xmm3, 0x5A \ __asm mulps xmm5, xmm0 \ __asm shufps xmm6, xmm3, 0xAF \ __asm mulps xmm6, xmm1 \ __asm addps xmm5, xmm6 \ __asm shufps xmm4, xmm4, 0xF0 \ __asm mulps xmm4, xmm2 \ __asm addps xmm4, xmm5 \ __asm movlps [eax+16*4], xmm4 \ __asm movhps [eax+22*4], xmm4 \ __asm movlps xmm6, [edi+12*4] \ __asm movhps xmm6, [edi+14*4] \ __asm movaps xmm5, xmm6 \ __asm movaps xmm4, xmm6 \ __asm shufps xmm6, xmm6, 0xF0 \ __asm mulps xmm6, xmm0 \ __asm movlps xmm3, [edi+16*4] \ __asm shufps xmm5, xmm3, 0x05 \ __asm mulps xmm5, xmm1 \ __asm addps xmm5, xmm6 \ __asm shufps xmm4, xmm3, 0x5A \ __asm mulps xmm4, xmm2 \ __asm addps xmm4, xmm5 \ __asm movlps [eax+28*4], xmm4 \ __asm movhps [eax+34*4], xmm4 MUL_Nx3_3x6_FIRST4COLUMNS_INIT MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 ) MUL_Nx3_3x6_LAST2COLUMNS_ROW6 return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l]; m2Ptr++; } m1Ptr += 3; } break; } case 4: { if ( !(l^6) ) { switch( k ) { case 4: { // 4x4 * 4x6 #define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] \ __asm movlps xmm3, [esi+18*4] \ __asm movhps xmm3, [esi+20*4] #define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \ __asm movss xmm4, [edi+row*16+0*4] \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm4, xmm0 \ __asm movss xmm5, [edi+row*16+1*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm4, xmm5 \ __asm movss xmm6, [edi+row*16+2*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm4, xmm6 \ __asm movss xmm7, [edi+row*16+3*4] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm3 \ __asm addps xmm4, xmm7 \ __asm movlps [eax+row*24+0], xmm4 \ __asm movhps [eax+row*24+8], xmm4 #define MUL_Nx4_4x6_LAST2COLUMNS_INIT \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm movlps xmm2, [esi+16*4] \ __asm movlps xmm3, [esi+22*4] \ __asm shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm1, xmm0, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm3, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 ) #define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \ __asm movlps xmm7, [edi+row*32+ 0*4] \ __asm movhps xmm7, [edi+row*32+ 4*4] \ __asm movaps xmm6, xmm7 \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 3, 3 ) \ __asm mulps xmm6, xmm0 \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 1, 2, 2 ) \ __asm mulps xmm7, xmm1 \ __asm addps xmm6, xmm7 \ __asm movlps xmm4, [edi+row*32+ 2*4] \ __asm movhps xmm4, [edi+row*32+ 6*4] \ __asm movaps xmm5, xmm4 \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 3, 3 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm 
shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 1, 2, 2 ) \ __asm mulps xmm4, xmm3 \ __asm addps xmm6, xmm4 \ __asm movlps [eax+row*48+ 4*4], xmm6 \ __asm movhps [eax+row*48+10*4], xmm6 MUL_Nx4_4x6_FIRST4COLUMNS_INIT MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx4_4x6_LAST2COLUMNS_INIT MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) return; } case 6: { // 6x4 * 4x6 MUL_Nx4_4x6_FIRST4COLUMNS_INIT MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 ) MUL_Nx4_4x6_LAST2COLUMNS_INIT MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 ) MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l]; m2Ptr++; } m1Ptr += 4; } break; } case 5: { if ( !(l^6) ) { switch( k ) { case 5: { // 5x5 * 5x6 #define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] \ __asm movlps xmm3, [esi+18*4] \ __asm movhps xmm3, [esi+20*4] \ __asm movlps xmm4, [esi+24*4] \ __asm movhps xmm4, [esi+26*4] #define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \ __asm movss xmm6, [edi+row*20+0*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm0 \ __asm movss xmm5, [edi+row*20+1*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+row*20+2*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+row*20+3*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm3 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+row*20+4*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm movlps [eax+row*24+0], xmm6 \ __asm movhps [eax+row*24+8], xmm6 #define MUL_Nx5_5x6_LAST2COLUMNS_INIT \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm movlps xmm2, [esi+16*4] \ __asm movlps xmm3, [esi+22*4] \ __asm movlps xmm4, [esi+28*4] \ __asm shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm3, xmm4, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm4, xmm0, R_SHUFFLE_PS( 0, 1, 0, 1 ) #define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \ __asm movlps xmm7, [edi+row*40+ 0*4] \ __asm movhps xmm7, [edi+row*40+ 6*4] \ __asm movaps xmm6, xmm7 \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 2, 2 ) \ __asm mulps xmm6, xmm0 \ __asm movaps xmm5, xmm7 \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 1, 3, 3 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm6, xmm5 \ __asm movlps xmm7, [edi+row*40+ 2*4] \ __asm movhps xmm7, [edi+row*40+ 8*4] \ __asm movaps xmm5, xmm7 \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 2, 2 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm movaps xmm5, xmm7 \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 1, 3, 3 ) \ __asm mulps xmm5, xmm3 \ __asm addps xmm6, xmm5 \ __asm movlps 
xmm5, [edi+row*40+ 4*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm movlps [eax+row*48+ 4*4], xmm6 \ __asm movhps [eax+row*48+10*4], xmm6 #define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \ __asm movlps xmm6, [edi+20*4+0*4] \ __asm unpcklps xmm6, xmm6 \ __asm mulps xmm6, xmm0 \ __asm movlps xmm5, [edi+20*4+2*4] \ __asm unpcklps xmm5, xmm5 \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm movss xmm5, [edi+20*4+4*4] \ __asm unpcklps xmm5, xmm5 \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm movhlps xmm7, xmm6 \ __asm addps xmm6, xmm7 \ __asm movlps [eax+row*24+4*4], xmm6 MUL_Nx5_5x6_FIRST4COLUMNS_INIT MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx5_5x6_LAST2COLUMNS_INIT MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 ) return; } case 6: { // 6x5 * 5x6 MUL_Nx5_5x6_FIRST4COLUMNS_INIT MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 ) MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 ) MUL_Nx5_5x6_LAST2COLUMNS_INIT MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 ) MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l]; m2Ptr++; } m1Ptr += 5; } break; } case 6: { switch( k ) { case 1: { if ( !(l^1) ) { // 1x6 * 6x1 dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] + m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5]; return; } break; } case 2: { if ( !(l^2) ) { // 2x6 * 6x2 #define MUL_Nx6_6x2_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movaps xmm1, [esi+16] \ __asm movaps xmm2, [esi+32] #define MUL_Nx6_6x2_ROW2( row ) \ __asm movaps xmm7, [edi+row*48+0*4] \ __asm movaps xmm6, xmm7 \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm7, xmm0 \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 2, 2, 3, 3 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movaps xmm6, [edi+row*48+4*4] \ __asm movaps xmm5, xmm6 \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 2, 2, 3, 3 ) \ __asm mulps xmm5, xmm0 \ __asm movaps xmm6, [edi+row*48+24+2*4] \ __asm movaps xmm4, xmm6 \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm5, xmm6 \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 2, 3, 3 ) \ __asm mulps xmm4, xmm2 \ __asm addps xmm5, xmm4 \ __asm movaps xmm4, xmm5 \ __asm movhlps xmm5, xmm7 \ __asm movlhps xmm7, xmm4 \ __asm addps xmm7, xmm5 \ __asm movaps [eax+row*16], xmm7 MUL_Nx6_6x2_INIT MUL_Nx6_6x2_ROW2( 0 ) return; } break; } case 3: { if ( !(l^3) ) { // 3x6 * 6x3 #define MUL_Nx6_6x3_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movss xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 1*4] \ __asm movss xmm1, [esi+ 3*4] \ __asm movhps xmm1, [esi+ 4*4] \ __asm movss xmm2, [esi+ 6*4] \ __asm movhps xmm2, [esi+ 7*4] \ __asm movss xmm3, [esi+ 9*4] \ __asm movhps xmm3, [esi+10*4] \ __asm movss xmm4, [esi+12*4] \ __asm movhps xmm4, [esi+13*4] 
\ __asm movss xmm5, [esi+15*4] \ __asm movhps xmm5, [esi+16*4] #define MUL_Nx6_6x3_ROW( row ) \ __asm movss xmm7, [edi+row*24+0] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+row*24+4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+8] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+12] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+16] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+20] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movss [eax+row*12+0], xmm7 \ __asm movhps [eax+row*12+4], xmm7 MUL_Nx6_6x3_INIT MUL_Nx6_6x3_ROW( 0 ) MUL_Nx6_6x3_ROW( 1 ) MUL_Nx6_6x3_ROW( 2 ) return; } break; } case 4: { if ( !(l^4) ) { // 4x6 * 6x4 #define MUL_Nx6_6x4_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movaps xmm1, [esi+16] \ __asm movaps xmm2, [esi+32] \ __asm movaps xmm3, [esi+48] \ __asm movaps xmm4, [esi+64] \ __asm movaps xmm5, [esi+80] #define MUL_Nx6_6x4_ROW( row ) \ __asm movss xmm7, [edi+row*24+0] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+row*24+4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+8] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+12] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+16] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+row*24+20] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movaps [eax+row*16], xmm7 MUL_Nx6_6x4_INIT MUL_Nx6_6x4_ROW( 0 ) MUL_Nx6_6x4_ROW( 1 ) MUL_Nx6_6x4_ROW( 2 ) MUL_Nx6_6x4_ROW( 3 ) return; } break; } case 5: { if ( !(l^5) ) { // 5x6 * 6x5 #define MUL_Nx6_6x5_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movlps xmm1, [esi+20] \ __asm movhps xmm1, [esi+28] \ __asm movlps xmm2, [esi+40] \ __asm movhps xmm2, [esi+48] \ __asm movlps xmm3, [esi+60] \ __asm movhps xmm3, [esi+68] \ __asm movaps xmm4, [esi+80] \ __asm movlps xmm5, [esi+100] \ __asm movhps xmm5, [esi+108] #define MUL_Nx6_6x5_ROW( row ) \ __asm movss xmm7, [edi+row*24+0] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm fld dword ptr [edi+(row*6+0)*4] \ __asm fmul dword ptr [esi+(4+0*5)*4] \ __asm movss xmm6, [edi+row*24+4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+1)*4] \ __asm fmul dword ptr [esi+(4+1*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, [edi+row*24+8] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+2)*4] \ __asm fmul dword ptr [esi+(4+2*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, 
[edi+row*24+12] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+3)*4] \ __asm fmul dword ptr [esi+(4+3*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, [edi+row*24+16] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+4)*4] \ __asm fmul dword ptr [esi+(4+4*5)*4] \ __asm faddp st(1),st \ __asm movss xmm6, [edi+row*24+20] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm fld dword ptr [edi+(row*6+5)*4] \ __asm fmul dword ptr [esi+(4+5*5)*4] \ __asm faddp st(1),st \ __asm fstp dword ptr [eax+(row*5+4)*4] \ __asm movlps [eax+row*20], xmm7 \ __asm movhps [eax+row*20+8], xmm7 MUL_Nx6_6x5_INIT MUL_Nx6_6x5_ROW( 0 ) MUL_Nx6_6x5_ROW( 1 ) MUL_Nx6_6x5_ROW( 2 ) MUL_Nx6_6x5_ROW( 3 ) MUL_Nx6_6x5_ROW( 4 ) return; } break; } case 6: { switch( l ) { case 1: { // 6x6 * 6x1 __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movlps xmm7, qword ptr [esi] movlps xmm6, qword ptr [esi+8] shufps xmm7, xmm7, 0x44 shufps xmm6, xmm6, 0x44 movlps xmm0, qword ptr [edi ] movhps xmm0, qword ptr [edi+ 24] mulps xmm0, xmm7 movlps xmm3, qword ptr [edi+ 8] movhps xmm3, qword ptr [edi+ 32] mulps xmm3, xmm6 movlps xmm1, qword ptr [edi+ 48] movhps xmm1, qword ptr [edi+ 72] mulps xmm1, xmm7 movlps xmm2, qword ptr [edi+ 96] movhps xmm2, qword ptr [edi+120] mulps xmm2, xmm7 movlps xmm4, qword ptr [edi+ 56] movhps xmm4, qword ptr [edi+ 80] movlps xmm5, qword ptr [edi+104] movhps xmm5, qword ptr [edi+128] mulps xmm4, xmm6 movlps xmm7, qword ptr [esi+16] addps xmm0, xmm3 shufps xmm7, xmm7, 0x44 mulps xmm5, xmm6 addps xmm1, xmm4 movlps xmm3, qword ptr [edi+ 16] movhps xmm3, qword ptr [edi+ 40] addps xmm2, xmm5 movlps xmm4, qword ptr [edi+ 64] movhps xmm4, qword ptr [edi+ 88] mulps xmm3, xmm7 movlps xmm5, qword ptr [edi+112] movhps xmm5, qword ptr [edi+136] addps xmm0, xmm3 mulps xmm4, xmm7 mulps xmm5, xmm7 addps xmm1, xmm4 addps xmm2, xmm5 movaps xmm6, xmm0 shufps xmm0, xmm1, 0x88 shufps xmm6, xmm1, 0xDD movaps xmm7, xmm2 shufps xmm7, xmm2, 0x88 shufps xmm2, xmm2, 0xDD addps xmm0, xmm6 addps xmm2, xmm7 movlps [eax], xmm0 movhps [eax+8], xmm0 movlps [eax+16], xmm2 } return; } case 2: { // 6x6 * 6x2 MUL_Nx6_6x2_INIT MUL_Nx6_6x2_ROW2( 0 ) MUL_Nx6_6x2_ROW2( 1 ) MUL_Nx6_6x2_ROW2( 2 ) return; } case 3: { // 6x6 * 6x3 MUL_Nx6_6x3_INIT MUL_Nx6_6x3_ROW( 0 ) MUL_Nx6_6x3_ROW( 1 ) MUL_Nx6_6x3_ROW( 2 ) MUL_Nx6_6x3_ROW( 3 ) MUL_Nx6_6x3_ROW( 4 ) MUL_Nx6_6x3_ROW( 5 ) return; } case 4: { // 6x6 * 6x4 MUL_Nx6_6x4_INIT MUL_Nx6_6x4_ROW( 0 ) MUL_Nx6_6x4_ROW( 1 ) MUL_Nx6_6x4_ROW( 2 ) MUL_Nx6_6x4_ROW( 3 ) MUL_Nx6_6x4_ROW( 4 ) MUL_Nx6_6x4_ROW( 5 ) return; } case 5: { // 6x6 * 6x5 MUL_Nx6_6x5_INIT MUL_Nx6_6x5_ROW( 0 ) MUL_Nx6_6x5_ROW( 1 ) MUL_Nx6_6x5_ROW( 2 ) MUL_Nx6_6x5_ROW( 3 ) MUL_Nx6_6x5_ROW( 4 ) MUL_Nx6_6x5_ROW( 5 ) return; } case 6: { // 6x6 * 6x6 __asm { mov ecx, dword ptr m2Ptr movlps xmm3, qword ptr [ecx+72] mov edx, dword ptr m1Ptr // Loading first 4 columns (upper 4 rows) of m2Ptr. movaps xmm0, xmmword ptr [ecx] movlps xmm1, qword ptr [ecx+24] movhps xmm1, qword ptr [ecx+32] movaps xmm2, xmmword ptr [ecx+48] movhps xmm3, qword ptr [ecx+80] // Calculating first 4 elements in the first row of the destination matrix. 
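// each m1 element is loaded with movss, broadcast to all four lanes with shufps
// and multiplied against a four-column slice of an m2 row; the addps chain then
// accumulates four dot products of the destination row at once.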
movss xmm4, dword ptr [edx] movss xmm5, dword ptr [edx+4] mov eax, dword ptr dstPtr shufps xmm4, xmm4, 0 movss xmm6, dword ptr [edx+8] shufps xmm5, xmm5, 0 movss xmm7, dword ptr [edx+12] mulps xmm4, xmm0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm5, xmm1 mulps xmm6, xmm2 addps xmm5, xmm4 mulps xmm7, xmm3 addps xmm6, xmm5 addps xmm7, xmm6 movaps xmmword ptr [eax], xmm7 // Calculating first 4 elements in the second row of the destination matrix. movss xmm4, dword ptr [edx+24] shufps xmm4, xmm4, 0 mulps xmm4, xmm0 movss xmm5, dword ptr [edx+28] shufps xmm5, xmm5, 0 mulps xmm5, xmm1 movss xmm6, dword ptr [edx+32] shufps xmm6, xmm6, 0 movss xmm7, dword ptr [edx+36] shufps xmm7, xmm7, 0 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm7, xmm6 addps xmm5, xmm4 addps xmm7, xmm5 // Calculating first 4 elements in the third row of the destination matrix. movss xmm4, dword ptr [edx+48] movss xmm5, dword ptr [edx+52] movlps qword ptr [eax+24], xmm7 ; save 2nd movhps qword ptr [eax+32], xmm7 ; row movss xmm6, dword ptr [edx+56] movss xmm7, dword ptr [edx+60] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm5, xmm4 addps xmm7, xmm6 addps xmm7, xmm5 movaps xmmword ptr [eax+48], xmm7 // Calculating first 4 elements in the fourth row of the destination matrix. movss xmm4, dword ptr [edx+72] movss xmm5, dword ptr [edx+76] movss xmm6, dword ptr [edx+80] movss xmm7, dword ptr [edx+84] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm4, xmm5 addps xmm6, xmm4 addps xmm7, xmm6 movlps qword ptr [eax+72], xmm7 movhps qword ptr [eax+80], xmm7 // Calculating first 4 elements in the fifth row of the destination matrix. movss xmm4, dword ptr [edx+96] movss xmm5, dword ptr [edx+100] movss xmm6, dword ptr [edx+104] movss xmm7, dword ptr [edx+108] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm5, xmm4 addps xmm7, xmm6 addps xmm7, xmm5 movaps xmmword ptr [eax+96], xmm7 // Calculating first 4 elements in the sixth row of the destination matrix. movss xmm4, dword ptr [edx+120] movss xmm5, dword ptr [edx+124] movss xmm6, dword ptr [edx+128] movss xmm7, dword ptr [edx+132] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm6, xmm6, 0 shufps xmm7, xmm7, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 mulps xmm7, xmm3 addps xmm4, xmm5 addps xmm6, xmm4 addps xmm7, xmm6 movhps qword ptr [eax+128], xmm7 movlps qword ptr [eax+120], xmm7 // Loading first 4 columns (lower 2 rows) of m2Ptr. movlps xmm0, qword ptr [ecx+96] movhps xmm0, qword ptr [ecx+104] movlps xmm1, qword ptr [ecx+120] movhps xmm1, qword ptr [ecx+128] // Calculating first 4 elements in the first row of the destination matrix. movss xmm2, dword ptr [edx+16] shufps xmm2, xmm2, 0 movss xmm4, dword ptr [edx+40] movss xmm3, dword ptr [edx+20] movss xmm5, dword ptr [edx+44] movaps xmm6, xmmword ptr [eax] movlps xmm7, qword ptr [eax+24] shufps xmm3, xmm3, 0 shufps xmm5, xmm5, 0 movhps xmm7, qword ptr [eax+32] shufps xmm4, xmm4, 0 mulps xmm5, xmm1 mulps xmm2, xmm0 mulps xmm3, xmm1 mulps xmm4, xmm0 addps xmm6, xmm2 addps xmm7, xmm4 addps xmm7, xmm5 addps xmm6, xmm3 movlps qword ptr [eax+24], xmm7 movaps xmmword ptr [eax], xmm6 movhps qword ptr [eax+32], xmm7 // Calculating first 4 elements in the third row of the destination matrix. 
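// each partial sum stored so far is missing the last two of its six dot-product
// terms; the rows are reloaded from dst and the m1 column 4 and 5 contributions
// against the lower two rows of m2 are added in.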
movss xmm2, dword ptr [edx+64] movss xmm4, dword ptr [edx+88] movss xmm5, dword ptr [edx+92] movss xmm3, dword ptr [edx+68] movaps xmm6, xmmword ptr [eax+48] movlps xmm7, qword ptr [eax+72] movhps xmm7, qword ptr [eax+80] shufps xmm2, xmm2, 0 shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 shufps xmm3, xmm3, 0 mulps xmm2, xmm0 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm3, xmm1 addps xmm6, xmm2 addps xmm6, xmm3 addps xmm7, xmm4 addps xmm7, xmm5 movlps qword ptr [eax+72], xmm7 movaps xmmword ptr [eax+48], xmm6 movhps qword ptr [eax+80], xmm7 // Calculating first 4 elements in the fifth row of the destination matrix. movss xmm2, dword ptr [edx+112] movss xmm3, dword ptr [edx+116] movaps xmm6, xmmword ptr [eax+96] shufps xmm2, xmm2, 0 shufps xmm3, xmm3, 0 mulps xmm2, xmm0 mulps xmm3, xmm1 addps xmm6, xmm2 addps xmm6, xmm3 movaps xmmword ptr [eax+96], xmm6 // Calculating first 4 elements in the sixth row of the destination matrix. movss xmm4, dword ptr [edx+136] movss xmm5, dword ptr [edx+140] movhps xmm7, qword ptr [eax+128] movlps xmm7, qword ptr [eax+120] shufps xmm4, xmm4, 0 shufps xmm5, xmm5, 0 mulps xmm4, xmm0 mulps xmm5, xmm1 addps xmm7, xmm4 addps xmm7, xmm5 // Calculating last 2 columns of the destination matrix. movlps xmm0, qword ptr [ecx+16] movhps xmm0, qword ptr [ecx+40] movhps qword ptr [eax+128], xmm7 movlps qword ptr [eax+120], xmm7 movlps xmm2, qword ptr [ecx+64] movhps xmm2, qword ptr [ecx+88] movaps xmm3, xmm2 shufps xmm3, xmm3, 4Eh movlps xmm4, qword ptr [ecx+112] movhps xmm4, qword ptr [ecx+136] movaps xmm5, xmm4 shufps xmm5, xmm5, 4Eh movlps xmm6, qword ptr [edx] movhps xmm6, qword ptr [edx+24] movaps xmm7, xmm6 shufps xmm7, xmm7, 0F0h mulps xmm7, xmm0 shufps xmm6, xmm6, 0A5h movaps xmm1, xmm0 shufps xmm1, xmm1, 4Eh mulps xmm1, xmm6 addps xmm7, xmm1 movlps xmm6, qword ptr [edx+8] movhps xmm6, qword ptr [edx+32] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm2 mulps xmm6, xmm3 addps xmm7, xmm1 addps xmm7, xmm6 movhps xmm6, qword ptr [edx+40] movlps xmm6, qword ptr [edx+16] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm4 mulps xmm6, xmm5 addps xmm7, xmm1 addps xmm7, xmm6 movlps qword ptr [eax+16], xmm7 movhps qword ptr [eax+40], xmm7 movlps xmm6, qword ptr [edx+48] movhps xmm6, qword ptr [edx+72] movaps xmm7, xmm6 shufps xmm7, xmm7, 0F0h mulps xmm7, xmm0 shufps xmm6, xmm6, 0A5h movaps xmm1, xmm0 shufps xmm1, xmm1, 4Eh mulps xmm1, xmm6 addps xmm7, xmm1 movhps xmm6, qword ptr [edx+80] movlps xmm6, qword ptr [edx+56] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm2 mulps xmm6, xmm3 addps xmm7, xmm1 addps xmm7, xmm6 movlps xmm6, qword ptr [edx+64] movhps xmm6, qword ptr [edx+88] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm4 mulps xmm6, xmm5 addps xmm7, xmm1 addps xmm7, xmm6 movlps qword ptr [eax+64], xmm7 movhps qword ptr [eax+88], xmm7 movlps xmm6, qword ptr [edx+96] movhps xmm6, qword ptr [edx+120] movaps xmm7, xmm6 shufps xmm7, xmm7, 0F0h mulps xmm7, xmm0 shufps xmm6, xmm6, 0A5h movaps xmm1, xmm0 shufps xmm1, xmm1, 4Eh mulps xmm1, xmm6 addps xmm7, xmm1 movlps xmm6, qword ptr [edx+104] movhps xmm6, qword ptr [edx+128] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm2 mulps xmm6, xmm3 addps xmm7, xmm1 addps xmm7, xmm6 movlps xmm6, qword ptr [edx+112] movhps xmm6, qword ptr [edx+136] movaps xmm1, xmm6 shufps xmm1, xmm1, 0F0h shufps xmm6, xmm6, 0A5h mulps xmm1, xmm4 mulps xmm6, xmm5 addps xmm7, xmm1 addps xmm7, 
xmm6 movlps qword ptr [eax+112], xmm7 movhps qword ptr [eax+136], xmm7 } return; } } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l]; m2Ptr++; } m1Ptr += 6; } break; } default: { for ( i = 0; i < k; i++ ) { for ( j = 0; j < l; j++ ) { m2Ptr = m2.ToFloatPtr() + j; sum = m1Ptr[0] * m2Ptr[0]; for ( n = 1; n < m1.GetNumColumns(); n++ ) { m2Ptr += l; sum += m1Ptr[n] * m2Ptr[0]; } *dstPtr++ = sum; } m1Ptr += m1.GetNumColumns(); } break; } } } /* ============ idSIMD_SSE::MatX_TransposeMultiplyMatX optimizes the following transpose matrix multiplications: Nx6 * NxN 6xN * 6x6 with N in the range [1-6]. ============ */ void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) { int i, j, k, l, n; float *dstPtr; const float *m1Ptr, *m2Ptr; double sum; assert( m1.GetNumRows() == m2.GetNumRows() ); m1Ptr = m1.ToFloatPtr(); m2Ptr = m2.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); k = m1.GetNumColumns(); l = m2.GetNumColumns(); switch( m1.GetNumRows() ) { case 1: if ( !((k^6)|(l^1)) ) { // 1x6 * 1x1 __asm { mov esi, m2Ptr mov edi, m1Ptr mov eax, dstPtr movss xmm0, [esi] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movaps xmm1, xmm0 mulps xmm0, [edi] mulps xmm1, [edi+16] movaps [eax], xmm0 movlps [eax+16], xmm1 } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0]; m2Ptr++; } m1Ptr++; } break; case 2: if ( !((k^6)|(l^2)) ) { // 2x6 * 2x2 #define MUL_2xN_2x2_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi] \ __asm shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm movlps xmm1, [esi+8] \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 1, 0, 1 ) #define MUL_2xN_2x2_ROW2( N, row ) \ __asm movlps xmm6, [edi+(row+0*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm movlps xmm7, [edi+(row+1*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm0 \ __asm mulps xmm7, xmm1 \ __asm addps xmm6, xmm7 \ __asm movaps [eax+(row*2)*4], xmm6 MUL_2xN_2x2_INIT MUL_2xN_2x2_ROW2( 6, 0 ) MUL_2xN_2x2_ROW2( 6, 2 ) MUL_2xN_2x2_ROW2( 6, 4 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l]; m2Ptr++; } m1Ptr++; } break; case 3: if ( !((k^6)|(l^3)) ) { // 3x6 * 3x3 #define MUL_3xN_3x3_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movss xmm0, [esi+(0*3+0)*4] \ __asm movhps xmm0, [esi+(0*3+1)*4] \ __asm movss xmm1, [esi+(1*3+0)*4] \ __asm movhps xmm1, [esi+(1*3+1)*4] \ __asm movss xmm2, [esi+(2*3+0)*4] \ __asm movhps xmm2, [esi+(2*3+1)*4] #define MUL_3xN_3x3_INIT_ROW4 \ __asm shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 2, 3, 0 ) \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 2, 3, 0 ) \ __asm shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 2, 3, 0 ) #define MUL_3xN_3x3_ROW4( N, row ) \ __asm movlps xmm3, [edi+(row+0*N+0)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 1 ) \ __asm movlps xmm4, [edi+(row+1*N+0)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 1 ) \ __asm movlps xmm5, [edi+(row+2*N+0)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 1 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movaps [eax+(row*3+0)*4], xmm3 \ __asm shufps 
xmm0, xmm0, R_SHUFFLE_PS( 1, 2, 3, 1 ) \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 3, 1 ) \ __asm shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 1 ) \ __asm movlps xmm3, [edi+(row+0*N+1)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm movlps xmm4, [edi+(row+1*N+1)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm movlps xmm5, [edi+(row+2*N+1)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movaps [eax+(row*3+4)*4], xmm3 \ __asm shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 2, 3, 1 ) \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 3, 1 ) \ __asm shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 1 ) \ __asm movlps xmm3, [edi+(row+0*N+2)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 1, 1, 1 ) \ __asm movlps xmm4, [edi+(row+1*N+2)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 1, 1, 1 ) \ __asm movlps xmm5, [edi+(row+2*N+2)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 1, 1, 1 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movaps [eax+(row*3+8)*4], xmm3 #define MUL_3xN_3x3_INIT_ROW4_ROW4 \ __asm shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 2, 3, 0 ) \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 3, 0 ) \ __asm shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 0 ) #define MUL_3xN_3x3_INIT_ROW4_ROW \ __asm shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 1, 2, 3 ) \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 2, 3 ) \ __asm shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 1, 2, 3 ) #define MUL_3xN_3x3_ROW( N, row ) \ __asm movss xmm3, [edi+(row+0*N)*4] \ __asm shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm movss xmm4, [edi+(row+1*N)*4] \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm movss xmm5, [edi+(row+2*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm3, xmm0 \ __asm mulps xmm4, xmm1 \ __asm mulps xmm5, xmm2 \ __asm addps xmm3, xmm4 \ __asm addps xmm3, xmm5 \ __asm movss [eax+(row*3+0)*4], xmm3 \ __asm movhps [eax+(row*3+1)*4], xmm3 MUL_3xN_3x3_INIT MUL_3xN_3x3_INIT_ROW4 MUL_3xN_3x3_ROW4( 6, 0 ) MUL_3xN_3x3_INIT_ROW4_ROW MUL_3xN_3x3_ROW( 6, 4 ) MUL_3xN_3x3_ROW( 6, 5 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l]; m2Ptr++; } m1Ptr++; } break; case 4: if ( !((k^6)|(l^4)) ) { // 4x6 * 4x4 #define MUL_4xN_4x4_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movaps xmm0, [esi] \ __asm movaps xmm1, [esi+16] \ __asm movaps xmm2, [esi+32] \ __asm movaps xmm3, [esi+48] #define MUL_4xN_4x4_ROW( N, row ) \ __asm movss xmm7, [edi+(row+0*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+(row+1*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+2*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+3*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movaps [eax+row*16], xmm7 MUL_4xN_4x4_INIT MUL_4xN_4x4_ROW( 6, 0 ) MUL_4xN_4x4_ROW( 6, 1 ) MUL_4xN_4x4_ROW( 6, 2 ) MUL_4xN_4x4_ROW( 6, 3 ) MUL_4xN_4x4_ROW( 6, 4 ) MUL_4xN_4x4_ROW( 6, 5 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 
0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l]; m2Ptr++; } m1Ptr++; } break; case 5: if ( !((k^6)|(l^5)) ) { // 5x6 * 5x5 #define MUL_5xN_5x5_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 5*4] \ __asm movhps xmm1, [esi+ 7*4] \ __asm movlps xmm2, [esi+10*4] \ __asm movhps xmm2, [esi+12*4] \ __asm movlps xmm3, [esi+15*4] \ __asm movhps xmm3, [esi+17*4] \ __asm movlps xmm4, [esi+20*4] \ __asm movhps xmm4, [esi+22*4] #define MUL_5xN_5x5_ROW( N, row ) \ __asm movss xmm6, [edi+(row+0*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm0 \ __asm fld dword ptr [edi+(row+0*N)*4] \ __asm fmul dword ptr [esi+ 4*4] \ __asm movss xmm5, [edi+(row+1*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm1 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+1*N)*4] \ __asm fmul dword ptr [esi+ 9*4] \ __asm faddp st(1),st \ __asm movss xmm5, [edi+(row+2*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm2 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+2*N)*4] \ __asm fmul dword ptr [esi+14*4] \ __asm faddp st(1),st \ __asm movss xmm5, [edi+(row+3*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm3 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+3*N)*4] \ __asm fmul dword ptr [esi+19*4] \ __asm faddp st(1),st \ __asm movss xmm5, [edi+(row+4*N)*4] \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm5, xmm4 \ __asm addps xmm6, xmm5 \ __asm fld dword ptr [edi+(row+4*N)*4] \ __asm fmul dword ptr [esi+24*4] \ __asm faddp st(1),st \ __asm fstp dword ptr [eax+(row*5+4)*4] \ __asm movlps [eax+(row*5+0)*4], xmm6 \ __asm movhps [eax+(row*5+2)*4], xmm6 MUL_5xN_5x5_INIT MUL_5xN_5x5_ROW( 6, 0 ) MUL_5xN_5x5_ROW( 6, 1 ) MUL_5xN_5x5_ROW( 6, 2 ) MUL_5xN_5x5_ROW( 6, 3 ) MUL_5xN_5x5_ROW( 6, 4 ) MUL_5xN_5x5_ROW( 6, 5 ) return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l]; m2Ptr++; } m1Ptr++; } break; case 6: if ( !(l^6) ) { switch( k ) { case 1: { // 6x1 * 6x6 #define MUL_6xN_6x6_FIRST4COLUMNS_INIT \ __asm mov esi, m2Ptr \ __asm mov edi, m1Ptr \ __asm mov eax, dstPtr \ __asm movlps xmm0, [esi+ 0*4] \ __asm movhps xmm0, [esi+ 2*4] \ __asm movlps xmm1, [esi+ 6*4] \ __asm movhps xmm1, [esi+ 8*4] \ __asm movlps xmm2, [esi+12*4] \ __asm movhps xmm2, [esi+14*4] \ __asm movlps xmm3, [esi+18*4] \ __asm movhps xmm3, [esi+20*4] \ __asm movlps xmm4, [esi+24*4] \ __asm movhps xmm4, [esi+26*4] \ __asm movlps xmm5, [esi+30*4] \ __asm movhps xmm5, [esi+32*4] #define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \ __asm movss xmm7, [edi+(row+0*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+(row+1*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+2*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+3*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+4*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 
0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(row+5*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movlps [eax+(row*6+0)*4], xmm7 \ __asm movhps [eax+(row*6+2)*4], xmm7 #define MUL_6xN_6x6_LAST2COLUMNS_INIT \ __asm movlps xmm0, [esi+ 4*4] \ __asm movlps xmm1, [esi+10*4] \ __asm shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm movlps xmm2, [esi+16*4] \ __asm movlps xmm3, [esi+22*4] \ __asm shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm movlps xmm4, [esi+28*4] \ __asm movlps xmm5, [esi+34*4] \ __asm shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 1, 0, 1 ) \ __asm shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 1, 0, 1 ) #define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \ __asm movlps xmm7, [edi+(row*2+0*N)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm7, xmm0 \ __asm movlps xmm6, [edi+(row*2+1*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+2*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+3*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+4*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movlps xmm6, [edi+(row*2+5*N)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 1, 1 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movlps [eax+(row*12+ 4)*4], xmm7 \ __asm movhps [eax+(row*12+10)*4], xmm7 #define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \ __asm movss xmm7, [edi+(1*N-1)*4] \ __asm shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm7, xmm0 \ __asm movss xmm6, [edi+(2*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm1 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(3*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm2 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(4*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm3 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(5*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm4 \ __asm addps xmm7, xmm6 \ __asm movss xmm6, [edi+(6*N-1)*4] \ __asm shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) \ __asm mulps xmm6, xmm5 \ __asm addps xmm7, xmm6 \ __asm movlps [eax+(row*6+4)*4], xmm7 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 ) return; } case 2: { // 6x2 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 ) return; } case 3: { // 6x3 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 ) MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 ) return; } case 4: { // 6x4 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 ) 
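// columns 4 and 5 of each 6-wide result row do not fit in the four-lane
// registers used above, so a second pass computes them separately, packing
// the last two columns of two result rows into a single register.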
MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 ) MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 ) return; } case 5: { // 6x5 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 ) MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 ) MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 ) return; } case 6: { // 6x6 * 6x6 MUL_6xN_6x6_FIRST4COLUMNS_INIT MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 ) MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 ) MUL_6xN_6x6_LAST2COLUMNS_INIT MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 ) MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 ) MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 ) return; } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l]; m2Ptr++; } m1Ptr++; } break; default: for ( i = 0; i < k; i++ ) { for ( j = 0; j < l; j++ ) { m1Ptr = m1.ToFloatPtr() + i; m2Ptr = m2.ToFloatPtr() + j; sum = m1Ptr[0] * m2Ptr[0]; for ( n = 1; n < m1.GetNumRows(); n++ ) { m1Ptr += k; m2Ptr += l; sum += m1Ptr[0] * m2Ptr[0]; } *dstPtr++ = sum; } } break; } } /* ============ idSIMD_SSE::MatX_LowerTriangularSolve solves x in Lx = b for the n * n sub-matrix of L if skip > 0 the first skip elements of x are assumed to be valid already L has to be a lower triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) { int nc; const float *lptr; if ( skip >= n ) { return; } lptr = L.ToFloatPtr(); nc = L.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { #define NSKIP( n, s ) ((n<<3)|(s&7)) switch( NSKIP( n, skip ) ) { case NSKIP( 1, 0 ): x[0] = b[0]; return; case NSKIP( 2, 0 ): x[0] = b[0]; case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; return; case NSKIP( 3, 0 ): x[0] = b[0]; case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; return; case NSKIP( 4, 0 ): x[0] = b[0]; case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; return; case NSKIP( 5, 0 ): x[0] = b[0]; case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; return; case NSKIP( 6, 0 ): x[0] = b[0]; case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - 
lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; return; case NSKIP( 7, 0 ): x[0] = b[0]; case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0]; case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3]; case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4]; case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5]; return; } return; } // process first 4 rows switch( skip ) { case 0: x[0] = b[0]; case 1: x[1] = b[1] - lptr[1*nc+0] * x[0]; case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1]; case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2]; skip = 4; } lptr = L[skip]; // this code assumes n > 4 __asm { push ebx mov eax, skip // eax = i shl eax, 2 // eax = i*4 mov edx, n // edx = n shl edx, 2 // edx = n*4 mov esi, x // esi = x mov edi, lptr // edi = lptr add esi, eax add edi, eax mov ebx, b // ebx = b // check for aligned memory mov ecx, nc shl ecx, 2 or ecx, esi or ecx, edi and ecx, 15 jnz loopurow // aligned looprow: mov ecx, eax neg ecx movaps xmm0, [esi+ecx] mulps xmm0, [edi+ecx] add ecx, 12*4 jg donedot8 dot8: movaps xmm1, [esi+ecx-(8*4)] mulps xmm1, [edi+ecx-(8*4)] addps xmm0, xmm1 movaps xmm3, [esi+ecx-(4*4)] mulps xmm3, [edi+ecx-(4*4)] addps xmm0, xmm3 add ecx, 8*4 jle dot8 donedot8: sub ecx, 4*4 jg donedot4 //dot4: movaps xmm1, [esi+ecx-(4*4)] mulps xmm1, [edi+ecx-(4*4)] addps xmm0, xmm1 add ecx, 4*4 donedot4: movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm1 sub ecx, 4*4 jz dot0 add ecx, 4 jz dot1 add ecx, 4 jz dot2 //dot3: movss xmm1, [esi-(3*4)] mulss xmm1, [edi-(3*4)] addss xmm0, xmm1 dot2: movss xmm3, [esi-(2*4)] mulss xmm3, [edi-(2*4)] addss xmm0, xmm3 dot1: movss xmm5, [esi-(1*4)] mulss xmm5, [edi-(1*4)] addss xmm0, xmm5 dot0: movss xmm1, [ebx+eax] subss xmm1, xmm0 movss [esi], xmm1 add eax, 4 cmp eax, edx jge done add esi, 4 mov ecx, nc shl ecx, 2 add edi, ecx add edi, 4 jmp looprow // unaligned loopurow: mov ecx, eax neg ecx movups xmm0, [esi+ecx] movups xmm1, [edi+ecx] mulps xmm0, xmm1 add ecx, 12*4 jg doneudot8 udot8: movups xmm1, [esi+ecx-(8*4)] movups xmm2, [edi+ecx-(8*4)] mulps xmm1, xmm2 addps xmm0, xmm1 movups xmm3, [esi+ecx-(4*4)] movups xmm4, [edi+ecx-(4*4)] mulps xmm3, xmm4 addps xmm0, xmm3 add ecx, 8*4 jle udot8 doneudot8: sub ecx, 4*4 jg doneudot4 //udot4: movups xmm1, [esi+ecx-(4*4)] movups xmm2, [edi+ecx-(4*4)] mulps xmm1, xmm2 addps xmm0, xmm1 add ecx, 4*4 doneudot4: movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm1 sub ecx, 4*4 jz udot0 add ecx, 4 jz udot1 add ecx, 4 jz udot2 //udot3: movss xmm1, [esi-(3*4)] movss xmm2, [edi-(3*4)] mulss xmm1, xmm2 addss xmm0, xmm1 udot2: movss xmm3, [esi-(2*4)] movss xmm4, [edi-(2*4)] mulss xmm3, xmm4 addss xmm0, xmm3 udot1: movss xmm5, [esi-(1*4)] movss xmm6, [edi-(1*4)] mulss xmm5, xmm6 addss xmm0, xmm5 udot0: movss xmm1, [ebx+eax] subss xmm1, xmm0 movss [esi], xmm1 add eax, 4 cmp eax, edx jge done add esi, 4 mov ecx, nc shl ecx, 2 add edi, ecx add edi, 4 jmp loopurow done: pop ebx } } /* ============ 
idSIMD_SSE::MatX_LowerTriangularSolveTranspose solves x in L'x = b for the n * n sub-matrix of L L has to be a lower triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) { int nc; const float *lptr; lptr = L.ToFloatPtr(); nc = L.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { switch( n ) { case 0: return; case 1: x[0] = b[0]; return; case 2: x[1] = b[1]; x[0] = b[0] - lptr[1*nc+0] * x[1]; return; case 3: x[2] = b[2]; x[1] = b[1] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 4: x[3] = b[3]; x[2] = b[2] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 5: x[4] = b[4]; x[3] = b[3] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 6: x[5] = b[5]; x[4] = b[4] - lptr[5*nc+4] * x[5]; x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; case 7: x[6] = b[6]; x[5] = b[5] - lptr[6*nc+5] * x[6]; x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5]; x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4]; x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3]; x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2]; x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1]; return; } return; } #if 1 int i, j, m; float *xptr; double s0; // if the number of columns is not a multiple of 2 we're screwed for alignment. 
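// (the movlps/movhps pairs used below have no hard alignment requirement, but
// they are cheapest when 8 byte aligned, and advancing nc*4 bytes per row only
// preserves 8 byte alignment when nc is even)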
// however, if the number of columns is a multiple of 2 but the number of to be // processed rows is not a multiple of 2 we can still run 8 byte aligned m = n; if ( m & 1 ) { m--; x[m] = b[m]; lptr = L.ToFloatPtr() + m * nc + m - 4; xptr = x + m; __asm { push ebx mov eax, m // eax = i mov esi, xptr // esi = xptr mov edi, lptr // edi = lptr mov ebx, b // ebx = b mov edx, nc // edx = nc*sizeof(float) shl edx, 2 process4rows_1: movlps xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1] movhps xmm0, [ebx+eax*4-8] // load b[i-4], b[i-3] xor ecx, ecx sub eax, m neg eax jz done4x4_1 process4x4_1: // process 4x4 blocks movlps xmm2, [edi+0] movhps xmm2, [edi+8] add edi, edx movss xmm1, [esi+4*ecx+0] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm3, [edi+0] movhps xmm3, [edi+8] add edi, edx mulps xmm1, xmm2 subps xmm0, xmm1 movss xmm1, [esi+4*ecx+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm4, [edi+0] movhps xmm4, [edi+8] add edi, edx mulps xmm1, xmm3 subps xmm0, xmm1 movss xmm1, [esi+4*ecx+8] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm5, [edi+0] movhps xmm5, [edi+8] add edi, edx mulps xmm1, xmm4 subps xmm0, xmm1 movss xmm1, [esi+4*ecx+12] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) add ecx, 4 cmp ecx, eax mulps xmm1, xmm5 subps xmm0, xmm1 jl process4x4_1 done4x4_1: // process left over of the 4 rows movlps xmm2, [edi+0] movhps xmm2, [edi+8] movss xmm1, [esi+4*ecx] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, xmm2 subps xmm0, xmm1 imul ecx, edx sub edi, ecx neg eax add eax, m sub eax, 4 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 ) movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLE_PS( 2, 2, 2, 2 ) movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 3, 3, 3, 3 ) sub edi, edx movss [esi-4], xmm3 // xptr[-1] = s3 movss xmm4, xmm3 movss xmm5, xmm3 mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3 mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3 mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3 subss xmm2, xmm3 movss [esi-8], xmm2 // xptr[-2] = s2 movss xmm6, xmm2 sub edi, edx subss xmm0, xmm5 subss xmm1, xmm4 mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2 mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2 subss xmm1, xmm2 movss [esi-12], xmm1 // xptr[-3] = s1 subss xmm0, xmm6 sub edi, edx cmp eax, 4 mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1 subss xmm0, xmm1 movss [esi-16], xmm0 // xptr[-4] = s0 jl done4rows_1 sub edi, edx sub edi, 16 sub esi, 16 jmp process4rows_1 done4rows_1: pop ebx } } else { lptr = L.ToFloatPtr() + m * nc + m - 4; xptr = x + m; __asm { push ebx mov eax, m // eax = i mov esi, xptr // esi = xptr mov edi, lptr // edi = lptr mov ebx, b // ebx = b mov edx, nc // edx = nc*sizeof(float) shl edx, 2 process4rows: movlps xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1] movhps xmm0, [ebx+eax*4-8] // load b[i-4], b[i-3] sub eax, m jz done4x4 neg eax xor ecx, ecx process4x4: // process 4x4 blocks movlps xmm2, [edi+0] movhps xmm2, [edi+8] add edi, edx movss xmm1, [esi+4*ecx+0] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm3, [edi+0] movhps xmm3, [edi+8] add edi, edx mulps xmm1, xmm2 subps xmm0, xmm1 movss xmm1, [esi+4*ecx+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm4, [edi+0] movhps xmm4, [edi+8] add edi, edx mulps xmm1, xmm3 subps xmm0, xmm1 movss xmm1, [esi+4*ecx+8] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps xmm5, [edi+0] movhps xmm5, [edi+8] add edi, edx mulps xmm1, xmm4 subps xmm0, xmm1 movss xmm1, [esi+4*ecx+12] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) add ecx, 4 cmp ecx, eax mulps xmm1, xmm5 subps xmm0, xmm1 jl process4x4 imul ecx, edx 
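// ecx*edx (just computed) is the total lptr advance of the 4x4 loop: edi moved
// 4*edx bytes per block while ecx grew by 4 per block, so the sub below rewinds
// lptr to where this group of four rows started.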
sub edi, ecx neg eax done4x4: // process left over of the 4 rows add eax, m sub eax, 4 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 ) movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLE_PS( 2, 2, 2, 2 ) movaps xmm3, xmm0 shufps xmm3, xmm3, R_SHUFFLE_PS( 3, 3, 3, 3 ) sub edi, edx movss [esi-4], xmm3 // xptr[-1] = s3 movss xmm4, xmm3 movss xmm5, xmm3 mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3 mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3 mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3 subss xmm2, xmm3 movss [esi-8], xmm2 // xptr[-2] = s2 movss xmm6, xmm2 sub edi, edx subss xmm0, xmm5 subss xmm1, xmm4 mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2 mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2 subss xmm1, xmm2 movss [esi-12], xmm1 // xptr[-3] = s1 subss xmm0, xmm6 sub edi, edx cmp eax, 4 mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1 subss xmm0, xmm1 movss [esi-16], xmm0 // xptr[-4] = s0 jl done4rows sub edi, edx sub edi, 16 sub esi, 16 jmp process4rows done4rows: pop ebx } } // process left over rows for ( i = (m&3)-1; i >= 0; i-- ) { s0 = b[i]; lptr = L[0] + i; for ( j = i + 1; j < n; j++ ) { s0 -= lptr[j*nc] * x[j]; } x[i] = s0; } #else int i, j, m; double s0, s1, s2, s3, t; const float *lptr2; float *xptr, *xptr2; m = n; if ( m & 1 ) { m--; x[m] = b[m]; lptr = L.ToFloatPtr() + m * nc + m - 4; xptr = x + m; // process 4 rows at a time for ( i = m; i >= 4; i -= 4 ) { s0 = b[i-4]; s1 = b[i-3]; s2 = b[i-2]; s3 = b[i-1]; // process 4x4 blocks xptr2 = xptr; // x + i; lptr2 = lptr; // ptr = L[i] + i - 4; for ( j = 0; j < m-i; j += 4 ) { t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; } t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; // process left over of the 4 rows lptr -= nc; s0 -= lptr[0] * s3; s1 -= lptr[1] * s3; s2 -= lptr[2] * s3; lptr -= nc; s0 -= lptr[0] * s2; s1 -= lptr[1] * s2; lptr -= nc; s0 -= lptr[0] * s1; lptr -= nc; // store result xptr[-4] = s0; xptr[-3] = s1; xptr[-2] = s2; xptr[-1] = s3; // update pointers for next four rows lptr -= 4; xptr -= 4; } } else { lptr = L.ToFloatPtr() + m * nc + m - 4; xptr = x + m; // process 4 rows at a time for ( i = m; i >= 4; i -= 4 ) { s0 = b[i-4]; s1 = b[i-3]; s2 = b[i-2]; s3 = b[i-1]; // process 4x4 blocks xptr2 = xptr; // x + i; lptr2 = lptr; // ptr = L[i] + i - 4; for ( j = 0; j < m-i; j += 4 ) { t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; t = xptr2[0]; s0 -= lptr2[0] * t; s1 -= lptr2[1] * t; s2 -= lptr2[2] * t; s3 -= lptr2[3] * t; lptr2 += nc; xptr2++; } // process left over of the 4 rows lptr -= nc; s0 -= lptr[0] * s3; s1 -= lptr[1] * s3; s2 -= lptr[2] * s3; lptr -= nc; s0 -= lptr[0] * s2; s1 -= lptr[1] * s2; lptr -= nc; s0 -= lptr[0] * s1; lptr -= nc; // store result xptr[-4] = s0; xptr[-3] = s1; xptr[-2] = s2; xptr[-1] = s3; // update pointers for next four rows lptr -= 4; xptr -= 4; } } // process 
left over rows for ( i--; i >= 0; i-- ) { s0 = b[i]; lptr = L[0] + i; for ( j = i + 1; j < m; j++ ) { s0 -= lptr[j*nc] * x[j]; } x[i] = s0; } #endif } /* ============ idSIMD_SSE::MatX_UpperTriangularSolve solves x in Ux = b for the n * n sub-matrix of U U has to be an upper triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE::MatX_UpperTriangularSolve( const idMatX &U, float *x, const float *b, const int n ) { int nc; const float *uptr; uptr = U.ToFloatPtr(); nc = U.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { switch( n ) { case 0: return; case 1: x[0] = b[0]; return; case 2: x[1] = b[1]; x[0] = b[0] - uptr[0*nc+1] * x[1]; return; case 3: x[2] = b[2]; x[1] = b[1] - uptr[1*nc+2] * x[2]; x[0] = b[0] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1]; return; case 4: x[3] = b[3]; x[2] = b[2] - uptr[2*nc+3] * x[3]; x[1] = b[1] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2]; x[0] = b[0] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1]; return; case 5: x[4] = b[4]; x[3] = b[3] - uptr[3*nc+4] * x[4]; x[2] = b[2] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3]; x[1] = b[1] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2]; x[0] = b[0] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1]; return; case 6: x[5] = b[5]; x[4] = b[4] - uptr[4*nc+5] * x[5]; x[3] = b[3] - uptr[3*nc+5] * x[5] - uptr[3*nc+4] * x[4]; x[2] = b[2] - uptr[2*nc+5] * x[5] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3]; x[1] = b[1] - uptr[1*nc+5] * x[5] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2]; x[0] = b[0] - uptr[0*nc+5] * x[5] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1]; return; case 7: x[6] = b[6]; x[5] = b[5] - uptr[5*nc+6] * x[6]; x[4] = b[4] - uptr[4*nc+6] * x[6] - uptr[4*nc+5] * x[5]; x[3] = b[3] - uptr[3*nc+6] * x[6] - uptr[3*nc+5] * x[5] - uptr[3*nc+4] * x[4]; x[2] = b[2] - uptr[2*nc+6] * x[6] - uptr[2*nc+5] * x[5] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3]; x[1] = b[1] - uptr[1*nc+6] * x[6] - uptr[1*nc+5] * x[5] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2]; x[0] = b[0] - uptr[0*nc+6] * x[6] - uptr[0*nc+5] * x[5] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1]; return; } return; } int i, j; register double s0, s1, s2, s3; // process the last four rows x[n-1] = b[n-1]; x[n-2] = b[n-2] - uptr[(n-2)*nc+(n-1)] * x[n-1]; x[n-3] = b[n-3] - uptr[(n-3)*nc+(n-1)] * x[n-1] - uptr[(n-3)*nc+(n-2)] * x[n-2]; x[n-4] = b[n-4] - uptr[(n-4)*nc+(n-1)] * x[n-1] - uptr[(n-4)*nc+(n-2)] * x[n-2] - uptr[(n-4)*nc+(n-3)] * x[n-3]; uptr = U[n - 5]; for ( i = n - 5; i >= 0; i-- ) { s0 = uptr[i+1] * x[i+1]; s1 = uptr[i+2] * x[i+2]; s2 = uptr[i+3] * x[i+3]; s3 = uptr[i+4] * x[i+4]; for ( j = i + 5; j < n-7; j += 8 ) { s0 += uptr[j+0] * x[j+0]; s1 += uptr[j+1] * x[j+1]; s2 += uptr[j+2] * x[j+2]; s3 += uptr[j+3] * x[j+3]; s0 += uptr[j+4] * x[j+4]; s1 += uptr[j+5] * x[j+5]; s2 += uptr[j+6] * x[j+6]; s3 += uptr[j+7] * x[j+7]; } switch( n - j ) { NODEFAULT; case 7: s0 += uptr[j+6] * x[j+6]; case 6: s1 += uptr[j+5] * x[j+5]; case 5: s2 += uptr[j+4] * x[j+4]; case 4: s3 += uptr[j+3] * x[j+3]; case 3: s0 += uptr[j+2] * x[j+2]; case 2: s1 += uptr[j+1] * x[j+1]; case 1: s2 += uptr[j+0] * x[j+0]; case 0: break; } double sum; sum = s3; sum += s2; sum += s1; sum += s0; sum -= b[i]; x[i] = -sum; uptr -= nc; } } /* ============ idSIMD_SSE::MatX_UpperTriangularSolveTranspose solves x in U'x = b for the n * n 
sub-matrix of U U has to be an upper triangular matrix with (implicit) ones on the diagonal x == b is allowed ============ */ void VPCALL idSIMD_SSE::MatX_UpperTriangularSolveTranspose( const idMatX &U, float *x, const float *b, const int n ) { int nc; const float *uptr; uptr = U.ToFloatPtr(); nc = U.GetNumColumns(); // unrolled cases for n < 8 if ( n < 8 ) { switch( n ) { case 0: return; case 1: x[0] = b[0]; return; case 2: x[0] = b[0]; x[1] = b[1] - uptr[0*nc+1] * x[0]; return; case 3: x[0] = b[0]; x[1] = b[1] - uptr[0*nc+1] * x[0]; x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1]; return; case 4: x[0] = b[0]; x[1] = b[1] - uptr[0*nc+1] * x[0]; x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1]; x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2]; return; case 5: x[0] = b[0]; x[1] = b[1] - uptr[0*nc+1] * x[0]; x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1]; x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2]; x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3]; return; case 6: x[0] = b[0]; x[1] = b[1] - uptr[0*nc+1] * x[0]; x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1]; x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2]; x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3]; x[5] = b[5] - uptr[0*nc+5] * x[0] - uptr[1*nc+5] * x[1] - uptr[2*nc+5] * x[2] - uptr[3*nc+5] * x[3] - uptr[4*nc+5] * x[4]; return; case 7: x[0] = b[0]; x[1] = b[1] - uptr[0*nc+1] * x[0]; x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1]; x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2]; x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3]; x[5] = b[5] - uptr[0*nc+5] * x[0] - uptr[1*nc+5] * x[1] - uptr[2*nc+5] * x[2] - uptr[3*nc+5] * x[3] - uptr[4*nc+5] * x[4]; x[6] = b[6] - uptr[0*nc+6] * x[0] - uptr[1*nc+6] * x[1] - uptr[2*nc+6] * x[2] - uptr[3*nc+6] * x[3] - uptr[4*nc+6] * x[4] - uptr[5*nc+6] * x[5]; return; } return; } int i, j; register double s0, s1, s2, s3; uptr = U.ToFloatPtr(); // process 4 columns at a time for ( i = 0; i < n - 3; i += 4 ) { s0 = b[i+0]; s1 = b[i+1]; s2 = b[i+2]; s3 = b[i+3]; // process 4x4 blocks for ( j = 0; j < i-3; j += 4 ) { s0 -= uptr[(j+0)*nc+0] * x[j+0]; s1 -= uptr[(j+0)*nc+1] * x[j+0]; s2 -= uptr[(j+0)*nc+2] * x[j+0]; s3 -= uptr[(j+0)*nc+3] * x[j+0]; s0 -= uptr[(j+1)*nc+0] * x[j+1]; s1 -= uptr[(j+1)*nc+1] * x[j+1]; s2 -= uptr[(j+1)*nc+2] * x[j+1]; s3 -= uptr[(j+1)*nc+3] * x[j+1]; s0 -= uptr[(j+2)*nc+0] * x[j+2]; s1 -= uptr[(j+2)*nc+1] * x[j+2]; s2 -= uptr[(j+2)*nc+2] * x[j+2]; s3 -= uptr[(j+2)*nc+3] * x[j+2]; s0 -= uptr[(j+3)*nc+0] * x[j+3]; s1 -= uptr[(j+3)*nc+1] * x[j+3]; s2 -= uptr[(j+3)*nc+2] * x[j+3]; s3 -= uptr[(j+3)*nc+3] * x[j+3]; } // process left over of the 4 columns s1 -= uptr[(j+0)*nc+1] * s0; s2 -= uptr[(j+0)*nc+2] * s0; s2 -= uptr[(j+1)*nc+2] * s1; s3 -= uptr[(j+0)*nc+3] * s0; s3 -= uptr[(j+1)*nc+3] * s1; s3 -= uptr[(j+2)*nc+3] * s2; // store result x[i+0] = s0; x[i+1] = s1; x[i+2] = s2; x[i+3] = s3; // update pointer for next four columns uptr += 4; } // process left over columns for ( ; i < n; i++ ) { s0 = b[i]; uptr = U[0] + i; for ( j = 0; j < i; j++ ) { s0 -= uptr[j*nc] * x[j]; } x[i] = s0; } } /* ============ idSIMD_SSE::MatX_LU_Factor in-place factorization LU of the n * n sub-matrix of mat the reciprocal of the diagonal elements of U are stored in invDiag no pivoting is used 
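for reference, each step i of the factorization performs a rank-1 update of the
trailing block (a sketch only; the code below unrolls the inner loop by 16 and
the index names here are illustrative):

	d = 1.0f / mat[i][i];							// fails and returns false if the pivot is zero
	for ( j = i + 1; j < n; j++ ) {
		mat[j][i] *= d;								// column of L
		for ( k = i + 1; k < n; k++ ) {
			mat[j][k] -= mat[j][i] * mat[i][k];		// update row j of the trailing block
		}
	}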
============ */ bool VPCALL idSIMD_SSE::MatX_LU_Factor( idMatX &mat, idVecX &invDiag, const int n ) { int i, j, k; float d1, d2, *ptr1, *ptr2; for ( i = 0; i < n; i++ ) { d1 = mat[i][i]; if ( d1 == 0.0f ) { return false; } invDiag[i] = d1 = 1.0f / d1; ptr1 = mat[i]; for ( j = i + 1; j < n; j++ ) { ptr2 = mat[j]; ptr2[i] = d2 = ptr2[i] * d1; for ( k = n - 1; k > i + 15; k -= 16 ) { ptr2[k-0] -= d2 * ptr1[k-0]; ptr2[k-1] -= d2 * ptr1[k-1]; ptr2[k-2] -= d2 * ptr1[k-2]; ptr2[k-3] -= d2 * ptr1[k-3]; ptr2[k-4] -= d2 * ptr1[k-4]; ptr2[k-5] -= d2 * ptr1[k-5]; ptr2[k-6] -= d2 * ptr1[k-6]; ptr2[k-7] -= d2 * ptr1[k-7]; ptr2[k-8] -= d2 * ptr1[k-8]; ptr2[k-9] -= d2 * ptr1[k-9]; ptr2[k-10] -= d2 * ptr1[k-10]; ptr2[k-11] -= d2 * ptr1[k-11]; ptr2[k-12] -= d2 * ptr1[k-12]; ptr2[k-13] -= d2 * ptr1[k-13]; ptr2[k-14] -= d2 * ptr1[k-14]; ptr2[k-15] -= d2 * ptr1[k-15]; } switch( k - i ) { NODEFAULT; case 15: ptr2[k-14] -= d2 * ptr1[k-14]; case 14: ptr2[k-13] -= d2 * ptr1[k-13]; case 13: ptr2[k-12] -= d2 * ptr1[k-12]; case 12: ptr2[k-11] -= d2 * ptr1[k-11]; case 11: ptr2[k-10] -= d2 * ptr1[k-10]; case 10: ptr2[k-9] -= d2 * ptr1[k-9]; case 9: ptr2[k-8] -= d2 * ptr1[k-8]; case 8: ptr2[k-7] -= d2 * ptr1[k-7]; case 7: ptr2[k-6] -= d2 * ptr1[k-6]; case 6: ptr2[k-5] -= d2 * ptr1[k-5]; case 5: ptr2[k-4] -= d2 * ptr1[k-4]; case 4: ptr2[k-3] -= d2 * ptr1[k-3]; case 3: ptr2[k-2] -= d2 * ptr1[k-2]; case 2: ptr2[k-1] -= d2 * ptr1[k-1]; case 1: ptr2[k-0] -= d2 * ptr1[k-0]; case 0: break; } } } return true; } /* ============ idSIMD_SSE::MatX_LDLT_Factor in-place factorization LDL' of the n * n sub-matrix of mat the reciprocal of the diagonal elements are stored in invDiag NOTE: the number of columns of mat must be a multiple of 4 ============ */ bool VPCALL idSIMD_SSE::MatX_LDLT_Factor( idMatX &mat, idVecX &invDiag, const int n ) { #if 1 int j, nc; float *v, *diag, *invDiagPtr, *mptr; double s0, s1, s2, sum, d; v = (float *) _alloca16( n * sizeof( float ) ); diag = (float *) _alloca16( n * sizeof( float ) ); invDiagPtr = invDiag.ToFloatPtr(); nc = mat.GetNumColumns(); assert( ( nc & 3 ) == 0 ); if ( n <= 0 ) { return true; } mptr = mat[0]; sum = mptr[0]; if ( sum == 0.0f ) { return false; } diag[0] = sum; invDiagPtr[0] = d = 1.0f / sum; if ( n <= 1 ) { return true; } mptr = mat[0]; for ( j = 1; j < n; j++ ) { mptr[j*nc+0] = ( mptr[j*nc+0] ) * d; } mptr = mat[1]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; sum = mptr[1] - s0; if ( sum == 0.0f ) { return false; } mat[1][1] = sum; diag[1] = sum; invDiagPtr[1] = d = 1.0f / sum; if ( n <= 2 ) { return true; } mptr = mat[0]; for ( j = 2; j < n; j++ ) { mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d; } mptr = mat[2]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; sum = mptr[2] - s0 - s1; if ( sum == 0.0f ) { return false; } mat[2][2] = sum; diag[2] = sum; invDiagPtr[2] = d = 1.0f / sum; if ( n <= 3 ) { return true; } mptr = mat[0]; for ( j = 3; j < n; j++ ) { mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d; } mptr = mat[3]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; sum = mptr[3] - s0 - s1 - s2; if ( sum == 0.0f ) { return false; } mat[3][3] = sum; diag[3] = sum; invDiagPtr[3] = d = 1.0f / sum; if ( n <= 4 ) { return true; } mptr = mat[0]; for ( j = 4; j < n; j++ ) { mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; } int ncf = nc * sizeof( 
float ); mptr = mat[0]; __asm { xorps xmm2, xmm2 xorps xmm3, xmm3 xorps xmm4, xmm4 push ebx mov ebx, 4 loopRow: cmp ebx, n jge done mov ecx, ebx // ecx = i shl ecx, 2 // ecx = i * 4 mov edx, diag // edx = diag add edx, ecx // edx = &diag[i] mov edi, ebx // edi = i imul edi, ncf // edi = i * nc * sizeof( float ) add edi, mptr // edi = mat[i] add edi, ecx // edi = &mat[i][i] mov esi, v // esi = v add esi, ecx // esi = &v[i] mov eax, invDiagPtr // eax = invDiagPtr add eax, ecx // eax = &invDiagPtr[i] neg ecx movaps xmm0, [edx+ecx] mulps xmm0, [edi+ecx] movaps [esi+ecx], xmm0 mulps xmm0, [edi+ecx] add ecx, 12*4 jg doneDot8 dot8: movaps xmm1, [edx+ecx-(8*4)] mulps xmm1, [edi+ecx-(8*4)] movaps [esi+ecx-(8*4)], xmm1 mulps xmm1, [edi+ecx-(8*4)] addps xmm0, xmm1 movaps xmm2, [edx+ecx-(4*4)] mulps xmm2, [edi+ecx-(4*4)] movaps [esi+ecx-(4*4)], xmm2 mulps xmm2, [edi+ecx-(4*4)] addps xmm0, xmm2 add ecx, 8*4 jle dot8 doneDot8: sub ecx, 4*4 jg doneDot4 movaps xmm1, [edx+ecx-(4*4)] mulps xmm1, [edi+ecx-(4*4)] movaps [esi+ecx-(4*4)], xmm1 mulps xmm1, [edi+ecx-(4*4)] addps xmm0, xmm1 add ecx, 4*4 doneDot4: sub ecx, 2*4 jg doneDot2 movlps xmm3, [edx+ecx-(2*4)] movlps xmm4, [edi+ecx-(2*4)] mulps xmm3, xmm4 movlps [esi+ecx-(2*4)], xmm3 mulps xmm3, xmm4 addps xmm0, xmm3 add ecx, 2*4 doneDot2: sub ecx, 1*4 jg doneDot1 movss xmm3, [edx+ecx-(1*4)] movss xmm4, [edi+ecx-(1*4)] mulss xmm3, xmm4 movss [esi+ecx-(1*4)], xmm3 mulss xmm3, xmm4 addss xmm0, xmm3 doneDot1: movhlps xmm2, xmm0 addps xmm0, xmm2 movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm2 movss xmm1, [edi] subss xmm1, xmm0 movss [edi], xmm1 // mptr[i] = sum; movss [edx], xmm1 // diag[i] = sum; // if ( sum == 0.0f ) return false; movaps xmm2, xmm1 cmpeqss xmm2, SIMD_SP_zero andps xmm2, SIMD_SP_tiny orps xmm1, xmm2 rcpss xmm7, xmm1 mulss xmm1, xmm7 mulss xmm1, xmm7 addss xmm7, xmm7 subss xmm7, xmm1 movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum; mov edx, n // edx = n sub edx, ebx // edx = n - i dec edx // edx = n - i - 1 jle doneSubRow // if ( i + 1 >= n ) return true; mov eax, ebx // eax = i shl eax, 2 // eax = i * 4 neg eax loopSubRow: add edi, ncf mov ecx, eax movaps xmm0, [esi+ecx] mulps xmm0, [edi+ecx] add ecx, 12*4 jg doneSubDot8 subDot8: movaps xmm1, [esi+ecx-(8*4)] mulps xmm1, [edi+ecx-(8*4)] addps xmm0, xmm1 movaps xmm2, [esi+ecx-(4*4)] mulps xmm2, [edi+ecx-(4*4)] addps xmm0, xmm2 add ecx, 8*4 jle subDot8 doneSubDot8: sub ecx, 4*4 jg doneSubDot4 movaps xmm1, [esi+ecx-(4*4)] mulps xmm1, [edi+ecx-(4*4)] addps xmm0, xmm1 add ecx, 4*4 doneSubDot4: sub ecx, 2*4 jg doneSubDot2 movlps xmm3, [esi+ecx-(2*4)] movlps xmm4, [edi+ecx-(2*4)] mulps xmm3, xmm4 addps xmm0, xmm3 add ecx, 2*4 doneSubDot2: sub ecx, 1*4 jg doneSubDot1 movss xmm3, [esi+ecx-(1*4)] movss xmm4, [edi+ecx-(1*4)] mulss xmm3, xmm4 addss xmm0, xmm3 doneSubDot1: movhlps xmm2, xmm0 addps xmm0, xmm2 movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 0, 0, 0 ) addss xmm0, xmm2 movss xmm1, [edi] subss xmm1, xmm0 mulss xmm1, xmm7 movss [edi], xmm1 dec edx jg loopSubRow doneSubRow: inc ebx jmp loopRow done: pop ebx } return true; #else int i, j, k, nc; float *v, *diag, *mptr; double s0, s1, s2, s3, sum, d; v = (float *) _alloca16( n * sizeof( float ) ); diag = (float *) _alloca16( n * sizeof( float ) ); nc = mat.GetNumColumns(); if ( n <= 0 ) { return true; } mptr = mat[0]; sum = mptr[0]; if ( sum == 0.0f ) { return false; } diag[0] = sum; invDiag[0] = d = 1.0f / sum; if ( n <= 1 ) { return true; } mptr = mat[0]; for ( j = 1; j < n; j++ ) { mptr[j*nc+0] = (
mptr[j*nc+0] ) * d; } mptr = mat[1]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; sum = mptr[1] - s0; if ( sum == 0.0f ) { return false; } mat[1][1] = sum; diag[1] = sum; invDiag[1] = d = 1.0f / sum; if ( n <= 2 ) { return true; } mptr = mat[0]; for ( j = 2; j < n; j++ ) { mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d; } mptr = mat[2]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; sum = mptr[2] - s0 - s1; if ( sum == 0.0f ) { return false; } mat[2][2] = sum; diag[2] = sum; invDiag[2] = d = 1.0f / sum; if ( n <= 3 ) { return true; } mptr = mat[0]; for ( j = 3; j < n; j++ ) { mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d; } mptr = mat[3]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; sum = mptr[3] - s0 - s1 - s2; if ( sum == 0.0f ) { return false; } mat[3][3] = sum; diag[3] = sum; invDiag[3] = d = 1.0f / sum; if ( n <= 4 ) { return true; } mptr = mat[0]; for ( j = 4; j < n; j++ ) { mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d; } for ( i = 4; i < n; i++ ) { mptr = mat[i]; v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0]; v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1]; v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2]; v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3]; for ( k = 4; k < i-3; k += 4 ) { v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0]; v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1]; v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2]; v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3]; } switch( i - k ) { NODEFAULT; case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2]; case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1]; case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0]; case 0: break; } sum = s3; sum += s2; sum += s1; sum += s0; sum = mptr[i] - sum; if ( sum == 0.0f ) { return false; } mat[i][i] = sum; diag[i] = sum; invDiag[i] = d = 1.0f / sum; if ( i + 1 >= n ) { return true; } mptr = mat[i+1]; for ( j = i+1; j < n; j++ ) { s0 = mptr[0] * v[0]; s1 = mptr[1] * v[1]; s2 = mptr[2] * v[2]; s3 = mptr[3] * v[3]; for ( k = 4; k < i-7; k += 8 ) { s0 += mptr[k+0] * v[k+0]; s1 += mptr[k+1] * v[k+1]; s2 += mptr[k+2] * v[k+2]; s3 += mptr[k+3] * v[k+3]; s0 += mptr[k+4] * v[k+4]; s1 += mptr[k+5] * v[k+5]; s2 += mptr[k+6] * v[k+6]; s3 += mptr[k+7] * v[k+7]; } switch( i - k ) { NODEFAULT; case 7: s0 += mptr[k+6] * v[k+6]; case 6: s1 += mptr[k+5] * v[k+5]; case 5: s2 += mptr[k+4] * v[k+4]; case 4: s3 += mptr[k+3] * v[k+3]; case 3: s0 += mptr[k+2] * v[k+2]; case 2: s1 += mptr[k+1] * v[k+1]; case 1: s2 += mptr[k+0] * v[k+0]; case 0: break; } sum = s3; sum += s2; sum += s1; sum += s0; mptr[i] = ( mptr[i] - sum ) * d; mptr += nc; } } return true; #endif } /* ============ idSIMD_SSE::DecompressJoints ============ */ #pragma warning( disable : 4731 ) // frame pointer register 'ebp' modified by inline assembly code void VPCALL idSIMD_SSE::DecompressJoints( idJointQuat *joints, const idCompressedJointQuat *compressedJoints, const int *index, const int numJoints ) { #if 1 __asm { mov esi, compressedJoints mov edi, joints mov edx, index mov eax, numJoints shl eax, 2 jz doney add edx, eax neg eax movaps xmm7, SIMD_SP_clearLast movaps xmm6, SIMD_SP_decofs movaps xmm5, SIMD_SP_decquat movaps xmm4, SIMD_SP_one push ebp loopy: mov ebx, [edx + eax] mov ebp, ebx lea ebx, [ebx*2+ebx] shl ebx, 2 movsx ecx, word 
ptr [esi + ebx + 0]; cvtsi2ss xmm0, ecx movsx ecx, word ptr [esi + ebx + 2]; cvtsi2ss xmm1, ecx movsx ecx, word ptr [esi + ebx + 4]; cvtsi2ss xmm2, ecx shufps xmm1, xmm1, SHUFFLE_PS( 0, 0, 0, 0 ) unpcklps xmm0, xmm1 movlhps xmm0, xmm2 andps xmm0, xmm7 mulps xmm0, xmm5 movaps xmm1, xmm0 movaps xmm2, xmm0 shufps xmm2, xmm2, SHUFFLE_PS( 1, 1, 1, 1 ) movhlps xmm3, xmm0 mulss xmm1, xmm1 mulss xmm2, xmm2 mulss xmm3, xmm3 addss xmm1, xmm2 addss xmm1, xmm3 // xmm1 = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] movaps xmm2, xmm4 subss xmm2, xmm1 // 1 - xmm1 andps xmm2, SIMD_SP_absMask // make sure the values are positive addps xmm2, SIMD_SP_tiny //xmm1 movaps xmm1, xmm2 rsqrtss xmm3, xmm1 mulps xmm2, xmm3 shufps xmm2, xmm2, SHUFFLE_PS( 0, 0, 0, 0 ) andps xmm2, SIMD_SP_clearFirstThree orps xmm0, xmm2 // xmm0 contains quat movsx ecx, word ptr [esi + ebx + 6]; cvtsi2ss xmm1, ecx movsx ecx, word ptr [esi + ebx + 8]; cvtsi2ss xmm2, ecx movsx ecx, word ptr [esi + ebx + 10]; cvtsi2ss xmm3, ecx shufps xmm2, xmm2, SHUFFLE_PS( 0, 0, 0, 0 ) unpcklps xmm1, xmm2 movlhps xmm1, xmm3 andps xmm1, xmm7 mulps xmm1, xmm6 shl ebp, 5; movaps [edi+ebp+0], xmm0 movaps [edi+ebp+16], xmm1 add eax, 4 jl loopy pop ebp doney: } #else // FIXME: write SSE code for this for ( int i = 0; i < numJoints; i++ ) { int j = index[i]; float *q = joints[j].q.ToFloatPtr(); float *t = joints[j].t.ToFloatPtr(); q[0] = idCompressedJointQuat::ShortToQuat( compressedJoints[j].q[0] ); q[1] = idCompressedJointQuat::ShortToQuat( compressedJoints[j].q[1] ); q[2] = idCompressedJointQuat::ShortToQuat( compressedJoints[j].q[2] ); // take the absolute value because floating point rounding may cause the dot of x,y,z to be larger than 1 q[3] = idMath::Sqrt( idMath::Fabs( 1.0f - ( q[0] * q[0] + q[1] * q[1] + q[2] * q[2] ) ) ); t[0] = idCompressedJointQuat::ShortToOffset( compressedJoints[j].t[0] ); t[1] = idCompressedJointQuat::ShortToOffset( compressedJoints[j].t[1] ); t[2] = idCompressedJointQuat::ShortToOffset( compressedJoints[j].t[2] ); t[3] = 0.0f; } #endif } #pragma warning( default : 4731 ) // frame pointer register 'ebp' modified by inline assembly code /* ============ idSIMD_SSE::BlendJoints ============ */ void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( blendJoints ); assert_16_byte_aligned( JOINTQUAT_Q_OFFSET ); assert_16_byte_aligned( JOINTQUAT_T_OFFSET ); ALIGN16( float jointQuat0[4]; ) ALIGN16( float jointQuat1[4]; ) ALIGN16( float jointQuat2[4]; ) ALIGN16( float jointQuat3[4]; ) ALIGN16( float blendQuat0[4]; ) ALIGN16( float blendQuat1[4]; ) ALIGN16( float blendQuat2[4]; ) ALIGN16( float blendQuat3[4]; ) int a0, a1, a2, a3; __asm { movss xmm7, lerp cmpnless xmm7, SIMD_SP_zero movmskps ecx, xmm7 test ecx, 1 jz done1 mov eax, numJoints shl eax, 2 mov esi, joints mov edi, blendJoints mov edx, index add edx, eax neg eax jz done1 movss xmm7, lerp cmpnltss xmm7, SIMD_SP_one movmskps ecx, xmm7 test ecx, 1 jz lerpJoints loopCopy: mov ecx, [edx+eax] shl ecx, JOINTQUAT_SIZE_SHIFT add eax, 1*4 movaps xmm0, [edi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm1, [edi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_Q_OFFSET], xmm0 movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm1 jl loopCopy jmp done1 lerpJoints: add eax, 4*4 jge done4 loopJoint4: movss xmm3, lerp shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov ecx, [edx+eax-4*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a0, ecx // lerp first translations movaps xmm7, 
[edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load first quaternions movaps xmm0, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm4, [edi+ecx+JOINTQUAT_Q_OFFSET] mov ecx, [edx+eax-3*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a1, ecx // lerp second translations movaps xmm7, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load second quaternions movaps xmm1, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm5, [edi+ecx+JOINTQUAT_Q_OFFSET] mov ecx, [edx+eax-2*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a2, ecx // lerp third translations movaps xmm7, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load third quaternions movaps xmm2, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm6, [edi+ecx+JOINTQUAT_Q_OFFSET] mov ecx, [edx+eax-1*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a3, ecx // lerp fourth translations movaps xmm7, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load fourth quaternions movaps xmm3, [esi+ecx+JOINTQUAT_Q_OFFSET] TRANSPOSE_4x4( xmm0, xmm1, xmm2, xmm3, xmm7 ) movaps jointQuat0, xmm0 movaps jointQuat1, xmm1 movaps jointQuat2, xmm2 movaps jointQuat3, xmm3 movaps xmm7, [edi+ecx+JOINTQUAT_Q_OFFSET] TRANSPOSE_4x4( xmm4, xmm5, xmm6, xmm7, xmm3 ) movaps blendQuat0, xmm4 movaps blendQuat1, xmm5 movaps blendQuat2, xmm6 movaps blendQuat3, xmm7 // lerp quaternions mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 mulps xmm2, xmm6 addps xmm0, xmm2 movaps xmm3, jointQuat3 mulps xmm3, blendQuat3 addps xmm0, xmm3 // xmm0 = cosom movaps xmm1, xmm0 movaps xmm2, xmm0 andps xmm1, SIMD_SP_signBit // xmm1 = signBit xorps xmm0, xmm1 mulps xmm2, xmm2 xorps xmm4, xmm4 movaps xmm3, SIMD_SP_one subps xmm3, xmm2 // xmm3 = scale0 cmpeqps xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 movaps xmm2, xmm3 rsqrtps xmm4, xmm2 mulps xmm2, xmm4 mulps xmm2, xmm4 subps xmm2, SIMD_SP_rsqrt_c0 mulps xmm4, SIMD_SP_rsqrt_c1 mulps xmm2, xmm4 mulps xmm3, xmm2 // xmm3 = sqrt( scale0 ) // omega0 = atan2( xmm3, xmm0 ) movaps xmm4, xmm0 minps xmm0, xmm3 maxps xmm3, xmm4 cmpeqps xmm4, xmm0 rcpps xmm5, xmm3 mulps xmm3, xmm5 mulps xmm3, xmm5 addps xmm5, xmm5 subps xmm5, xmm3 // xmm5 = 1 / y or 1 / x mulps xmm0, xmm5 // xmm0 = x / y or y / x movaps xmm3, xmm4 andps xmm3, SIMD_SP_signBit xorps xmm0, xmm3 // xmm0 = -x / y or y / x andps xmm4, SIMD_SP_halfPI // xmm4 = HALF_PI or 0.0f movaps xmm3, xmm0 mulps xmm3, xmm3 // xmm3 = s movaps xmm5, SIMD_SP_atan_c0 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c1 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c2 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c3 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c4 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c5 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c6 mulps xmm5, xmm3 addps xmm5, SIMD_SP_atan_c7 mulps xmm5, xmm3 addps xmm5, SIMD_SP_one mulps xmm5, xmm0 addps xmm5, xmm4 // xmm5 = omega0 movss xmm6, lerp // xmm6 = lerp shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm5 // xmm6 = omega1 subps xmm5, xmm6 // xmm5 = omega0 // scale0 = sin( xmm5 ) * xmm2 // scale1 = sin( xmm6 ) * xmm2 
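		// the block below evaluates a polynomial approximation of sin() for both angle
		// vectors at once with Horner's rule on s = x*x:
		//   sin( x ) ~= x * ( 1 + s * ( c4 + s * ( c3 + s * ( c2 + s * ( c1 + s * c0 ) ) ) ) )
		// where c0..c4 are the SIMD_SP_sin_c0..SIMD_SP_sin_c4 constants; omega0 (xmm5) and
		// omega1 (xmm6) are both in [0, PI/2], so no range reduction is needed, and the
		// xorps with xmm1 afterwards re-applies the sign stripped from cosom so the blend
		// takes the shortest arc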
movaps xmm3, xmm5 movaps xmm7, xmm6 mulps xmm3, xmm3 mulps xmm7, xmm7 movaps xmm4, SIMD_SP_sin_c0 movaps xmm0, SIMD_SP_sin_c0 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c1 addps xmm0, SIMD_SP_sin_c1 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c2 addps xmm0, SIMD_SP_sin_c2 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c3 addps xmm0, SIMD_SP_sin_c3 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_sin_c4 addps xmm0, SIMD_SP_sin_c4 mulps xmm4, xmm3 mulps xmm0, xmm7 addps xmm4, SIMD_SP_one addps xmm0, SIMD_SP_one mulps xmm5, xmm4 mulps xmm6, xmm0 mulps xmm5, xmm2 // xmm5 = scale0 mulps xmm6, xmm2 // xmm6 = scale1 xorps xmm6, xmm1 movaps xmm0, jointQuat0 mulps xmm0, xmm5 movaps xmm1, blendQuat0 mulps xmm1, xmm6 addps xmm0, xmm1 movaps xmm1, jointQuat1 mulps xmm1, xmm5 movaps xmm2, blendQuat1 mulps xmm2, xmm6 addps xmm1, xmm2 movaps xmm2, jointQuat2 mulps xmm2, xmm5 movaps xmm3, blendQuat2 mulps xmm3, xmm6 addps xmm2, xmm3 movaps xmm3, jointQuat3 mulps xmm3, xmm5 movaps xmm4, blendQuat3 mulps xmm4, xmm6 addps xmm3, xmm4 add eax, 4*4 // transpose xmm0, xmm1, xmm2, xmm3 to memory movaps xmm7, xmm0 movaps xmm6, xmm2 unpcklps xmm0, xmm1 unpcklps xmm2, xmm3 mov ecx, a0 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm0 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm2 mov ecx, a1 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm0 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm2 unpckhps xmm7, xmm1 unpckhps xmm6, xmm3 mov ecx, a2 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm7 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm6 mov ecx, a3 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm7 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm6 jle loopJoint4 done4: sub eax, 4*4 jz done1 loopJoint1: movss xmm3, lerp shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov ecx, [edx+eax] shl ecx, JOINTQUAT_SIZE_SHIFT // lerp first translations movaps xmm7, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load first quaternions movaps xmm0, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm1, [edi+ecx+JOINTQUAT_Q_OFFSET] movaps jointQuat0, xmm0 movaps blendQuat0, xmm1 // lerp quaternions mulps xmm1, xmm0 movhlps xmm0, xmm1 addps xmm1, xmm0 movaps xmm0, xmm1 shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 0, 2, 3 ) addss xmm0, xmm1 // xmm0 = cosom movss xmm1, xmm0 movss xmm2, xmm0 andps xmm1, SIMD_SP_signBit // xmm1 = signBit xorps xmm0, xmm1 mulss xmm2, xmm2 xorps xmm4, xmm4 movss xmm3, SIMD_SP_one subss xmm3, xmm2 // xmm3 = scale0 cmpeqss xmm4, xmm3 andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number andps xmm3, SIMD_SP_absMask // make sure the values are positive orps xmm3, xmm4 movss xmm2, xmm3 rsqrtss xmm4, xmm2 mulss xmm2, xmm4 mulss xmm2, xmm4 subss xmm2, SIMD_SP_rsqrt_c0 mulss xmm4, SIMD_SP_rsqrt_c1 mulss xmm2, xmm4 mulss xmm3, xmm2 // xmm3 = sqrt( scale0 ) // omega0 = atan2( xmm3, xmm0 ) movss xmm4, xmm0 minss xmm0, xmm3 maxss xmm3, xmm4 cmpeqss xmm4, xmm0 rcpss xmm5, xmm3 mulss xmm3, xmm5 mulss xmm3, xmm5 addss xmm5, xmm5 subss xmm5, xmm3 // xmm5 = 1 / y or 1 / x mulss xmm0, xmm5 // xmm0 = x / y or y / x movss xmm3, xmm4 andps xmm3, SIMD_SP_signBit xorps xmm0, xmm3 // xmm0 = -x / y or y / x andps xmm4, SIMD_SP_halfPI // xmm4 = HALF_PI or 0.0f movss xmm3, xmm0 mulss xmm3, xmm3 // xmm3 = s movss xmm5, SIMD_SP_atan_c0 mulss xmm5, xmm3 addss xmm5, SIMD_SP_atan_c1 mulss xmm5, xmm3 addss xmm5, SIMD_SP_atan_c2 mulss xmm5, xmm3 addss xmm5, SIMD_SP_atan_c3 mulss xmm5, xmm3 addss xmm5, 
SIMD_SP_atan_c4 mulss xmm5, xmm3 addss xmm5, SIMD_SP_atan_c5 mulss xmm5, xmm3 addss xmm5, SIMD_SP_atan_c6 mulss xmm5, xmm3 addss xmm5, SIMD_SP_atan_c7 mulss xmm5, xmm3 addss xmm5, SIMD_SP_one mulss xmm5, xmm0 addss xmm5, xmm4 // xmm5 = omega0 movss xmm6, lerp // xmm6 = lerp mulss xmm6, xmm5 // xmm6 = omega1 subss xmm5, xmm6 // xmm5 = omega0 // scale0 = sin( xmm5 ) * xmm2 // scale1 = sin( xmm6 ) * xmm2 movss xmm3, xmm5 movss xmm7, xmm6 mulss xmm3, xmm3 mulss xmm7, xmm7 movss xmm4, SIMD_SP_sin_c0 movss xmm0, SIMD_SP_sin_c0 mulss xmm4, xmm3 mulss xmm0, xmm7 addss xmm4, SIMD_SP_sin_c1 addss xmm0, SIMD_SP_sin_c1 mulss xmm4, xmm3 mulss xmm0, xmm7 addss xmm4, SIMD_SP_sin_c2 addss xmm0, SIMD_SP_sin_c2 mulss xmm4, xmm3 mulss xmm0, xmm7 addss xmm4, SIMD_SP_sin_c3 addss xmm0, SIMD_SP_sin_c3 mulss xmm4, xmm3 mulss xmm0, xmm7 addss xmm4, SIMD_SP_sin_c4 addss xmm0, SIMD_SP_sin_c4 mulss xmm4, xmm3 mulss xmm0, xmm7 addss xmm4, SIMD_SP_one addss xmm0, SIMD_SP_one mulss xmm5, xmm4 mulss xmm6, xmm0 mulss xmm5, xmm2 // xmm5 = scale0 mulss xmm6, xmm2 // xmm6 = scale1 xorps xmm6, xmm1 shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, jointQuat0 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, blendQuat0 addps xmm5, xmm6 movaps [esi+ecx+JOINTQUAT_Q_OFFSET], xmm5 add eax, 1*4 jl loopJoint1 done1: } #else int i; if ( lerp <= 0.0f ) { return; } else if ( lerp >= 1.0f ) { for ( i = 0; i < numJoints; i++ ) { int j = index[i]; joints[j] = blendJoints[j]; } return; } for ( i = 0; i <= numJoints - 4; i += 4 ) { ALIGN16( float jointQuat0[4]; ) ALIGN16( float jointQuat1[4]; ) ALIGN16( float jointQuat2[4]; ) ALIGN16( float jointQuat3[4]; ) ALIGN16( float blendQuat0[4]; ) ALIGN16( float blendQuat1[4]; ) ALIGN16( float blendQuat2[4]; ) ALIGN16( float blendQuat3[4]; ) for ( int j = 0; j < 4; j++ ) { int n = index[i+j]; joints[n].t[0] = joints[n].t[0] + lerp * ( blendJoints[n].t[0] - joints[n].t[0] ); joints[n].t[1] = joints[n].t[1] + lerp * ( blendJoints[n].t[1] - joints[n].t[1] ); joints[n].t[2] = joints[n].t[2] + lerp * ( blendJoints[n].t[2] - joints[n].t[2] ); jointQuat0[j] = joints[n].q[0]; jointQuat1[j] = joints[n].q[1]; jointQuat2[j] = joints[n].q[2]; jointQuat3[j] = joints[n].q[3]; blendQuat0[j] = blendJoints[n].q[0]; blendQuat1[j] = blendJoints[n].q[1]; blendQuat2[j] = blendJoints[n].q[2]; blendQuat3[j] = blendJoints[n].q[3]; } ALIGN16( float cosom[4]; ) ALIGN16( float sinom[4]; ) ALIGN16( float omega0[4]; ) ALIGN16( float omega1[4]; ) ALIGN16( float scale0[4]; ) ALIGN16( float scale1[4]; ) ALIGN16( unsigned long signBit[4]; ) cosom[0] = jointQuat0[0] * blendQuat0[0]; cosom[1] = jointQuat0[1] * blendQuat0[1]; cosom[2] = jointQuat0[2] * blendQuat0[2]; cosom[3] = jointQuat0[3] * blendQuat0[3]; cosom[0] += jointQuat1[0] * blendQuat1[0]; cosom[1] += jointQuat1[1] * blendQuat1[1]; cosom[2] += jointQuat1[2] * blendQuat1[2]; cosom[3] += jointQuat1[3] * blendQuat1[3]; cosom[0] += jointQuat2[0] * blendQuat2[0]; cosom[1] += jointQuat2[1] * blendQuat2[1]; cosom[2] += jointQuat2[2] * blendQuat2[2]; cosom[3] += jointQuat2[3] * blendQuat2[3]; cosom[0] += jointQuat3[0] * blendQuat3[0]; cosom[1] += jointQuat3[1] * blendQuat3[1]; cosom[2] += jointQuat3[2] * blendQuat3[2]; cosom[3] += jointQuat3[3] * blendQuat3[3]; signBit[0] = (*(unsigned long *)&cosom[0]) & ( 1 << 31 ); signBit[1] = (*(unsigned long *)&cosom[1]) & ( 1 << 31 ); signBit[2] = (*(unsigned long *)&cosom[2]) & ( 1 << 31 ); signBit[3] = (*(unsigned long *)&cosom[3]) & ( 1 << 31 ); (*(unsigned long *)&cosom[0]) ^= signBit[0]; (*(unsigned long *)&cosom[1]) ^= 
signBit[1]; (*(unsigned long *)&cosom[2]) ^= signBit[2]; (*(unsigned long *)&cosom[3]) ^= signBit[3]; scale0[0] = 1.0f - cosom[0] * cosom[0]; scale0[1] = 1.0f - cosom[1] * cosom[1]; scale0[2] = 1.0f - cosom[2] * cosom[2]; scale0[3] = 1.0f - cosom[3] * cosom[3]; scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0]; scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1]; scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2]; scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3]; sinom[0] = SSE_ReciprocalSqrt( scale0[0] ); sinom[1] = SSE_ReciprocalSqrt( scale0[1] ); sinom[2] = SSE_ReciprocalSqrt( scale0[2] ); sinom[3] = SSE_ReciprocalSqrt( scale0[3] ); scale0[0] *= sinom[0]; scale0[1] *= sinom[1]; scale0[2] *= sinom[2]; scale0[3] *= sinom[3]; // NOTE: scale0 and cosom are always positive omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] ); omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] ); omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] ); omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] ); omega1[0] = lerp * omega0[0]; omega1[1] = lerp * omega0[1]; omega1[2] = lerp * omega0[2]; omega1[3] = lerp * omega0[3]; omega0[0] -= omega1[0]; omega0[1] -= omega1[1]; omega0[2] -= omega1[2]; omega0[3] -= omega1[3]; // NOTE: omega0 is always in the range [0, PI/2] scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0]; scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1]; scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2]; scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3]; // NOTE: omega1 is always in the range [0, PI/2] scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0]; scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1]; scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2]; scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3]; (*(unsigned long *)&scale1[0]) ^= signBit[0]; (*(unsigned long *)&scale1[1]) ^= signBit[1]; (*(unsigned long *)&scale1[2]) ^= signBit[2]; (*(unsigned long *)&scale1[3]) ^= signBit[3]; jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0]; jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1]; jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2]; jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3]; jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0]; jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1]; jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2]; jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3]; jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0]; jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1]; jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2]; jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3]; jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0]; jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1]; jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2]; jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3]; for ( int j = 0; j < 4; j++ ) { int n = index[i+j]; joints[n].q[0] = jointQuat0[j]; joints[n].q[1] = jointQuat1[j]; joints[n].q[2] = jointQuat2[j]; joints[n].q[3] = jointQuat3[j]; } } for ( ; i < numJoints; i++ ) { int n = index[i]; idVec3 &jointVert = joints[n].t; const idVec3 &blendVert = blendJoints[n].t; jointVert[0] += lerp * ( blendVert[0] - jointVert[0] ); jointVert[1] += lerp * ( blendVert[1] - jointVert[1] ); 
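		// leftover joints are blended one at a time: the translation is lerped
		// component-wise, and the quaternion below uses the same shortest-arc slerp as the
		// unrolled path, folding the sign bit of cosom into scale1 so that
		// scale0 * jointQuat + scale1 * blendQuat interpolates towards whichever of
		// +blendQuat / -blendQuat is closer to jointQuat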
jointVert[2] += lerp * ( blendVert[2] - jointVert[2] ); idQuat &jointQuat = joints[n].q; const idQuat &blendQuat = blendJoints[n].q; float cosom; float sinom; float omega; float scale0; float scale1; unsigned long signBit; cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w; signBit = (*(unsigned long *)&cosom) & ( 1 << 31 ); (*(unsigned long *)&cosom) ^= signBit; scale0 = 1.0f - cosom * cosom; scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0; sinom = SSE_ReciprocalSqrt( scale0 ); omega = SSE_ATanPositive( scale0 * sinom, cosom ); scale0 = SSE_SinZeroHalfPI( ( 1.0f - lerp ) * omega ) * sinom; scale1 = SSE_SinZeroHalfPI( lerp * omega ) * sinom; (*(unsigned long *)&scale1) ^= signBit; jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x; jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y; jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z; jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w; } #endif } /* ============ idSIMD_SSE::BlendJointsFast ============ */ void VPCALL idSIMD_SSE::BlendJointsFast( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( blendJoints ); assert_16_byte_aligned( JOINTQUAT_Q_OFFSET ); assert_16_byte_aligned( JOINTQUAT_T_OFFSET ); ALIGN16( float jointQuat3[4]; ) ALIGN16( float blendQuat3[4]; ) ALIGN16( float scaledLerp; ) int a0, a1, a2, a3; __asm { movss xmm7, lerp cmpnless xmm7, SIMD_SP_zero movmskps ecx, xmm7 test ecx, 1 jz done1 mov eax, numJoints shl eax, 2 mov esi, joints mov edi, blendJoints mov edx, index add edx, eax neg eax jz done1 movss xmm7, lerp cmpnltss xmm7, SIMD_SP_one movmskps ecx, xmm7 test ecx, 1 jz lerpJoints loopCopy: mov ecx, [edx+eax] shl ecx, JOINTQUAT_SIZE_SHIFT add eax, 1*4 movaps xmm0, [edi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm1, [edi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_Q_OFFSET], xmm0 movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm1 jl loopCopy jmp done1 lerpJoints: movss xmm7, lerp movss xmm6, SIMD_SP_one subss xmm6, xmm7 divss xmm7, xmm6 movss scaledLerp, xmm7 add eax, 4*4 jge done4 loopJoint4: movss xmm3, lerp shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov ecx, [edx+eax-4*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a0, ecx // lerp first translations movaps xmm7, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load first quaternions movaps xmm0, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm4, [edi+ecx+JOINTQUAT_Q_OFFSET] mov ecx, [edx+eax-3*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a1, ecx // lerp second translations movaps xmm7, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load second quaternions movaps xmm1, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm5, [edi+ecx+JOINTQUAT_Q_OFFSET] mov ecx, [edx+eax-2*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a2, ecx // lerp third translations movaps xmm7, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load third quaternions movaps xmm2, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm6, [edi+ecx+JOINTQUAT_Q_OFFSET] mov ecx, [edx+eax-1*4] shl ecx, JOINTQUAT_SIZE_SHIFT mov a3, ecx // lerp fourth translations movaps xmm7, 
[edi+ecx+JOINTQUAT_T_OFFSET] subps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm7, xmm3 addps xmm7, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm7 // load fourth quaternions movaps xmm3, [esi+ecx+JOINTQUAT_Q_OFFSET] TRANSPOSE_4x4( xmm0, xmm1, xmm2, xmm3, xmm7 ) movaps jointQuat3, xmm3 movaps xmm7, [edi+ecx+JOINTQUAT_Q_OFFSET] TRANSPOSE_4x4( xmm4, xmm5, xmm6, xmm7, xmm3 ) movaps blendQuat3, xmm7 // lerp quaternions movaps xmm3, xmm0 mulps xmm3, xmm4 movaps xmm7, xmm1 mulps xmm7, xmm5 addps xmm3, xmm7 movaps xmm7, xmm2 mulps xmm7, xmm6 addps xmm3, xmm7 movaps xmm7, jointQuat3 mulps xmm7, blendQuat3 addps xmm3, xmm7 // xmm3 = cosom andps xmm3, SIMD_SP_signBit // xmm3 = signBit movss xmm7, scaledLerp shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) xorps xmm7, xmm3 // xmm7 = scaledLerp ^ signBit mulps xmm4, xmm7 addps xmm4, xmm0 movaps xmm0, xmm4 mulps xmm0, xmm0 mulps xmm5, xmm7 addps xmm5, xmm1 movaps xmm1, xmm5 mulps xmm1, xmm1 addps xmm0, xmm1 mulps xmm6, xmm7 addps xmm6, xmm2 movaps xmm2, xmm6 mulps xmm2, xmm2 addps xmm0, xmm2 mulps xmm7, blendQuat3 addps xmm7, jointQuat3 movaps xmm1, xmm7 mulps xmm1, xmm1 addps xmm0, xmm1 rsqrtps xmm2, xmm0 mulps xmm0, xmm2 mulps xmm0, xmm2 subps xmm0, SIMD_SP_rsqrt_c0 mulps xmm2, SIMD_SP_rsqrt_c1 mulps xmm0, xmm2 mulps xmm4, xmm0 mulps xmm5, xmm0 mulps xmm6, xmm0 mulps xmm7, xmm0 add eax, 4*4 // transpose xmm4, xmm5, xmm6, xmm7 to memory movaps xmm2, xmm4 movaps xmm3, xmm6 unpcklps xmm4, xmm5 unpcklps xmm6, xmm7 mov ecx, a0 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm4 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm6 mov ecx, a1 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm4 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm6 unpckhps xmm2, xmm5 unpckhps xmm3, xmm7 mov ecx, a2 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm2 movlps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm3 mov ecx, a3 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+0], xmm2 movhps [esi+ecx+JOINTQUAT_Q_OFFSET+8], xmm3 jle loopJoint4 done4: sub eax, 4*4 jz done1 movss xmm6, lerp shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm7, scaledLerp shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) loopJoint1: mov ecx, [edx+eax] shl ecx, JOINTQUAT_SIZE_SHIFT // lerp translations movaps xmm3, [edi+ecx+JOINTQUAT_T_OFFSET] subps xmm3, [esi+ecx+JOINTQUAT_T_OFFSET] mulps xmm3, xmm6 addps xmm3, [esi+ecx+JOINTQUAT_T_OFFSET] movaps [esi+ecx+JOINTQUAT_T_OFFSET], xmm3 // load quaternions movaps xmm0, [esi+ecx+JOINTQUAT_Q_OFFSET] movaps xmm1, [edi+ecx+JOINTQUAT_Q_OFFSET] // lerp quaternions movaps xmm2, xmm0 mulps xmm2, xmm1 movhlps xmm3, xmm2 addps xmm2, xmm3 movaps xmm3, xmm2 shufps xmm3, xmm3, R_SHUFFLE_PS( 1, 0, 2, 3 ) addss xmm3, xmm2 // xmm3 = cosom shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) andps xmm3, SIMD_SP_signBit // xmm3 = signBit xorps xmm3, xmm7 // xmm3 = scaledLerp ^ signBit mulps xmm1, xmm3 addps xmm1, xmm0 // xmm1 = jointQuat + scale * blendQuat movaps xmm0, xmm1 mulps xmm0, xmm0 movhlps xmm2, xmm0 addps xmm0, xmm2 movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 0, 2, 3 ) addss xmm0, xmm2 rsqrtss xmm2, xmm0 mulss xmm0, xmm2 mulss xmm0, xmm2 subss xmm0, SIMD_SP_rsqrt_c0 mulss xmm2, SIMD_SP_rsqrt_c1 mulss xmm0, xmm2 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, xmm0 movaps [esi+ecx+JOINTQUAT_Q_OFFSET], xmm1 add eax, 1*4 jl loopJoint1 done1: } #else int i; if ( lerp <= 0.0f ) { return; } else if ( lerp >= 1.0f ) { for ( i = 0; i < numJoints; i++ ) { int j = index[i]; joints[j] = blendJoints[j]; } return; } float scaledLerp = lerp / ( 1.0f - lerp ); for ( i = 0; i <= numJoints - 4; i += 4 ) { 
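		// the fast path replaces the slerp with a lerp-and-renormalize (nlerp): it blends
		// jointQuat + scale * blendQuat with scale = +/-scaledLerp, scaledLerp = lerp / ( 1 - lerp ),
		// then renormalizes with a reciprocal square root; this equals normalizing
		// ( 1 - lerp ) * jointQuat +/- lerp * blendQuat, because the positive factor
		// ( 1 - lerp ) drops out under normalization, and the sign again selects the
		// shortest arc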
ALIGN16( float jointQuat0[4]; ) ALIGN16( float jointQuat1[4]; ) ALIGN16( float jointQuat2[4]; ) ALIGN16( float jointQuat3[4]; ) ALIGN16( float blendQuat0[4]; ) ALIGN16( float blendQuat1[4]; ) ALIGN16( float blendQuat2[4]; ) ALIGN16( float blendQuat3[4]; ) for ( int j = 0; j < 4; j++ ) { int n = index[i+j]; joints[n].t[0] = joints[n].t[0] + lerp * ( blendJoints[n].t[0] - joints[n].t[0] ); joints[n].t[1] = joints[n].t[1] + lerp * ( blendJoints[n].t[1] - joints[n].t[1] ); joints[n].t[2] = joints[n].t[2] + lerp * ( blendJoints[n].t[2] - joints[n].t[2] ); jointQuat0[j] = joints[n].q[0]; jointQuat1[j] = joints[n].q[1]; jointQuat2[j] = joints[n].q[2]; jointQuat3[j] = joints[n].q[3]; blendQuat0[j] = blendJoints[n].q[0]; blendQuat1[j] = blendJoints[n].q[1]; blendQuat2[j] = blendJoints[n].q[2]; blendQuat3[j] = blendJoints[n].q[3]; } ALIGN16( float cosom[4]; ) ALIGN16( float scale[4]; ) ALIGN16( float s[4]; ) cosom[0] = jointQuat0[0] * blendQuat0[0] + jointQuat1[0] * blendQuat1[0] + jointQuat2[0] * blendQuat2[0] + jointQuat3[0] * blendQuat3[0]; cosom[1] = jointQuat0[1] * blendQuat0[1] + jointQuat1[1] * blendQuat1[1] + jointQuat2[1] * blendQuat2[1] + jointQuat3[1] * blendQuat3[1]; cosom[2] = jointQuat0[2] * blendQuat0[2] + jointQuat1[2] * blendQuat1[2] + jointQuat2[2] * blendQuat2[2] + jointQuat3[2] * blendQuat3[2]; cosom[3] = jointQuat0[3] * blendQuat0[3] + jointQuat1[3] * blendQuat1[3] + jointQuat2[3] * blendQuat2[3] + jointQuat3[3] * blendQuat3[3]; (*(unsigned long *)&scale[0]) = (*(unsigned long *)&scaledLerp) ^ ( (*(unsigned long *)&cosom[0]) & ( 1 << 31 ) ); (*(unsigned long *)&scale[1]) = (*(unsigned long *)&scaledLerp) ^ ( (*(unsigned long *)&cosom[1]) & ( 1 << 31 ) ); (*(unsigned long *)&scale[2]) = (*(unsigned long *)&scaledLerp) ^ ( (*(unsigned long *)&cosom[2]) & ( 1 << 31 ) ); (*(unsigned long *)&scale[3]) = (*(unsigned long *)&scaledLerp) ^ ( (*(unsigned long *)&cosom[3]) & ( 1 << 31 ) ); jointQuat0[0] += scale[0] * blendQuat0[0]; jointQuat0[1] += scale[1] * blendQuat0[1]; jointQuat0[2] += scale[2] * blendQuat0[2]; jointQuat0[3] += scale[3] * blendQuat0[3]; jointQuat1[0] += scale[0] * blendQuat1[0]; jointQuat1[1] += scale[1] * blendQuat1[1]; jointQuat1[2] += scale[2] * blendQuat1[2]; jointQuat1[3] += scale[3] * blendQuat1[3]; jointQuat2[0] += scale[0] * blendQuat2[0]; jointQuat2[1] += scale[1] * blendQuat2[1]; jointQuat2[2] += scale[2] * blendQuat2[2]; jointQuat2[3] += scale[3] * blendQuat2[3]; jointQuat3[0] += scale[0] * blendQuat3[0]; jointQuat3[1] += scale[1] * blendQuat3[1]; jointQuat3[2] += scale[2] * blendQuat3[2]; jointQuat3[3] += scale[3] * blendQuat3[3]; s[0] = jointQuat0[0] * jointQuat0[0] + jointQuat1[0] * jointQuat1[0] + jointQuat2[0] * jointQuat2[0] + jointQuat3[0] * jointQuat3[0]; s[1] = jointQuat0[1] * jointQuat0[1] + jointQuat1[1] * jointQuat1[1] + jointQuat2[1] * jointQuat2[1] + jointQuat3[1] * jointQuat3[1]; s[2] = jointQuat0[2] * jointQuat0[2] + jointQuat1[2] * jointQuat1[2] + jointQuat2[2] * jointQuat2[2] + jointQuat3[2] * jointQuat3[2]; s[3] = jointQuat0[3] * jointQuat0[3] + jointQuat1[3] * jointQuat1[3] + jointQuat2[3] * jointQuat2[3] + jointQuat3[3] * jointQuat3[3]; s[0] = SSE_ReciprocalSqrt( s[0] ); s[1] = SSE_ReciprocalSqrt( s[1] ); s[2] = SSE_ReciprocalSqrt( s[2] ); s[3] = SSE_ReciprocalSqrt( s[3] ); jointQuat0[0] *= s[0]; jointQuat0[1] *= s[1]; jointQuat0[2] *= s[2]; jointQuat0[3] *= s[3]; jointQuat1[0] *= s[0]; jointQuat1[1] *= s[1]; jointQuat1[2] *= s[2]; jointQuat1[3] *= s[3]; jointQuat2[0] *= s[0]; jointQuat2[1] *= s[1]; jointQuat2[2] *= s[2]; jointQuat2[3] 
*= s[3]; jointQuat3[0] *= s[0]; jointQuat3[1] *= s[1]; jointQuat3[2] *= s[2]; jointQuat3[3] *= s[3]; for ( int j = 0; j < 4; j++ ) { int n = index[i+j]; joints[n].q[0] = jointQuat0[j]; joints[n].q[1] = jointQuat1[j]; joints[n].q[2] = jointQuat2[j]; joints[n].q[3] = jointQuat3[j]; } } for ( ; i < numJoints; i++ ) { int n = index[i]; idVec3 &jointVert = joints[n].t; const idVec3 &blendVert = blendJoints[n].t; jointVert[0] += lerp * ( blendVert[0] - jointVert[0] ); jointVert[1] += lerp * ( blendVert[1] - jointVert[1] ); jointVert[2] += lerp * ( blendVert[2] - jointVert[2] ); idQuat &jointQuat = joints[n].q; const idQuat &blendQuat = blendJoints[n].q; float cosom, scale, s; cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w; (*(unsigned long *)&scale) = (*(unsigned long *)&scaledLerp) ^ ( (*(unsigned long *)&cosom) & ( 1 << 31 ) ); jointQuat.x += scale * blendQuat.x; jointQuat.y += scale * blendQuat.y; jointQuat.z += scale * blendQuat.z; jointQuat.w += scale * blendQuat.w; s = jointQuat.x * jointQuat.x + jointQuat.y * jointQuat.y + jointQuat.z * jointQuat.z + jointQuat.w * jointQuat.w; s = SSE_ReciprocalSqrt( s ); jointQuat.x *= s; jointQuat.y *= s; jointQuat.z *= s; jointQuat.w *= s; } #endif } /* ============ idSIMD_SSE::ConvertJointQuatsToJointMats ============ */ void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) { #if 1 assert_16_byte_aligned( jointMats ); assert_16_byte_aligned( jointQuats ); __asm { mov eax, numJoints shl eax, JOINTQUAT_SIZE_SHIFT mov esi, jointQuats mov edi, jointMats add esi, eax neg eax jz done loopQuat: movaps xmm0, [esi+eax+JOINTQUAT_Q_OFFSET] // xmm0 = q.x, q.y, q.z, q.w movaps xmm6, [esi+eax+JOINTQUAT_T_OFFSET] // xmm6 = t.x, t.y, t.z, w add edi, JOINTMAT_SIZE movaps xmm1, xmm0 // xmm1 = x, y, z, w addps xmm1, xmm1 // xmm1 = x2, y2, z2, w2 add eax, JOINTQUAT_SIZE movaps xmm2, xmm0 shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 0, 0, 1 ) // xmm2 = y, x, x, y movaps xmm3, xmm1 shufps xmm3, xmm3, R_SHUFFLE_PS( 1, 1, 2, 2 ) // xmm3 = y2, y2, z2, z2 mulps xmm2, xmm3 // xmm2 = yy2, xy2, xz2, yz2 movaps xmm4, xmm0 shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 3, 3, 3 ) // xmm4 = z, w, w, w movaps xmm5, xmm1 shufps xmm5, xmm5, R_SHUFFLE_PS( 2, 2, 1, 0 ) // xmm5 = z2, z2, y2, x2 mulps xmm4, xmm5 // xmm4 = zz2, wz2, wy2, wx2 mulss xmm0, xmm1 // xmm0 = xx2, y2, z2, w2 // calculate the last two elements of the third row movss xmm7, SIMD_SP_one // xmm7 = 1, 0, 0, 0 subss xmm7, xmm0 // xmm7 = -xx2+1, 0, 0, 0 subss xmm7, xmm2 // xmm7 = -xx2-yy2+1, 0, 0, 0 shufps xmm7, xmm6, R_SHUFFLE_PS( 0, 1, 2, 3 ) // xmm7 = -xx2-yy2+1, 0, t.z, w // calculate first row xorps xmm2, SIMD_SP_quat2mat_x0 // xmm2 = yy2, -xy2, -xz2, -yz2 xorps xmm4, SIMD_SP_quat2mat_x1 // xmm4 = -zz2, wz2, -wy2, -wx2 addss xmm4, SIMD_SP_one // xmm4 = -zz2+1, wz2, -wy2, -wx2 movaps xmm3, xmm4 // xmm3 = -zz2+1, wz2, -wy2, -wx2 subps xmm3, xmm2 // xmm3 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2 movaps [edi-JOINTMAT_SIZE+0*16+0*4], xmm3 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, yz2-wx2 movss [edi-JOINTMAT_SIZE+0*16+3*4], xmm6 // row0 = -yy2-zz2+1, xy2+wz2, xz2-wy2, t.x // calculate second row movss xmm2, xmm0 // xmm2 = xx2, -xy2, -xz2, -yz2 xorps xmm4, SIMD_SP_quat2mat_x2 // xmm4 = -zz2+1, -wz2, wy2, wx2 subps xmm4, xmm2 // xmm4 = -xx2-zz2+1, xy2-wz2, xz2+wy2, yz2+wx2 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) // xmm6 = t.y, t.z, w, t.x shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 0, 3, 2 ) // xmm4 =
xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2 movaps [edi-JOINTMAT_SIZE+1*16+0*4], xmm4 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, xz2+wy2 movss [edi-JOINTMAT_SIZE+1*16+3*4], xmm6 // row1 = xy2-wz2, -xx2-zz2+1, yz2+wx2, t.y // calculate third row movhlps xmm3, xmm4 // xmm3 = yz2+wx2, xz2+wy2, xz2-wy2, yz2-wx2 shufps xmm3, xmm7, R_SHUFFLE_PS( 1, 3, 0, 2 ) // xmm3 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z movaps [edi-JOINTMAT_SIZE+2*16+0*4], xmm3 // row2 = xz2+wy2, yz2-wx2, -xx2-yy2+1, t.z jl loopQuat done: } #else for ( int i = 0; i < numJoints; i++ ) { const float *q = jointQuats[i].q.ToFloatPtr(); float *m = jointMats[i].ToFloatPtr(); float x2 = q[0] + q[0]; float y2 = q[1] + q[1]; float z2 = q[2] + q[2]; float w2 = q[3] + q[3]; float yy2 = q[1] * y2; float xy2 = q[0] * y2; float xz2 = q[0] * z2; float yz2 = q[1] * z2; float zz2 = q[2] * z2; float wz2 = q[3] * z2; float wy2 = q[3] * y2; float wx2 = q[3] * x2; float xx2 = q[0] * x2; m[0*4+0] = - yy2 - zz2 + 1.0f; m[0*4+1] = xy2 + wz2; m[0*4+2] = xz2 - wy2; m[0*4+3] = q[4]; m[1*4+0] = xy2 - wz2; m[1*4+1] = - xx2 - zz2 + 1.0f; m[1*4+2] = yz2 + wx2; m[1*4+3] = q[5]; m[2*4+0] = xz2 + wy2; m[2*4+1] = yz2 - wx2; m[2*4+2] = - xx2 - yy2 + 1.0f; m[2*4+3] = q[6]; } #endif } /* ============ idSIMD_SSE::ConvertJointMatsToJointQuats ============ */ void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) { #if 1 ALIGN16( byte shuffle[16]; ) __asm { mov eax, numJoints mov esi, jointMats mov edi, jointQuats and eax, ~3 jz done4 imul eax, JOINTMAT_SIZE add esi, eax neg eax loopMat4: movss xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4] movss xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4] movss xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4] shufps xmm5, xmm5, R_SHUFFLE_PS( 3, 0, 1, 2 ) shufps xmm6, xmm6, R_SHUFFLE_PS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLE_PS( 3, 0, 1, 2 ) movss xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4] movss xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4] movss xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4] movss xmm5, xmm0 movss xmm6, xmm1 movss xmm7, xmm2 shufps xmm5, xmm5, R_SHUFFLE_PS( 3, 0, 1, 2 ) shufps xmm6, xmm6, R_SHUFFLE_PS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLE_PS( 3, 0, 1, 2 ) movss xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4] movss xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4] movss xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4] movss xmm5, xmm0 movss xmm6, xmm1 movss xmm7, xmm2 shufps xmm5, xmm5, R_SHUFFLE_PS( 3, 0, 1, 2 ) shufps xmm6, xmm6, R_SHUFFLE_PS( 3, 0, 1, 2 ) shufps xmm7, xmm7, R_SHUFFLE_PS( 3, 0, 1, 2 ) movss xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4] movss xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4] movss xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4] movss xmm5, xmm0 movss xmm6, xmm1 movss xmm7, xmm2 // ------------------- movaps xmm0, xmm5 addps xmm0, xmm6 addps xmm0, xmm7 cmpnltps xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f movaps xmm1, xmm5 movaps xmm2, xmm5 cmpnltps xmm1, xmm6 cmpnltps xmm2, xmm7 andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] movaps xmm4, xmm6 cmpnltps xmm4, xmm7 // xmm3 = m[1 * 4 + 1] > m[2 * 4 + 2] movaps xmm1, xmm0 andnps xmm1, xmm2 orps xmm2, xmm0 movaps xmm3, xmm2 andnps xmm2, xmm4 orps xmm3, xmm2 xorps xmm3, SIMD_SP_not andps xmm0, SIMD_DW_mat2quatShuffle0 movaps xmm4, xmm1 andps xmm4, SIMD_DW_mat2quatShuffle1 orps xmm0, xmm4 movaps xmm4, xmm2 andps xmm4, SIMD_DW_mat2quatShuffle2 orps xmm0, xmm4 movaps xmm4, xmm3 andps xmm4, SIMD_DW_mat2quatShuffle3 orps xmm4, xmm0 movaps shuffle, xmm4 movaps xmm0, xmm2 orps 
xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0 orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2 orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1 andps xmm0, SIMD_SP_signBit andps xmm1, SIMD_SP_signBit andps xmm2, SIMD_SP_signBit xorps xmm5, xmm0 xorps xmm6, xmm1 xorps xmm7, xmm2 addps xmm5, xmm6 addps xmm7, SIMD_SP_one addps xmm5, xmm7 // xmm5 = t movaps xmm7, xmm5 // xmm7 = t rsqrtps xmm6, xmm5 mulps xmm5, xmm6 mulps xmm5, xmm6 subps xmm5, SIMD_SP_rsqrt_c0 mulps xmm6, SIMD_SP_mat2quat_rsqrt_c1 mulps xmm6, xmm5 // xmm5 = s mulps xmm7, xmm6 // xmm7 = s * t xorps xmm6, SIMD_SP_signBit // xmm6 = -s // ------------------- add edi, 4*JOINTQUAT_SIZE movzx ecx, byte ptr shuffle[0*4+0] // ecx = k0 movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[0*4+1] // edx = k1 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[0*4+2] // ecx = k2 movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[0*4+3] // edx = k3 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4] mov [edi-4*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4] mov [edi-4*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4] mov [edi-4*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; mov dword ptr [edi-4*JOINTQUAT_SIZE+28], 0 // q[7] = 0.0f; shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 0 ) movzx ecx, byte ptr shuffle[1*4+0] // ecx = k0 movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[1*4+1] // edx = k1 movss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[1*4+2] // ecx = k2 movss xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[1*4+3] // edx = k3 movss xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4] mov [edi-3*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4] mov [edi-3*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4] mov [edi-3*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; mov dword ptr [edi-3*JOINTQUAT_SIZE+28], 0 // q[7] = 0.0f; shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 2, 
3, 0 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 0 ) movzx ecx, byte ptr shuffle[2*4+0] // ecx = k0 movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[2*4+1] // edx = k1 movss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[2*4+2] // ecx = k2 movss xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[2*4+3] // edx = k3 movss xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4] mov [edi-2*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4] mov [edi-2*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4] mov [edi-2*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; mov dword ptr [edi-2*JOINTQUAT_SIZE+28], 0 // q[7] = 0.0f; shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm0, xmm0, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm1, xmm1, R_SHUFFLE_PS( 1, 2, 3, 0 ) shufps xmm2, xmm2, R_SHUFFLE_PS( 1, 2, 3, 0 ) movzx ecx, byte ptr shuffle[3*4+0] // ecx = k0 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[3*4+1] // edx = k1 movss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[3*4+2] // ecx = k2 movss xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[3*4+3] // edx = k3 movss xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4] mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4] mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4] mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; mov dword ptr [edi-1*JOINTQUAT_SIZE+28], 0 // q[7] = 0.0f; add eax, 4*JOINTMAT_SIZE jl loopMat4 done4: mov eax, numJoints and eax, 3 jz done1 imul eax, JOINTMAT_SIZE add esi, eax neg eax loopMat1: movss xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4] movss xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4] movss xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4] // ------------------- movaps xmm0, xmm5 addss xmm0, xmm6 addss xmm0, xmm7 cmpnltss xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f movaps xmm1, xmm5 movaps xmm2, xmm5 cmpnltss xmm1, xmm6 cmpnltss xmm2, xmm7 andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] movaps xmm4, xmm6 cmpnltss xmm4, xmm7 // xmm3 = m[1 * 4 + 
1] > m[2 * 4 + 2] movaps xmm1, xmm0 andnps xmm1, xmm2 orps xmm2, xmm0 movaps xmm3, xmm2 andnps xmm2, xmm4 orps xmm3, xmm2 xorps xmm3, SIMD_SP_not andps xmm0, SIMD_DW_mat2quatShuffle0 movaps xmm4, xmm1 andps xmm4, SIMD_DW_mat2quatShuffle1 orps xmm0, xmm4 movaps xmm4, xmm2 andps xmm4, SIMD_DW_mat2quatShuffle2 orps xmm0, xmm4 movaps xmm4, xmm3 andps xmm4, SIMD_DW_mat2quatShuffle3 orps xmm4, xmm0 movss shuffle, xmm4 movaps xmm0, xmm2 orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0 orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2 orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1 andps xmm0, SIMD_SP_signBit andps xmm1, SIMD_SP_signBit andps xmm2, SIMD_SP_signBit xorps xmm5, xmm0 xorps xmm6, xmm1 xorps xmm7, xmm2 addss xmm5, xmm6 addss xmm7, SIMD_SP_one addss xmm5, xmm7 // xmm5 = t movss xmm7, xmm5 // xmm7 = t rsqrtss xmm6, xmm5 mulss xmm5, xmm6 mulss xmm5, xmm6 subss xmm5, SIMD_SP_rsqrt_c0 mulss xmm6, SIMD_SP_mat2quat_rsqrt_c1 mulss xmm6, xmm5 // xmm5 = s mulss xmm7, xmm6 // xmm7 = s * t xorps xmm6, SIMD_SP_signBit // xmm6 = -s // ------------------- movzx ecx, byte ptr shuffle[0] // ecx = k0 add edi, JOINTQUAT_SIZE movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t; movzx edx, byte ptr shuffle[1] // edx = k1 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4] xorps xmm4, xmm2 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; movzx ecx, byte ptr shuffle[2] // ecx = k2 movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4] xorps xmm3, xmm1 subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4] mulss xmm3, xmm6 movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; movzx edx, byte ptr shuffle[3] // edx = k3 movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4] xorps xmm4, xmm0 subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4] mulss xmm4, xmm6 movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4] mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3]; mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4] mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3]; mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4] mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3]; mov dword ptr [edi-1*JOINTQUAT_SIZE+28], 0 // q[7] = 0.0f; add eax, JOINTMAT_SIZE jl loopMat1 done1: } #elif 0 int k0 = (3<<0)|(2<<2)|(1<<4)|(0<<6); int k1 = (0<<0)|(1<<2)|(2<<4)|(3<<6); int k2 = (1<<0)|(0<<2)|(3<<4)|(2<<6); int k3 = (2<<0)|(3<<2)|(0<<4)|(1<<6); for ( int i = 0; i < numJoints; i++ ) { float *q = jointQuats[i].q.ToFloatPtr(); const float *m = jointMats[i].ToFloatPtr(); int b0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f; int b1 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]; int b2 = m[1 * 4 + 1] > m[2 * 4 + 2]; int m0 = -( b0 ); int m1 = -( !b0 & b1 ); int m2 = -( !( b0 | b1 ) & b2 ); int m3 = -( !( b0 | b1 | b2 ) ); int i0 = 0x3F800000 ^ ( ( m2 | m3 ) << 31 ); int i1 = 0x3F800000 ^ ( ( m1 | m3 ) << 31 ); int i2 = 0x3F800000 ^ ( ( m1 | m2 ) << 31 ); float s0 = *(float *)&i0; float s1 = *(float *)&i1; float s2 = *(float *)&i2; int index = ( m0 & k0 ) | ( m1 & k1 ) | ( m2 & k2 ) | ( m3 & k3 ); float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f; float s = __frsqrte( t ); s = ( t * s * s + -3.0f ) * ( s * -0.25f ); q[(index>>0)&3] = s * t; q[(index>>2)&3] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; q[(index>>4)&3] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; q[(index>>6)&3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 
+ 1] ) * s; q[4] = m[0 * 4 + 3]; q[5] = m[1 * 4 + 3]; q[6] = m[2 * 4 + 3]; q[7] = 0.0f; } #else compile_time_assert( (UINT_PTR)(&((idJointQuat *)0)->t) == (UINT_PTR)(&((idJointQuat *)0)->q) + (UINT_PTR)sizeof( ((idJointQuat *)0)->q ) ); for ( int i = 0; i < numJoints; i++ ) { float s0, s1, s2; int k0, k1, k2, k3; float *q = jointQuats[i].q.ToFloatPtr(); const float *m = jointMats[i].ToFloatPtr(); if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) { k0 = 3; k1 = 2; k2 = 1; k3 = 0; s0 = 1.0f; s1 = 1.0f; s2 = 1.0f; } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) { k0 = 0; k1 = 1; k2 = 2; k3 = 3; s0 = 1.0f; s1 = -1.0f; s2 = -1.0f; } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) { k0 = 1; k1 = 0; k2 = 3; k3 = 2; s0 = -1.0f; s1 = 1.0f; s2 = -1.0f; } else { k0 = 2; k1 = 3; k2 = 0; k3 = 1; s0 = -1.0f; s1 = -1.0f; s2 = 1.0f; } float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[k0] = s * t; q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s; q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s; q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s; q[4] = m[0 * 4 + 3]; q[5] = m[1 * 4 + 3]; q[6] = m[2 * 4 + 3]; q[7] = 0.0f; } #endif } /* ============ idSIMD_SSE::TransformJoints ============ */ void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) { #if 1 assert_16_byte_aligned( jointMats ); __asm { mov ecx, firstJoint mov eax, lastJoint sub eax, ecx jl done shl ecx, 2 // ecx = firstJoint * 4 mov edi, parents add edi, ecx // edx = &parents[firstJoint] lea ecx, [ecx+ecx*2] shl ecx, 2 // ecx = firstJoint * JOINTMAT_SIZE mov esi, jointMats // esi = jointMats shl eax, 2 // eax = ( lastJoint - firstJoint ) * 4 add edi, eax neg eax loopJoint: mov edx, [edi+eax] movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0 lea edx, [edx+edx*2] movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1 shl edx, 4 // edx = parents[i] * JOINTMAT_SIZE movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2 movaps xmm7, [esi+edx+ 0] movaps xmm4, xmm7 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm0 movaps xmm5, xmm7 shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm5, xmm1 addps xmm4, xmm5 add ecx, JOINTMAT_SIZE add eax, 4 movaps xmm6, xmm7 shufps xmm6, xmm6, R_SHUFFLE_PS( 2, 2, 2, 2 ) mulps xmm6, xmm2 addps xmm4, xmm6 andps xmm7, SIMD_SP_clearFirstThree addps xmm4, xmm7 movaps [esi+ecx-JOINTMAT_SIZE+ 0], xmm4 movaps xmm3, [esi+edx+16] movaps xmm5, xmm3 shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm0 movaps xmm6, xmm3 shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm6, xmm1 addps xmm5, xmm6 movaps xmm4, xmm3 shufps xmm4, xmm4, R_SHUFFLE_PS( 2, 2, 2, 2 ) mulps xmm4, xmm2 addps xmm5, xmm4 andps xmm3, SIMD_SP_clearFirstThree addps xmm5, xmm3 movaps [esi+ecx-JOINTMAT_SIZE+16], xmm5 movaps xmm7, [esi+edx+32] movaps xmm6, xmm7 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm4, xmm7 shufps xmm4, xmm4, R_SHUFFLE_PS( 1, 1, 1, 1 ) mulps xmm4, xmm1 addps xmm6, xmm4 movaps xmm3, xmm7 shufps xmm3, xmm3, R_SHUFFLE_PS( 2, 2, 2, 2 ) mulps xmm3, xmm2 addps xmm6, xmm3 andps xmm7, SIMD_SP_clearFirstThree addps xmm6, xmm7 movaps [esi+ecx-JOINTMAT_SIZE+32], xmm6 jle loopJoint done: } #else int i; for( i = firstJoint; i <= lastJoint; i++ ) { assert( parents[i] < i ); jointMats[i] *= jointMats[parents[i]]; } #endif } /* ============ idSIMD_SSE::UntransformJoints ============ */ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, 
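/*
	TransformJoints above concatenates each joint with its parent in place; per row this is a
	3x4 affine multiply in which only the w (translation) component of the parent row survives
	the SIMD_SP_clearFirstThree mask. A minimal scalar sketch of the same concatenation
	(hypothetical helper; assumes the row-major { m0 m1 m2 t0 | m3 m4 m5 t1 | m6 m7 m8 t2 }
	layout used by the register comments, with a = parent, b = child; MultiplyJoints below
	performs the same math over two input arrays):

		static void JointMatConcat( float r[12], const float a[12], const float b[12] ) {
			for ( int i = 0; i < 3; i++ ) {
				for ( int j = 0; j < 4; j++ ) {
					r[i*4+j] = a[i*4+0] * b[0*4+j] + a[i*4+1] * b[1*4+j] + a[i*4+2] * b[2*4+j];
				}
				r[i*4+3] += a[i*4+3];	// the parent translation passes straight through
			}
		}
*/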
const int *parents, const int firstJoint, const int lastJoint ) { #if 1 assert_16_byte_aligned( jointMats ); __asm { mov edx, firstJoint mov eax, lastJoint mov ecx, eax sub eax, edx jl done mov esi, jointMats // esi = jointMats lea ecx, [ecx+ecx*2] shl ecx, 4 // ecx = lastJoint * JOINTMAT_SIZE shl edx, 2 mov edi, parents add edi, edx // edi = &parents[firstJoint] shl eax, 2 // eax = ( lastJoint - firstJoint ) * 4 loopJoint: mov edx, [edi+eax] movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0 lea edx, [edx+edx*2] movaps xmm1, [esi+ecx+16] // xmm1 = m2, m3, m4, t1 shl edx, 4 // edx = parents[i] * JOINTMAT_SIZE movaps xmm2, [esi+ecx+32] // xmm2 = m5, m6, m7, t2 movss xmm6, [esi+edx+12] shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) subps xmm0, xmm6 movss xmm7, [esi+edx+28] shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 2, 3, 0 ) subps xmm1, xmm7 movss xmm3, [esi+edx+44] shufps xmm3, xmm3, R_SHUFFLE_PS( 1, 2, 3, 0 ) subps xmm2, xmm3 sub ecx, JOINTMAT_SIZE sub eax, 4 movss xmm4, [esi+edx+ 0] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm0 movss xmm5, [esi+edx+16] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm1 addps xmm4, xmm5 movss xmm6, [esi+edx+32] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm2 addps xmm4, xmm6 movaps [esi+ecx+JOINTMAT_SIZE+ 0], xmm4 movss xmm5, [esi+edx+ 4] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm0 movss xmm6, [esi+edx+20] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm1 addps xmm5, xmm6 movss xmm7, [esi+edx+36] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm5, xmm7 movaps [esi+ecx+JOINTMAT_SIZE+16], xmm5 movss xmm6, [esi+edx+ 8] shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movss xmm7, [esi+edx+24] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movss xmm3, [esi+edx+40] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm2 addps xmm6, xmm3 movaps [esi+ecx+JOINTMAT_SIZE+32], xmm6 jge loopJoint done: } #else int i; for( i = lastJoint; i >= firstJoint; i-- ) { assert( parents[i] < i ); jointMats[i] /= jointMats[parents[i]]; } #endif } /* ============ idSIMD_SSE::MultiplyJoints ============ */ void VPCALL idSIMD_SSE::MultiplyJoints( idJointMat *result, const idJointMat *joints1, const idJointMat *joints2, const int numJoints ) { #if 1 assert_16_byte_aligned( result ); assert_16_byte_aligned( joints1 ); assert_16_byte_aligned( joints2 ); __asm { mov eax, numJoints test eax, eax jz done mov ecx, joints1 mov edx, joints2 mov edi, result imul eax, JOINTMAT_SIZE add ecx, eax add edx, eax add edi, eax neg eax loopJoint: movaps xmm0, [edx+eax+0] movaps xmm1, [edx+eax+16] movaps xmm2, [edx+eax+32] movss xmm3, [ecx+eax+0] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm0 movss xmm4, [ecx+eax+4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 movss xmm5, [ecx+eax+8] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm2 addps xmm3, xmm5 movss xmm6, [ecx+eax+12] shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) addps xmm3, xmm6 movaps [edi+eax+0], xmm3 movss xmm7, [ecx+eax+16] shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm0 movss xmm4, [ecx+eax+20] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm7, xmm4 movss xmm5, [ecx+eax+24] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm2 addps xmm7, xmm5 movss xmm6, [ecx+eax+28] shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) addps xmm7, xmm6 movaps [edi+eax+16], xmm7 movss xmm3, [ecx+eax+32] shufps 
xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm3, xmm0 movss xmm4, [ecx+eax+36] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm4, xmm1 addps xmm3, xmm4 movss xmm5, [ecx+eax+40] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm5, xmm2 addps xmm3, xmm5 movss xmm6, [ecx+eax+44] shufps xmm6, xmm6, R_SHUFFLE_PS( 1, 2, 3, 0 ) addps xmm3, xmm6 movaps [edi+eax+32], xmm3 add eax, JOINTMAT_SIZE jl loopJoint done: } #else int i; for ( i = 0; i < numJoints; i++ ) { idJointMat::Multiply( result[i], joints1[i], joints2[i] ); } #endif } /* ============ idSIMD_SSE::TransformVerts ============ */ void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( base ); __asm { mov eax, numVerts test eax, eax jz done imul eax, DRAWVERT_SIZE mov ecx, verts mov edx, weights mov esi, base mov edi, joints add ecx, eax neg eax loopVert: mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET] movaps xmm2, [esi] add edx, JOINTWEIGHT_SIZE movaps xmm0, xmm2 add esi, BASEVECTOR_SIZE movaps xmm1, xmm2 mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0 mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1 mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2 cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE je doneWeight loopWeight: mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET] movaps xmm5, [esi] add edx, JOINTWEIGHT_SIZE movaps xmm3, xmm5 add esi, BASEVECTOR_SIZE movaps xmm4, xmm5 mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0 mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1 mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2 cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE addps xmm0, xmm3 addps xmm1, xmm4 addps xmm2, xmm5 jne loopWeight doneWeight: add eax, DRAWVERT_SIZE movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4 unpckhps xmm0, xmm1 // xmm1 = m2, m5, t0, t1 addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2 movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6 movaps xmm5, xmm6 // xmm5 = m6+m8, m7+t2 shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8 addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm5 jl loopVert done: } #else int i, j; const byte *jointsPtr = (byte *)joints; for( j = 0, i = 0; i < numVerts; i++, j++ ) { idVec3 v; v = ( *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ) ) * base[j]; while( weights[j].nextVertexOffset != JOINTWEIGHT_SIZE ) { j++; v += ( *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ) ) * base[j]; } verts[i].xyz = v; } #endif } /* ============ idSIMD_SSE::TransformShadowVerts ============ */ void VPCALL idSIMD_SSE::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const jointWeight_t *weights, const int numWeights ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( base ); __asm { mov eax, numVerts test eax, eax jz done imul eax, DRAWVERT_SIZE mov ecx, verts mov edx, weights mov esi, base mov edi, joints add ecx, eax neg eax movaps xmm0, SIMD_SP_clearLast movaps xmm1, SIMD_SP_lastOne loopVert: 
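/*
	Each iteration below is the scalar equivalent of transforming one point by one joint
	matrix. The andps/orps with SIMD_SP_clearLast / SIMD_SP_lastOne force the w component of
	the loaded position to 1.0f so the translation column is picked up by the dot products,
	and the unpcklps/unpckhps/movlhps/movhlps sequence is a horizontal add that collapses the
	three mulps results into three dot products. A minimal sketch of the per-vertex math
	(names illustrative):

		float v[4] = { xyz[0], xyz[1], xyz[2], 1.0f };	// w forced to one
		for ( int r = 0; r < 3; r++ ) {					// rows of the 3x4 joint matrix
			out[r] = row[r][0] * v[0] + row[r][1] * v[1] + row[r][2] * v[2] + row[r][3] * v[3];
		}
*/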
add esi, DRAWVERT_SIZE mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET] add edx, dword ptr [edx+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET] add eax, DRAWVERT_SIZE movaps xmm3, [esi-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] andps xmm3, xmm0 orps xmm3, xmm1 movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0 mulps xmm4, [edi+ebx+16] // xmm1 = m3, m4, m5, t1 mulps xmm5, [edi+ebx+32] // xmm2 = m6, m7, m8, t2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7 jl loopVert done: } #else int i; const byte *jointsPtr = (byte *)joints; const byte *weightsPtr = (byte *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset ); weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset; mat.Mul( verts[i].xyz, base[i] ); } #endif } /* ============ idSIMD_SSE::TransformShadowVerts ============ */ void VPCALL idSIMD_SSE::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int numWeights ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( base ); __asm { mov eax, numVerts test eax, eax jz done imul eax, DRAWVERT_SIZE mov ecx, verts mov edx, weights mov esi, base mov edi, joints add ecx, eax neg eax movaps xmm0, SIMD_SP_clearLast movaps xmm1, SIMD_SP_lastOne loopVert: add esi, DRAWVERT_SIZE movzx ebx, word ptr [edx] add edx, 2 add eax, DRAWVERT_SIZE movaps xmm3, [esi-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] andps xmm3, xmm0 orps xmm3, xmm1 movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0 mulps xmm4, [edi+ebx+16] // xmm1 = m3, m4, m5, t1 mulps xmm5, [edi+ebx+32] // xmm2 = m6, m7, m8, t2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7 jl loopVert done: } #else int i; const byte *jointsPtr = (byte *)joints; const byte *weightsPtr = (byte *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + (*weightsPtr)*16); weightsPtr++; mat.Mul( verts[i].xyz, base[i].xyz ); } #endif } /* ============ idSIMD_SSE::TransformShadowVerts ============ */ void VPCALL idSIMD_SSE::TransformShadowVerts( shadowCache_t *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int 
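/*
	This overload replaces the jointWeight_t list with a flat array of shorts; the asm
	consumes each short directly as a byte offset into the joint array
	( movzx ebx, word ptr [edx] / [edi+ebx] ), i.e. roughly:

		// minimal sketch, assuming the offsets were premultiplied by the joint
		// matrix size when the weight list was built
		const idJointMat &mat = *(const idJointMat *)( (const byte *)joints + weights[i] );
		mat.Mul( verts[i].xyz, base[i].xyz );
*/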
numWeights ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( base ); __asm { mov eax, numVerts test eax, eax jz done imul eax, SHADOWVERT_SIZE mov ecx, verts mov edx, weights mov esi, base mov edi, joints add ecx, eax neg eax movaps xmm0, SIMD_SP_clearLast movaps xmm1, SIMD_SP_lastOne loopVert: add esi, DRAWVERT_SIZE movzx ebx, word ptr [edx] add edx, 2 add eax, SHADOWVERT_SIZE movaps xmm3, [esi-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] andps xmm3, xmm0 orps xmm3, xmm1 movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0 mulps xmm4, [edi+ebx+16] // xmm1 = m3, m4, m5, t1 mulps xmm5, [edi+ebx+32] // xmm2 = m6, m7, m8, t2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-SHADOWVERT_SIZE+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-SHADOWVERT_SIZE+8], xmm7 jl loopVert done: } #else int i; const byte *jointsPtr = (byte *)joints; const byte *weightsPtr = (byte *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + (*weightsPtr)*16); weightsPtr++; mat.Mul( verts[i].xyz.ToVec3(), base[i].xyz ); } #endif } #if !defined(SD_USE_DRAWVERT_SIZE_32) /* ============ idSIMD_SSE::TransformVertsAndTangents ============ */ void VPCALL idSIMD_SSE::TransformVertsAndTangents( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( base ); __asm { mov eax, numVerts test eax, eax jz done imul eax, DRAWVERT_SIZE mov ecx, verts mov edx, weights mov esi, base mov edi, joints add ecx, eax neg eax loopVert: movss xmm2, [edx+JOINTWEIGHT_WEIGHT_OFFSET] mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) add edx, JOINTWEIGHT_SIZE movaps xmm0, xmm2 movaps xmm1, xmm2 mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0 mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1 mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2 cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE je doneWeight loopWeight: movss xmm5, [edx+JOINTWEIGHT_WEIGHT_OFFSET] mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) add edx, JOINTWEIGHT_SIZE movaps xmm3, xmm5 movaps xmm4, xmm5 mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0 mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1 mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2 cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE addps xmm0, xmm3 addps xmm1, xmm4 addps xmm2, xmm5 jne loopWeight doneWeight: add esi, 3*BASEVECTOR_SIZE add eax, DRAWVERT_SIZE // transform vertex movaps xmm3, [esi-3*BASEVECTOR_SIZE] movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, xmm0 mulps xmm4, xmm1 mulps xmm5, xmm2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, 
m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7 // transform normal movaps xmm3, [esi-2*BASEVECTOR_SIZE] movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, xmm0 mulps xmm4, xmm1 mulps xmm5, xmm2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7 // transform first tangent movaps xmm3, [esi-1*BASEVECTOR_SIZE] movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, xmm0 mulps xmm4, xmm1 mulps xmm5, xmm2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+8], xmm7 jl loopVert done: } #else int i, j; const byte *jointsPtr = (byte *)joints; for( j = i = 0; i < numVerts; i++, j++ ) { idJointMat mat; idJointMat::Mul( mat, *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ), weights[j].weight ); while( weights[j].nextVertexOffset != JOINTWEIGHT_SIZE ) { j++; idJointMat::Mad( mat, *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ), weights[j].weight ); } verts[i].xyz = mat * base[i*3+0]; verts[i].normal = mat * base[i*3+1]; verts[i].tangent.ToVec3() = mat * base[i*3+2]; } #endif } /* ============ idSIMD_SSE::TransformVertsAndTangentsFast ============ */ void VPCALL idSIMD_SSE::TransformVertsAndTangentsFast( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) { #if 1 assert_16_byte_aligned( joints ); assert_16_byte_aligned( base ); __asm { mov eax, numVerts test eax, eax jz done imul eax, DRAWVERT_SIZE mov ecx, verts mov edx, weights mov esi, base mov edi, joints add ecx, eax neg eax loopVert: mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET] add esi, 3*BASEVECTOR_SIZE movaps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0 movaps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1 movaps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2 add edx, dword ptr [edx+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET] add eax, DRAWVERT_SIZE // transform vertex movaps xmm3, [esi-3*BASEVECTOR_SIZE] movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, xmm0 mulps 
xmm4, xmm1 mulps xmm5, xmm2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7 // transform normal movaps xmm3, [esi-2*BASEVECTOR_SIZE] movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, xmm0 mulps xmm4, xmm1 mulps xmm5, xmm2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7 // transform first tangent movaps xmm3, [esi-1*BASEVECTOR_SIZE] movaps xmm4, xmm3 movaps xmm5, xmm3 mulps xmm3, xmm0 mulps xmm4, xmm1 mulps xmm5, xmm2 movaps xmm6, xmm3 // xmm6 = m0, m1, m2, t0 unpcklps xmm6, xmm4 // xmm6 = m0, m3, m1, m4 unpckhps xmm3, xmm4 // xmm4 = m2, m5, t0, t1 addps xmm6, xmm3 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1 movaps xmm7, xmm5 // xmm7 = m6, m7, m8, t2 movlhps xmm5, xmm6 // xmm5 = m6, m7, m0+m2, m3+m5 movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1 addps xmm6, xmm5 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1 movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+0], xmm6 movaps xmm7, xmm6 // xmm7 = m6+m8, m7+t2 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 2, 3 ) // xmm7 = m7+t2, m6+m8 addss xmm7, xmm6 // xmm7 = m6+m8+m7+t2 movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+8], xmm7 jl loopVert done: } #else int i; const byte *jointsPtr = (byte *)joints; const byte *weightsPtr = (byte *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset ); weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset; verts[i].xyz = mat * base[i*3+0]; verts[i].normal = mat * base[i*3+1]; verts[i].tangent.ToVec3() = mat * base[i*3+2]; } #endif } #endif #pragma warning( default : 4731 ) // frame pointer register 'ebx' modified by inline assembly code #if SD_SUPPORT_UNSMOOTHEDTANGENTS /* ============ idSIMD_SSE::DeriveUnsmoothedTangents ============ */ #define DERIVE_UNSMOOTHED_BITANGENT void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) { int i, j; for ( i = 0; i <= numVerts - 4; i += 4 ) { ALIGN16( float s0[4]; ) ALIGN16( float s1[4]; ) ALIGN16( float s2[4]; ) ALIGN16( float d0[4]; ) ALIGN16( float d1[4]; ) ALIGN16( float d2[4]; ) ALIGN16( float d3[4]; ) ALIGN16( float d4[4]; ) ALIGN16( float d5[4]; ) ALIGN16( float d6[4]; ) ALIGN16( float d7[4]; ) ALIGN16( float d8[4]; ) ALIGN16( float d9[4]; ) ALIGN16( float n0[4]; ) ALIGN16( float n1[4]; ) 
ALIGN16( float n2[4]; ) ALIGN16( float t0[4]; ) ALIGN16( float t1[4]; ) ALIGN16( float t2[4]; ) ALIGN16( float t3[4]; ) ALIGN16( float t4[4]; ) ALIGN16( float t5[4]; ) for ( j = 0; j < 4; j++ ) { const idDrawVert *a, *b, *c; const dominantTri_s &dt = dominantTris[i+j]; s0[j] = dt.normalizationScale[0]; s1[j] = dt.normalizationScale[1]; s2[j] = dt.normalizationScale[2]; a = verts + i + j; b = verts + dt.v2; c = verts + dt.v3; d0[j] = b->xyz[0] - a->xyz[0]; d1[j] = b->xyz[1] - a->xyz[1]; d2[j] = b->xyz[2] - a->xyz[2]; d3[j] = b->_st[0] - a->_st[0]; d4[j] = b->_st[1] - a->_st[1]; #if defined( SD_USE_DRAWVERT_SIZE_32 ) d3[j] *= ST_TO_FLOAT; d4[j] *= ST_TO_FLOAT; #endif d5[j] = c->xyz[0] - a->xyz[0]; d6[j] = c->xyz[1] - a->xyz[1]; d7[j] = c->xyz[2] - a->xyz[2]; d8[j] = c->_st[0] - a->_st[0]; d9[j] = c->_st[1] - a->_st[1]; #if defined( SD_USE_DRAWVERT_SIZE_32 ) d8[j] *= ST_TO_FLOAT; d9[j] *= ST_TO_FLOAT; #endif } #if 1 __asm { movaps xmm0, d6 mulps xmm0, d2 movaps xmm1, d7 mulps xmm1, d1 movaps xmm2, d7 mulps xmm2, d0 movaps xmm3, d5 mulps xmm3, d2 movaps xmm4, d5 mulps xmm4, d1 movaps xmm5, d6 mulps xmm5, d0 subps xmm0, xmm1 subps xmm2, xmm3 movaps xmm7, s2 subps xmm4, xmm5 mulps xmm0, xmm7 movaps n0, xmm0 mulps xmm2, xmm7 movaps n1, xmm2 mulps xmm4, xmm7 movaps n2, xmm4 movaps xmm0, d0 mulps xmm0, d9 movaps xmm1, d4 mulps xmm1, d5 movaps xmm2, d1 mulps xmm2, d9 movaps xmm3, d4 mulps xmm3, d6 movaps xmm4, d2 mulps xmm4, d9 movaps xmm5, d4 mulps xmm5, d7 subps xmm0, xmm1 subps xmm2, xmm3 movaps xmm7, s0 subps xmm4, xmm5 mulps xmm0, xmm7 movaps t0, xmm0 mulps xmm2, xmm7 movaps t1, xmm2 mulps xmm4, xmm7 movaps t2, xmm4 #ifndef DERIVE_UNSMOOTHED_BITANGENT movaps xmm0, d3 mulps xmm0, d5 movaps xmm1, d0 mulps xmm1, d8 movaps xmm2, d3 mulps xmm2, d6 movaps xmm3, d1 mulps xmm3, d8 movaps xmm4, d3 mulps xmm4, d7 movaps xmm5, d2 mulps xmm5, d8 #else movaps xmm0, n2 mulps xmm0, t1 movaps xmm1, n1 mulps xmm1, t2 movaps xmm2, n0 mulps xmm2, t2 movaps xmm3, n2 mulps xmm3, t0 movaps xmm4, n1 mulps xmm4, t0 movaps xmm5, n0 mulps xmm5, t1 #endif subps xmm0, xmm1 subps xmm2, xmm3 movaps xmm7, s1 subps xmm4, xmm5 mulps xmm0, xmm7 movaps t3, xmm0 mulps xmm2, xmm7 movaps t4, xmm2 mulps xmm4, xmm7 movaps t5, xmm4 } #else n0[0] = d6[0] * d2[0]; n0[1] = d6[1] * d2[1]; n0[2] = d6[2] * d2[2]; n0[3] = d6[3] * d2[3]; n1[0] = d7[0] * d0[0]; n1[1] = d7[1] * d0[1]; n1[2] = d7[2] * d0[2]; n1[3] = d7[3] * d0[3]; n2[0] = d5[0] * d1[0]; n2[1] = d5[1] * d1[1]; n2[2] = d5[2] * d1[2]; n2[3] = d5[3] * d1[3]; n0[0] -= d7[0] * d1[0]; n0[1] -= d7[1] * d1[1]; n0[2] -= d7[2] * d1[2]; n0[3] -= d7[3] * d1[3]; n1[0] -= d5[0] * d2[0]; n1[1] -= d5[1] * d2[1]; n1[2] -= d5[2] * d2[2]; n1[3] -= d5[3] * d2[3]; n2[0] -= d6[0] * d0[0]; n2[1] -= d6[1] * d0[1]; n2[2] -= d6[2] * d0[2]; n2[3] -= d6[3] * d0[3]; n0[0] *= s2[0]; n0[1] *= s2[1]; n0[2] *= s2[2]; n0[3] *= s2[3]; n1[0] *= s2[0]; n1[1] *= s2[1]; n1[2] *= s2[2]; n1[3] *= s2[3]; n2[0] *= s2[0]; n2[1] *= s2[1]; n2[2] *= s2[2]; n2[3] *= s2[3]; t0[0] = d0[0] * d9[0]; t0[1] = d0[1] * d9[1]; t0[2] = d0[2] * d9[2]; t0[3] = d0[3] * d9[3]; t1[0] = d1[0] * d9[0]; t1[1] = d1[1] * d9[1]; t1[2] = d1[2] * d9[2]; t1[3] = d1[3] * d9[3]; t2[0] = d2[0] * d9[0]; t2[1] = d2[1] * d9[1]; t2[2] = d2[2] * d9[2]; t2[3] = d2[3] * d9[3]; t0[0] -= d4[0] * d5[0]; t0[1] -= d4[1] * d5[1]; t0[2] -= d4[2] * d5[2]; t0[3] -= d4[3] * d5[3]; t1[0] -= d4[0] * d6[0]; t1[1] -= d4[1] * d6[1]; t1[2] -= d4[2] * d6[2]; t1[3] -= d4[3] * d6[3]; t2[0] -= d4[0] * d7[0]; t2[1] -= d4[1] * d7[1]; t2[2] -= d4[2] * d7[2]; t2[3] -= d4[3] * d7[3]; 
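/*
	Written out per vertex, the batch above and below computes ( formulas taken from the
	single-vertex tail of this function ):

		n0 = s2 * ( d6 * d2 - d7 * d1 );	// normal: scaled cross product of the two
		n1 = s2 * ( d7 * d0 - d5 * d2 );	// triangle edges ( d0..d2 and d5..d7 )
		n2 = s2 * ( d5 * d1 - d6 * d0 );
		t0 = s0 * ( d0 * d9 - d4 * d5 );	// tangent from the texture-space deltas
		t1 = s0 * ( d1 * d9 - d4 * d6 );	// ( d3,d4 and d8,d9 )
		t2 = s0 * ( d2 * d9 - d4 * d7 );
		t3 = s1 * ( n2 * t1 - n1 * t2 );	// DERIVE_UNSMOOTHED_BITANGENT: bitangent as a
		t4 = s1 * ( n0 * t2 - n2 * t0 );	// cross product of the normal and tangent
		t5 = s1 * ( n1 * t0 - n0 * t1 );

	The j = 0..3 arrays simply evaluate four vertices of this at once in
	structure-of-arrays form.
*/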
t0[0] *= s0[0]; t0[1] *= s0[1]; t0[2] *= s0[2]; t0[3] *= s0[3]; t1[0] *= s0[0]; t1[1] *= s0[1]; t1[2] *= s0[2]; t1[3] *= s0[3]; t2[0] *= s0[0]; t2[1] *= s0[1]; t2[2] *= s0[2]; t2[3] *= s0[3]; #ifndef DERIVE_UNSMOOTHED_BITANGENT t3[0] = d3[0] * d5[0]; t3[1] = d3[1] * d5[1]; t3[2] = d3[2] * d5[2]; t3[3] = d3[3] * d5[3]; t4[0] = d3[0] * d6[0]; t4[1] = d3[1] * d6[1]; t4[2] = d3[2] * d6[2]; t4[3] = d3[3] * d6[3]; t5[0] = d3[0] * d7[0]; t5[1] = d3[1] * d7[1]; t5[2] = d3[2] * d7[2]; t5[3] = d3[3] * d7[3]; t3[0] -= d0[0] * d8[0]; t3[1] -= d0[1] * d8[1]; t3[2] -= d0[2] * d8[2]; t3[3] -= d0[3] * d8[3]; t4[0] -= d1[0] * d8[0]; t4[1] -= d1[1] * d8[1]; t4[2] -= d1[2] * d8[2]; t4[3] -= d1[3] * d8[3]; t5[0] -= d2[0] * d8[0]; t5[1] -= d2[1] * d8[1]; t5[2] -= d2[2] * d8[2]; t5[3] -= d2[3] * d8[3]; #else t3[0] = n2[0] * t1[0]; t3[1] = n2[1] * t1[1]; t3[2] = n2[2] * t1[2]; t3[3] = n2[3] * t1[3]; t4[0] = n0[0] * t2[0]; t4[1] = n0[1] * t2[1]; t4[2] = n0[2] * t2[2]; t4[3] = n0[3] * t2[3]; t5[0] = n1[0] * t0[0]; t5[1] = n1[1] * t0[1]; t5[2] = n1[2] * t0[2]; t5[3] = n1[3] * t0[3]; t3[0] -= n1[0] * t2[0]; t3[1] -= n1[1] * t2[1]; t3[2] -= n1[2] * t2[2]; t3[3] -= n1[3] * t2[3]; t4[0] -= n2[0] * t0[0]; t4[1] -= n2[1] * t0[1]; t4[2] -= n2[2] * t0[2]; t4[3] -= n2[3] * t0[3]; t5[0] -= n0[0] * t1[0]; t5[1] -= n0[1] * t1[1]; t5[2] -= n0[2] * t1[2]; t5[3] -= n0[3] * t1[3]; #endif t3[0] *= s1[0]; t3[1] *= s1[1]; t3[2] *= s1[2]; t3[3] *= s1[3]; t4[0] *= s1[0]; t4[1] *= s1[1]; t4[2] *= s1[2]; t4[3] *= s1[3]; t5[0] *= s1[0]; t5[1] *= s1[1]; t5[2] *= s1[2]; t5[3] *= s1[3]; #endif for ( j = 0; j < 4; j++ ) { idDrawVert *a; a = verts + i + j; a->SetNormal( n0[j], n1[j], n2[j] ); a->SetTangent( t0[j], t1[j], t2[j] ); a->SetBiTangent( t3[j], t4[j], t5[j] ); } } for ( ; i < numVerts; i++ ) { idDrawVert *a, *b, *c; float d0, d1, d2, d3, d4; float d5, d6, d7, d8, d9; float s0, s1, s2; float n0, n1, n2; float t0, t1, t2; float t3, t4, t5; const dominantTri_s &dt = dominantTris[i]; s0 = dt.normalizationScale[0]; s1 = dt.normalizationScale[1]; s2 = dt.normalizationScale[2]; a = verts + i; b = verts + dt.v2; c = verts + dt.v3; d0 = b->xyz[0] - a->xyz[0]; d1 = b->xyz[1] - a->xyz[1]; d2 = b->xyz[2] - a->xyz[2]; d3 = b->_st[0] - a->_st[0]; d4 = b->_st[1] - a->_st[1]; #if defined( SD_USE_DRAWVERT_SIZE_32 ) d3 *= ST_TO_FLOAT; d4 *= ST_TO_FLOAT; #endif d5 = c->xyz[0] - a->xyz[0]; d6 = c->xyz[1] - a->xyz[1]; d7 = c->xyz[2] - a->xyz[2]; d8 = c->_st[0] - a->_st[0]; d9 = c->_st[1] - a->_st[1]; #if defined( SD_USE_DRAWVERT_SIZE_32 ) d8 *= ST_TO_FLOAT; d9 *= ST_TO_FLOAT; #endif #if 1 __asm { movss xmm0, d6 mulss xmm0, d2 movss xmm1, d7 mulss xmm1, d1 movss xmm2, d7 mulss xmm2, d0 movss xmm3, d5 mulss xmm3, d2 movss xmm4, d5 mulss xmm4, d1 movss xmm5, d6 mulss xmm5, d0 subss xmm0, xmm1 subss xmm2, xmm3 movss xmm7, s2 subss xmm4, xmm5 mulss xmm0, xmm7 movss n0, xmm0 mulss xmm2, xmm7 movss n1, xmm2 mulss xmm4, xmm7 movss n2, xmm4 movss xmm0, d0 mulss xmm0, d9 movss xmm1, d4 mulss xmm1, d5 movss xmm2, d1 mulss xmm2, d9 movss xmm3, d4 mulss xmm3, d6 movss xmm4, d2 mulss xmm4, d9 movss xmm5, d4 mulss xmm5, d7 subss xmm0, xmm1 subss xmm2, xmm3 movss xmm7, s0 subss xmm4, xmm5 mulss xmm0, xmm7 movss t0, xmm0 mulss xmm2, xmm7 movss t1, xmm2 mulss xmm4, xmm7 movss t2, xmm4 #ifndef DERIVE_UNSMOOTHED_BITANGENT movss xmm0, d3 mulss xmm0, d5 movss xmm1, d0 mulss xmm1, d8 movss xmm2, d3 mulss xmm2, d6 movss xmm3, d1 mulss xmm3, d8 movss xmm4, d3 mulss xmm4, d7 movss xmm5, d2 mulss xmm5, d8 #else movss xmm0, n2 mulss xmm0, t1 movss xmm1, n1 mulss xmm1, t2 movss xmm2, 
n0 mulss xmm2, t2 movss xmm3, n2 mulss xmm3, t0 movss xmm4, n1 mulss xmm4, t0 movss xmm5, n0 mulss xmm5, t1 #endif subss xmm0, xmm1 subss xmm2, xmm3 movss xmm7, s1 subss xmm4, xmm5 mulss xmm0, xmm7 movss t3, xmm0 mulss xmm2, xmm7 movss t4, xmm2 mulss xmm4, xmm7 movss t5, xmm4 } #else n0 = s2 * ( d6 * d2 - d7 * d1 ); n1 = s2 * ( d7 * d0 - d5 * d2 ); n2 = s2 * ( d5 * d1 - d6 * d0 ); t0 = s0 * ( d0 * d9 - d4 * d5 ); t1 = s0 * ( d1 * d9 - d4 * d6 ); t2 = s0 * ( d2 * d9 - d4 * d7 ); #ifndef DERIVE_UNSMOOTHED_BITANGENT t3 = s1 * ( d3 * d5 - d0 * d8 ); t4 = s1 * ( d3 * d6 - d1 * d8 ); t5 = s1 * ( d3 * d7 - d2 * d8 ); #else t3 = s1 * ( n2 * t1 - n1 * t2 ); t4 = s1 * ( n0 * t2 - n2 * t0 ); t5 = s1 * ( n1 * t0 - n0 * t1 ); #endif #endif a->SetNormal( n0, n1, n2 ); a->SetTangent( t0, t1, t2 ); a->SetBiTangent( t3, t4, t5 ); } } #endif // SD_SUPPORT_UNSMOOTHEDTANGENTS /* ============ idSIMD_SSE::TracePointCull ============ */ void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) { #if 1 __asm { push ebx mov eax, numVerts test eax, eax jz done mov edi, planes movlps xmm1, [edi] // xmm1 = 0, 1, X, X movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5 movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7 movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13 movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15 movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5 shufps xmm0, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12 shufps xmm1, xmm4, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13 movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7 shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14 shufps xmm3, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15 movss xmm7, radius shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) xor edx, edx mov esi, verts mov edi, cullBits imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert: movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4] mulps xmm4, xmm0 shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulps xmm5, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) addps xmm4, xmm5 mulps xmm6, xmm2 addps xmm4, xmm3 addps xmm4, xmm6 movaps xmm5, xmm4 xorps xmm5, SIMD_SP_signBit cmpltps xmm4, xmm7 movmskps ecx, xmm4 cmpltps xmm5, xmm7 movmskps ebx, xmm5 shl cx, 4 or cl, bl inc edi or dl, cl add eax, DRAWVERT_SIZE mov byte ptr [edi-1], cl jl loopVert done: mov esi, totalOr mov byte ptr [esi], dl pop ebx } #else int i; byte tOr; tOr = 0; for ( i = 0; i < numVerts; i++ ) { byte bits; float d0, d1, d2, d3, t; const idVec3 &v = verts[i].xyz; d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3]; d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3]; d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3]; d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3]; t = d0 + radius; bits = FLOATSIGNBITSET( t ) << 0; t = d1 + radius; bits |= FLOATSIGNBITSET( t ) << 1; t = d2 + radius; bits |= FLOATSIGNBITSET( t ) << 2; t = d3 + radius; bits |= FLOATSIGNBITSET( t ) << 3; t = d0 - radius; bits |= FLOATSIGNBITSET( t ) << 4; t = d1 - radius; bits |= FLOATSIGNBITSET( t ) << 5; t = d2 - radius; bits |= FLOATSIGNBITSET( t ) << 6; t = d3 - radius; bits |= 
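/*
	The byte built here classifies the vertex against the four trace planes, widened by
	+/- radius. Compacted, the loop body is:

		byte bits = 0;
		for ( int p = 0; p < 4; p++ ) {
			float d = planes[p].Distance( v );					// signed plane distance
			bits |= FLOATSIGNBITSET( d + radius ) << p;			// behind plane, pushed out
			bits |= FLOATSIGNBITSET( d - radius ) << ( p + 4 );	// behind plane, pulled in
		}
		bits ^= 0x0F;	// flip lower four bits

	idPlane::Distance is used here as shorthand for the explicit dot products spelled out
	around this comment.
*/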
FLOATSIGNBITSET( t ) << 7; bits ^= 0x0F; // flip lower four bits tOr |= bits; cullBits[i] = bits; } totalOr = tOr; #endif } void VPCALL idSIMD_SSE::TracePointCullShadowVerts( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const struct shadowCache_s *verts, const int numVerts ) { #if 1 __asm { push ebx mov eax, numVerts test eax, eax jz done mov edi, planes movlps xmm1, [edi] // xmm1 = 0, 1, X, X movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5 movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7 movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13 movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15 movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5 shufps xmm0, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12 shufps xmm1, xmm4, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13 movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7 shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14 shufps xmm3, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15 movss xmm7, radius shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) xor edx, edx mov esi, verts mov edi, cullBits imul eax, SHADOWVERT_SIZE add esi, eax neg eax loopVert: movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4] mulps xmm4, xmm0 shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8] mulps xmm5, xmm1 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) addps xmm4, xmm5 mulps xmm6, xmm2 addps xmm4, xmm3 addps xmm4, xmm6 movaps xmm5, xmm4 xorps xmm5, SIMD_SP_signBit cmpltps xmm4, xmm7 movmskps ecx, xmm4 cmpltps xmm5, xmm7 movmskps ebx, xmm5 shl cx, 4 or cl, bl inc edi or dl, cl add eax, SHADOWVERT_SIZE mov byte ptr [edi-1], cl jl loopVert done: mov esi, totalOr mov byte ptr [esi], dl pop ebx } #else int i; byte tOr; tOr = 0; for ( i = 0; i < numVerts; i++ ) { byte bits; float d0, d1, d2, d3, t; const idVec3 &v = verts[i].xyz; d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3]; d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3]; d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3]; d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3]; t = d0 + radius; bits = FLOATSIGNBITSET( t ) << 0; t = d1 + radius; bits |= FLOATSIGNBITSET( t ) << 1; t = d2 + radius; bits |= FLOATSIGNBITSET( t ) << 2; t = d3 + radius; bits |= FLOATSIGNBITSET( t ) << 3; t = d0 - radius; bits |= FLOATSIGNBITSET( t ) << 4; t = d1 - radius; bits |= FLOATSIGNBITSET( t ) << 5; t = d2 - radius; bits |= FLOATSIGNBITSET( t ) << 6; t = d3 - radius; bits |= FLOATSIGNBITSET( t ) << 7; bits ^= 0x0F; // flip lower four bits tOr |= bits; cullBits[i] = bits; } totalOr = tOr; #endif } /* ============ idSIMD_SSE::DecalPointCull ============ */ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) { #if 1 ALIGN16( float p0[4]; ) ALIGN16( float p1[4]; ) ALIGN16( float p2[4]; ) ALIGN16( float p3[4]; ) ALIGN16( float p4[4]; ) ALIGN16( float p5[4]; ) ALIGN16( float p6[4]; ) ALIGN16( float p7[4]; ) __asm { mov ecx, planes movlps xmm1, [ecx] // xmm1 = 0, 1, X, X movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5 movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7 movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13 
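/*
	The movlps/movhps/shufps block here ( and in the other cull routines ) transposes the
	four plane equations from one-plane-per-row into one-coefficient-per-register form, so a
	single point can be tested against all four planes with three mulps/addps pairs. The same
	transpose with intrinsics ( hypothetical helper, not part of this file ):

		#include <xmmintrin.h>
		static void TransposePlanes4( const float p[4][4], __m128 &x, __m128 &y, __m128 &z, __m128 &d ) {
			x = _mm_loadu_ps( p[0] );
			y = _mm_loadu_ps( p[1] );
			z = _mm_loadu_ps( p[2] );
			d = _mm_loadu_ps( p[3] );
			_MM_TRANSPOSE4_PS( x, y, z, d );	// x = all a's, y = all b's, z = all c's, d = all dists
		}
*/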
movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15 movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5 shufps xmm0, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12 shufps xmm1, xmm4, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13 movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7 shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14 shufps xmm3, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15 movaps p0, xmm0 movaps p1, xmm1 movaps p2, xmm2 movaps p3, xmm3 movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51 movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50 shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51 movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53 movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53 movaps p4, xmm4 movaps p5, xmm5 movaps p6, xmm6 movaps p7, xmm7 mov esi, verts mov edi, cullBits mov eax, numVerts and eax, ~1 jz done2 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert2: movaps xmm6, p0 movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 movaps xmm6, p0 movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm3 movaps xmm7, p1 movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movaps xmm7, p2 movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps edx, xmm6 mov ch, dl shufps xmm0, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm0, p4 shufps xmm1, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, p5 addps xmm0, xmm1 shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 add edi, 2 mov dh, dl shl dl, 4 shl dh, 2 and edx, (3<<4)|(3<<12) or ecx, edx add eax, 2*DRAWVERT_SIZE mov word ptr [edi-2], cx jl loopVert2 done2: mov eax, numVerts and eax, 1 jz done movaps xmm6, p0 movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 mulps xmm0, p4 mulps xmm1, p5 addps xmm0, xmm1 mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 and edx, 3 shl edx, 4 or ecx, edx mov byte ptr [edi], cl done: } #else int i; for ( i = 0; i < numVerts - 1; i += 2 ) { unsigned short bits0, bits1; float d0, d1, d2, 
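/*
	Note how the two paths agree: the asm sets a bit with cmpnltps, i.e. d >= 0.0f, while
	this reference path records the sign bit and flips it afterwards, relying on

		( FLOATSIGNBITSET( d ) ^ 1 ) == ( d >= 0.0f )	// for all d except -0.0f

	which is why the lower six bits are XORed with 0x3F ( and 0x3F3F for the packed
	two-vertex store ).
*/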
d3, d4, d5, d6, d7, d8, d9, d10, d11; const idVec3 &v0 = verts[i+0].xyz; const idVec3 &v1 = verts[i+1].xyz; d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3]; d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3]; d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3]; d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3]; d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3]; d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3]; d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3]; d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3]; d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3]; d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3]; d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3]; d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3]; bits0 = FLOATSIGNBITSET( d0 ) << (0+0); bits0 |= FLOATSIGNBITSET( d1 ) << (0+1); bits0 |= FLOATSIGNBITSET( d2 ) << (0+2); bits0 |= FLOATSIGNBITSET( d3 ) << (0+3); bits0 |= FLOATSIGNBITSET( d4 ) << (0+4); bits0 |= FLOATSIGNBITSET( d5 ) << (0+5); bits1 = FLOATSIGNBITSET( d6 ) << (8+0); bits1 |= FLOATSIGNBITSET( d7 ) << (8+1); bits1 |= FLOATSIGNBITSET( d8 ) << (8+2); bits1 |= FLOATSIGNBITSET( d9 ) << (8+3); bits1 |= FLOATSIGNBITSET( d10 ) << (8+4); bits1 |= FLOATSIGNBITSET( d11 ) << (8+5); *(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F; } if ( numVerts & 1 ) { byte bits; float d0, d1, d2, d3, d4, d5; const idVec3 &v = verts[numVerts - 1].xyz; d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3]; d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3]; d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3]; d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3]; d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3]; d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3]; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 2; bits |= FLOATSIGNBITSET( d3 ) << 3; bits |= FLOATSIGNBITSET( d4 ) << 4; bits |= FLOATSIGNBITSET( d5 ) << 5; cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits } #endif } /* ============ idSIMD_SSE::DecalPointCull ============ */ void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts, int *indexes, int numIndexes ) { #if 0 ALIGN16( float p0[4]; ) ALIGN16( float p1[4]; ) ALIGN16( float p2[4]; ) ALIGN16( float p3[4]; ) ALIGN16( float p4[4]; ) ALIGN16( float p5[4]; ) ALIGN16( float p6[4]; ) ALIGN16( float p7[4]; ) __asm { mov ecx, planes movlps xmm1, [ecx] // xmm1 = 0, 1, X, X movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5 movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7 movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13 movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15 movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5 shufps xmm0, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12 shufps xmm1, xmm4, R_SHUFFLE_PS( 1, 
3, 1, 3 ) // xmm1 = 1, 5, 9, 13 movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7 shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14 shufps xmm3, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15 movaps p0, xmm0 movaps p1, xmm1 movaps p2, xmm2 movaps p3, xmm3 movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51 movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50 shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51 movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53 movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53 movaps p4, xmm4 movaps p5, xmm5 movaps p6, xmm6 movaps p7, xmm7 mov esi, verts mov edi, cullBits mov eax, numVerts and eax, ~1 jz done2 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert2: movaps xmm6, p0 movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 movaps xmm6, p0 movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm3 movaps xmm7, p1 movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movaps xmm7, p2 movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps edx, xmm6 mov ch, dl shufps xmm0, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm0, p4 shufps xmm1, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, p5 addps xmm0, xmm1 shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 add edi, 2 mov dh, dl shl dl, 4 shl dh, 2 and edx, (3<<4)|(3<<12) or ecx, edx add eax, 2*DRAWVERT_SIZE mov word ptr [edi-2], cx jl loopVert2 done2: mov eax, numVerts and eax, 1 jz done movaps xmm6, p0 movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 mulps xmm0, p4 mulps xmm1, p5 addps xmm0, xmm1 mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 and edx, 3 shl edx, 4 or ecx, edx mov byte ptr [edi], cl done: } #else int i; for ( i = 0; i < numIndexes - 1; i += 2 ) { unsigned short bits0, bits1; float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; int idx0 = indexes[i+0]; int idx1 = indexes[i+1]; const idVec3 &v0 = verts[idx0].xyz; const idVec3 &v1 = verts[idx1].xyz; d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + 
planes[0][3];
d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
bits0 = FLOATSIGNBITSET( d0 ) << (0+0); bits0 |= FLOATSIGNBITSET( d1 ) << (0+1); bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
bits0 |= FLOATSIGNBITSET( d3 ) << (0+3); bits0 |= FLOATSIGNBITSET( d4 ) << (0+4); bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
bits1 = FLOATSIGNBITSET( d6 ) << (8+0); bits1 |= FLOATSIGNBITSET( d7 ) << (8+1); bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
bits1 |= FLOATSIGNBITSET( d9 ) << (8+3); bits1 |= FLOATSIGNBITSET( d10 ) << (8+4); bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
cullBits[idx0] = bits0 ^ 0x3F;
cullBits[idx1] = ( bits1 >> 8 ) ^ 0x3F;	// bits1 holds its flags in the upper byte
}
if ( numIndexes & 1 ) {	// the loop above consumes indexes in pairs
byte bits;
float d0, d1, d2, d3, d4, d5;
int idx = indexes[ numIndexes-1 ];
const idVec3 &v = verts[idx].xyz;
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 2;
bits |= FLOATSIGNBITSET( d3 ) << 3; bits |= FLOATSIGNBITSET( d4 ) << 4; bits |= FLOATSIGNBITSET( d5 ) << 5;
cullBits[idx] = bits ^ 0x3F;	// flip lower 6 bits
}
#endif
}
/*
============
idSIMD_SSE::DecalPointCull
============
*/
void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts, unsigned short *indexes, int numIndexes ) {
#if 0
ALIGN16( float p0[4]; ) ALIGN16( float p1[4]; ) ALIGN16( float p2[4]; ) ALIGN16( float p3[4]; )
ALIGN16( float p4[4]; ) ALIGN16( float p5[4]; ) ALIGN16( float p6[4]; ) ALIGN16( float p7[4]; )
__asm {
mov ecx, planes
movlps xmm1, [ecx] // xmm1 = 0, 1, X, X
movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5
movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X
movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7
movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X
movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13
movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X
movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15
movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
shufps xmm0, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
shufps xmm1, xmm4, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
shufps xmm3, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11,
15 movaps p0, xmm0 movaps p1, xmm1 movaps p2, xmm2 movaps p3, xmm3 movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51 movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51 shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50 shufps xmm5, xmm5, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51 movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53 movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53 shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53 movaps p4, xmm4 movaps p5, xmm5 movaps p6, xmm6 movaps p7, xmm7 mov esi, verts mov edi, cullBits mov eax, numVerts and eax, ~1 jz done2 imul eax, DRAWVERT_SIZE add esi, eax neg eax loopVert2: movaps xmm6, p0 movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 movaps xmm6, p0 movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm3, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm3 movaps xmm7, p1 movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm4 addps xmm6, xmm7 movaps xmm7, p2 movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm5 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps edx, xmm6 mov ch, dl shufps xmm0, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm0, p4 shufps xmm1, xmm4, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, p5 addps xmm0, xmm1 shufps xmm2, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 add edi, 2 mov dh, dl shl dl, 4 shl dh, 2 and edx, (3<<4)|(3<<12) or ecx, edx add eax, 2*DRAWVERT_SIZE mov word ptr [edi-2], cx jl loopVert2 done2: mov eax, numVerts and eax, 1 jz done movaps xmm6, p0 movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm6, xmm0 movaps xmm7, p1 movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm1 addps xmm6, xmm7 movaps xmm7, p2 movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm7, xmm2 addps xmm6, xmm7 addps xmm6, p3 cmpnltps xmm6, SIMD_SP_zero movmskps ecx, xmm6 mulps xmm0, p4 mulps xmm1, p5 addps xmm0, xmm1 mulps xmm2, p6 addps xmm0, xmm2 addps xmm0, p7 cmpnltps xmm0, SIMD_SP_zero movmskps edx, xmm0 and edx, 3 shl edx, 4 or ecx, edx mov byte ptr [edi], cl done: } #else int i; for ( i = 0; i < numIndexes - 1; i += 2 ) { unsigned short bits0, bits1; float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; int idx0 = indexes[i+0]; int idx1 = indexes[i+1]; const idVec3 &v0 = verts[idx0].xyz; const idVec3 &v1 = verts[idx1].xyz; d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3]; d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3]; d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3]; d3 = planes[3][0] 
* v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
bits0 = FLOATSIGNBITSET( d0 ) << (0+0); bits0 |= FLOATSIGNBITSET( d1 ) << (0+1); bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
bits0 |= FLOATSIGNBITSET( d3 ) << (0+3); bits0 |= FLOATSIGNBITSET( d4 ) << (0+4); bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
bits1 = FLOATSIGNBITSET( d6 ) << (8+0); bits1 |= FLOATSIGNBITSET( d7 ) << (8+1); bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
bits1 |= FLOATSIGNBITSET( d9 ) << (8+3); bits1 |= FLOATSIGNBITSET( d10 ) << (8+4); bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
cullBits[idx0] = bits0 ^ 0x3F;
cullBits[idx1] = ( bits1 >> 8 ) ^ 0x3F;	// bits1 holds its flags in the upper byte
}
if ( numIndexes & 1 ) {	// the loop above consumes indexes in pairs
byte bits;
float d0, d1, d2, d3, d4, d5;
int idx = indexes[ numIndexes-1 ];
const idVec3 &v = verts[idx].xyz;
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 2;
bits |= FLOATSIGNBITSET( d3 ) << 3; bits |= FLOATSIGNBITSET( d4 ) << 4; bits |= FLOATSIGNBITSET( d5 ) << 5;
cullBits[idx] = bits ^ 0x3F;	// flip lower 6 bits
}
#endif
}
/*
============
idSIMD_SSE::OverlayPointCull
============
*/
void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
__asm {
mov eax, numVerts
mov edx, verts
mov esi, texCoords
mov edi, cullBits
mov ecx, planes
movss xmm4, [ecx+ 0]
movss xmm5, [ecx+16]
shufps xmm4, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 )
shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 )
movss xmm5, [ecx+ 4]
movss xmm6, [ecx+20]
shufps xmm5, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 2, 0, 2 )
movss xmm6, [ecx+ 8]
movss xmm7, [ecx+24]
shufps xmm6, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 )
movss xmm7, [ecx+12]
movss xmm0, [ecx+28]
shufps xmm7, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 )
shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 )
and eax, ~1
jz done2
add edi, eax
neg eax
loopVert2:
movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )
mulps xmm0, xmm4
movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 )
mulps xmm1, xmm5
movss xmm2,
[edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 addps xmm0, xmm7 movaps [esi], xmm0 movaps xmm1, xmm0 movaps xmm2, SIMD_SP_one subps xmm2, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 ) shufps xmm1, xmm2, R_SHUFFLE_PS( 2, 3, 2, 3 ) add edx, 2*DRAWVERT_SIZE movmskps ecx, xmm0 mov byte ptr [edi+eax+0], cl add esi, 4*4 movmskps ecx, xmm1 mov byte ptr [edi+eax+1], cl add eax, 2 jl loopVert2 done2: mov eax, numVerts and eax, 1 jz done movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm0, xmm4 movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, xmm5 movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 addps xmm0, xmm7 movlps [esi], xmm0 movaps xmm1, xmm0 movaps xmm2, SIMD_SP_one subps xmm2, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 ) movmskps ecx, xmm0 mov byte ptr [edi], cl done: } #else const idPlane &p0 = planes[0]; const idPlane &p1 = planes[1]; for ( int i = 0; i < numVerts - 1; i += 2 ) { unsigned short bits; float d0, d1, d2, d3; const idVec3 &v0 = verts[i+0].xyz; const idVec3 &v1 = verts[i+1].xyz; d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3]; d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3]; d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3]; d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3]; texCoords[i+0][0] = d0; texCoords[i+0][1] = d1; texCoords[i+1][0] = d2; texCoords[i+1][1] = d3; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 8; bits |= FLOATSIGNBITSET( d3 ) << 9; d0 = 1.0f - d0; d1 = 1.0f - d1; d2 = 1.0f - d2; d3 = 1.0f - d3; bits |= FLOATSIGNBITSET( d0 ) << 2; bits |= FLOATSIGNBITSET( d1 ) << 3; bits |= FLOATSIGNBITSET( d2 ) << 10; bits |= FLOATSIGNBITSET( d3 ) << 11; *(unsigned short *)(cullBits + i) = bits; } if ( numVerts & 1 ) { byte bits; float d0, d1; const idPlane &p0 = planes[0]; const idPlane &p1 = planes[1]; const idVec3 &v0 = verts[numVerts - 1].xyz; d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3]; d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3]; texCoords[numVerts - 1][0] = d0; texCoords[numVerts - 1][1] = d1; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; d0 = 1.0f - d0; d1 = 1.0f - d1; bits |= FLOATSIGNBITSET( d0 ) << 2; bits |= FLOATSIGNBITSET( d1 ) << 3; cullBits[numVerts - 1] = bits; } #endif } /* ============ idSIMD_SSE::OverlayPointCull ============ */ void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const shadowCache_t *verts, const int numVerts ) { #if 1 __asm { mov eax, numVerts mov edx, verts mov esi, texCoords mov edi, cullBits mov ecx, planes movss xmm4, [ecx+ 0] movss xmm5, [ecx+16] shufps xmm4, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 ) shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 2, 0, 2 ) movss xmm5, [ecx+ 4] movss xmm6, [ecx+20] shufps xmm5, xmm6, R_SHUFFLE_PS( 0, 0, 0, 0 ) shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 2, 0, 2 ) movss xmm6, [ecx+ 8] movss xmm7, [ecx+24] shufps xmm6, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) movss xmm7, [ecx+12] movss xmm0, [ecx+28] shufps xmm7, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) and eax, ~1 jz done2 
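/*
	The loopVert2 block below evaluates both overlay texgen planes for two
	shadow verts at a time: xmm0 ends up holding ( d0(v0), d1(v0), d0(v1), d1(v1) ),
	which is stored out directly as the two idVec2 texcoords. Each vert's cull
	byte is then the four sign bits of ( d0, d1, 1-d0, 1-d1 ) collected with
	movmskps. A scalar sketch of the per-vertex work, where p0 = planes[0],
	p1 = planes[1] and v is the vertex position (this mirrors the fallback in
	the #else branch further down):

		float d0 = p0[0] * v[0] + p0[1] * v[1] + p0[2] * v[2] + p0[3];
		float d1 = p1[0] * v[0] + p1[1] * v[1] + p1[2] * v[2] + p1[3];
		byte bits;
		bits  = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;
		bits |= FLOATSIGNBITSET( 1.0f - d0 ) << 2;
		bits |= FLOATSIGNBITSET( 1.0f - d1 ) << 3;
*/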
add edi, eax neg eax loopVert2: movss xmm0, [edx+0*SHADOWVERT_SIZE+0] movss xmm1, [edx+1*SHADOWVERT_SIZE+0] shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm0, xmm4 movss xmm1, [edx+0*SHADOWVERT_SIZE+4] movss xmm2, [edx+1*SHADOWVERT_SIZE+4] shufps xmm1, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, xmm5 movss xmm2, [edx+0*SHADOWVERT_SIZE+8] movss xmm3, [edx+1*SHADOWVERT_SIZE+8] shufps xmm2, xmm3, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 addps xmm0, xmm7 movaps [esi], xmm0 movaps xmm1, xmm0 movaps xmm2, SIMD_SP_one subps xmm2, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 ) shufps xmm1, xmm2, R_SHUFFLE_PS( 2, 3, 2, 3 ) add edx, 2*SHADOWVERT_SIZE movmskps ecx, xmm0 mov byte ptr [edi+eax+0], cl add esi, 4*4 movmskps ecx, xmm1 mov byte ptr [edi+eax+1], cl add eax, 2 jl loopVert2 done2: mov eax, numVerts and eax, 1 jz done movss xmm0, [edx+0*SHADOWVERT_SIZE+0] shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm0, xmm4 movss xmm1, [edx+0*SHADOWVERT_SIZE+4] shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm1, xmm5 movss xmm2, [edx+0*SHADOWVERT_SIZE+8] shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm2, xmm6 addps xmm0, xmm1 addps xmm0, xmm2 addps xmm0, xmm7 movlps [esi], xmm0 movaps xmm1, xmm0 movaps xmm2, SIMD_SP_one subps xmm2, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 ) movmskps ecx, xmm0 mov byte ptr [edi], cl done: } #else const idPlane &p0 = planes[0]; const idPlane &p1 = planes[1]; for ( int i = 0; i < numVerts - 1; i += 2 ) { unsigned short bits; float d0, d1, d2, d3; const idVec3 &v0 = verts[i+0].xyz.ToVec3(); const idVec3 &v1 = verts[i+1].xyz.ToVec3(); d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3]; d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3]; d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3]; d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3]; texCoords[i+0][0] = d0; texCoords[i+0][1] = d1; texCoords[i+1][0] = d2; texCoords[i+1][1] = d3; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 8; bits |= FLOATSIGNBITSET( d3 ) << 9; d0 = 1.0f - d0; d1 = 1.0f - d1; d2 = 1.0f - d2; d3 = 1.0f - d3; bits |= FLOATSIGNBITSET( d0 ) << 2; bits |= FLOATSIGNBITSET( d1 ) << 3; bits |= FLOATSIGNBITSET( d2 ) << 10; bits |= FLOATSIGNBITSET( d3 ) << 11; *(unsigned short *)(cullBits + i) = bits; } if ( numVerts & 1 ) { byte bits; float d0, d1; const idPlane &p0 = planes[0]; const idPlane &p1 = planes[1]; const idVec3 &v0 = verts[numVerts - 1].xyz.ToVec3(); d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3]; d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3]; texCoords[numVerts - 1][0] = d0; texCoords[numVerts - 1][1] = d1; bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; d0 = 1.0f - d0; d1 = 1.0f - d1; bits |= FLOATSIGNBITSET( d0 ) << 2; bits |= FLOATSIGNBITSET( d1 ) << 3; cullBits[numVerts - 1] = bits; } #endif } /* ============ idSIMD_SSE::DeriveTriPlanes ============ */ #pragma warning( disable : 4731 ) // frame pointer register 'ebx' modified by inline assembly code void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const vertIndex_t *indexes, const int numIndexes ) { #if defined( GL_INDEX_SHORT ) assert_sizeof( vertIndex_t, 2 ); int d, a; int n = numIndexes / 3; ALIGN16( float x0[4] ); ALIGN16( float x1[4] ); ALIGN16( float x2[4] ); __asm { push ebx mov eax, n shl eax, 4 mov esi, verts mov edi, indexes mov edx, planes add edx, eax neg eax mov d, edx add 
eax, 4*16 jge done4 loopPlane4: mov a, eax movzx ecx, word ptr [edi+0*6+0] shl ecx, DRAWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+1*6+0] shl ebx, DRAWVERT_SIZE_SHIFT movzx edx, word ptr [edi+2*6+0] shl edx, DRAWVERT_SIZE_SHIFT movzx eax, word ptr [edi+3*6+0] shl eax, DRAWVERT_SIZE_SHIFT movlps xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ movzx ecx, word ptr [edi+0*6+2] shl ecx, DRAWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+1*6+2] shl ebx, DRAWVERT_SIZE_SHIFT movzx edx, word ptr [edi+2*6+2] shl edx, DRAWVERT_SIZE_SHIFT movzx eax, word ptr [edi+3*6+2] shl eax, DRAWVERT_SIZE_SHIFT movaps x0, xmm3 movaps x1, xmm4 movaps x2, xmm5 movlps xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, X, X */ movss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, X, X, X */ movhps xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, 4, 5 */ movhps xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, X, 6, X */ movlps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, 14, X */ movaps xmm0, xmm1 /* xmm0 = 0, 1, 4, 5 */ shufps xmm0, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm0 = 0, 4, 8, 12 */ shufps xmm1, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm1 = 1, 5, 9, 13 */ shufps xmm2, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm2 = 2, 6, 10, 14 */ movzx ecx, word ptr [edi+0*6+4] shl ecx, DRAWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+1*6+4] shl ebx, DRAWVERT_SIZE_SHIFT movzx edx, word ptr [edi+2*6+4] shl edx, DRAWVERT_SIZE_SHIFT movzx eax, word ptr [edi+3*6+4] shl eax, DRAWVERT_SIZE_SHIFT subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 movlps xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ mov eax, a mov edx, d add edi, 4*6 subps xmm3, x0 subps xmm4, x1 subps xmm5, x2 movaps xmm6, xmm4 mulps xmm6, xmm2 movaps xmm7, xmm5 mulps xmm7, xmm1 subps xmm6, xmm7 mulps xmm5, xmm0 mulps xmm2, xmm3 subps xmm5, xmm2 mulps xmm3, xmm1 mulps xmm4,
xmm0 subps xmm3, xmm4 add eax, 4*16 movaps xmm0, xmm6 mulps xmm6, xmm6 movaps xmm1, xmm5 mulps xmm5, xmm5 movaps xmm2, xmm3 mulps xmm3, xmm3 addps xmm3, xmm5 addps xmm3, xmm6 rsqrtps xmm3, xmm3 mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 movaps xmm4, x0 movaps xmm5, x1 movaps xmm6, x2 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 addps xmm4, xmm5 addps xmm4, xmm6 xorps xmm4, SIMD_SP_signBit // transpose xmm0, xmm1, xmm2, xmm4 to memory movaps xmm7, xmm0 movaps xmm5, xmm2 unpcklps xmm0, xmm1 unpcklps xmm2, xmm4 movlps [edx+eax-8*16+0], xmm0 movlps [edx+eax-8*16+8], xmm2 movhps [edx+eax-7*16+0], xmm0 movhps [edx+eax-7*16+8], xmm2 unpckhps xmm7, xmm1 unpckhps xmm5, xmm4 movlps [edx+eax-6*16+0], xmm7 movlps [edx+eax-6*16+8], xmm5 movhps [edx+eax-5*16+0], xmm7 movhps [edx+eax-5*16+8], xmm5 jle loopPlane4 done4: sub eax, 4*16 jge done loopPlane1: movzx ecx, word ptr [edi+0] shl ecx, DRAWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+2] shl ebx, DRAWVERT_SIZE_SHIFT movzx edx, word ptr [edi+4] shl edx, DRAWVERT_SIZE_SHIFT movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm3, [esi+edx+DRAWVERT_XYZ_OFFSET+0] subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+4] subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+8] subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm6, xmm4 mulss xmm6, xmm2 movss xmm7, xmm5 mulss xmm7, xmm1 subss xmm6, xmm7 add edi, 1*6 mulss xmm5, xmm0 mulss xmm2, xmm3 subss xmm5, xmm2 mulss xmm3, xmm1 mulss xmm4, xmm0 subss xmm3, xmm4 mov edx, d movss xmm0, xmm6 mulss xmm6, xmm6 movss xmm1, xmm5 mulss xmm5, xmm5 movss xmm2, xmm3 mulss xmm3, xmm3 add eax, 1*16 addss xmm3, xmm5 addss xmm3, xmm6 rsqrtss xmm3, xmm3 mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 movss [edx+eax-1*16+0], xmm0 movss [edx+eax-1*16+4], xmm1 movss [edx+eax-1*16+8], xmm2 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] xorps xmm0, SIMD_SP_firstSignBit subss xmm0, xmm1 subss xmm0, xmm2 movss [edx+eax-1*16+12], xmm0 jl loopPlane1 done: pop ebx } #elif defined ( GL_INDEX_INT ) assert_sizeof( vertIndex_t, 4 ); int d, a; int n = numIndexes / 3; ALIGN16( float x0[4] ); ALIGN16( float x1[4] ); ALIGN16( float x2[4] ); __asm { push ebx mov eax, n shl eax, 4 mov esi, verts mov edi, indexes mov edx, planes add edx, eax neg eax mov d, edx add eax, 4*16 jge done4 loopPlane4: mov a, eax mov ecx, dword ptr [edi+0*12+0] shl ecx, DRAWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+1*12+0] shl ebx, DRAWVERT_SIZE_SHIFT mov edx, dword ptr [edi+2*12+0] shl edx, DRAWVERT_SIZE_SHIFT mov eax, dword ptr [edi+3*12+0] shl eax, DRAWVERT_SIZE_SHIFT movlps xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0,
2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ mov ecx, dword ptr [edi+0*12+4] shl ecx, DRAWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+1*12+4] shl ebx, DRAWVERT_SIZE_SHIFT mov edx, dword ptr [edi+2*12+4] shl edx, DRAWVERT_SIZE_SHIFT mov eax, dword ptr [edi+3*12+4] shl eax, DRAWVERT_SIZE_SHIFT movaps x0, xmm3 movaps x1, xmm4 movaps x2, xmm5 movlps xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, X, X */ movss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, X, X, X */ movhps xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] /* xmm1 = 0, 1, 4, 5 */ movhps xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] /* xmm2 = 2, X, 6, X */ movlps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, 14, X */ movaps xmm0, xmm1 /* xmm0 = 0, 1, 4, 5 */ shufps xmm0, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm0 = 0, 4, 8, 12 */ shufps xmm1, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm1 = 1, 5, 9, 13 */ shufps xmm2, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm2 = 2, 6, 10, 14 */ mov ecx, dword ptr [edi+0*12+8] shl ecx, DRAWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+1*12+8] shl ebx, DRAWVERT_SIZE_SHIFT mov edx, dword ptr [edi+2*12+8] shl edx, DRAWVERT_SIZE_SHIFT mov eax, dword ptr [edi+3*12+8] shl eax, DRAWVERT_SIZE_SHIFT subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 movlps xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+DRAWVERT_XYZ_OFFSET+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ mov eax, a mov edx, d add edi, 4*12 subps xmm3, x0 subps xmm4, x1 subps xmm5, x2 movaps xmm6, xmm4 mulps xmm6, xmm2 movaps xmm7, xmm5 mulps xmm7, xmm1 subps xmm6, xmm7 mulps xmm5, xmm0 mulps xmm2, xmm3 subps xmm5, xmm2 mulps xmm3, xmm1 mulps xmm4, xmm0 subps xmm3, xmm4 add eax, 4*16 movaps xmm0, xmm6 mulps xmm6, xmm6 movaps xmm1, xmm5 mulps xmm5, xmm5 movaps xmm2, xmm3 mulps xmm3, xmm3 addps xmm3, xmm5 addps xmm3, xmm6 rsqrtps xmm3, xmm3 mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 movaps xmm4, x0 movaps xmm5, x1 movaps xmm6, x2 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 addps xmm4, xmm5 addps xmm4, xmm6 xorps xmm4, SIMD_SP_signBit // transpose xmm0, xmm1, xmm2, xmm4 to memory movaps xmm7, xmm0 movaps xmm5, xmm2 unpcklps xmm0, xmm1 unpcklps xmm2, xmm4 movlps [edx+eax-8*16+0], xmm0 movlps [edx+eax-8*16+8], xmm2 movhps [edx+eax-7*16+0], xmm0 movhps [edx+eax-7*16+8], xmm2 unpckhps xmm7, xmm1 unpckhps xmm5, xmm4 movlps [edx+eax-6*16+0], xmm7 movlps [edx+eax-6*16+8], xmm5 movhps [edx+eax-5*16+0], xmm7 movhps [edx+eax-5*16+8], xmm5 jle loopPlane4 done4: sub eax, 4*16 jge done loopPlane1: mov ecx, dword ptr [edi+0] shl ecx,
DRAWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+4] shl ebx, DRAWVERT_SIZE_SHIFT mov edx, dword ptr [edi+8] shl edx, DRAWVERT_SIZE_SHIFT movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0] subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4] subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8] subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm3, [esi+edx+DRAWVERT_XYZ_OFFSET+0] subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+4] subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+8] subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] movss xmm6, xmm4 mulss xmm6, xmm2 movss xmm7, xmm5 mulss xmm7, xmm1 subss xmm6, xmm7 add edi, 1*12 mulss xmm5, xmm0 mulss xmm2, xmm3 subss xmm5, xmm2 mulss xmm3, xmm1 mulss xmm4, xmm0 subss xmm3, xmm4 mov edx, d movss xmm0, xmm6 mulss xmm6, xmm6 movss xmm1, xmm5 mulss xmm5, xmm5 movss xmm2, xmm3 mulss xmm3, xmm3 add eax, 1*16 addss xmm3, xmm5 addss xmm3, xmm6 rsqrtss xmm3, xmm3 mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 movss [edx+eax-1*16+0], xmm0 movss [edx+eax-1*16+4], xmm1 movss [edx+eax-1*16+8], xmm2 mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0] mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4] mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8] xorps xmm0, SIMD_SP_firstSignBit subss xmm0, xmm1 subss xmm0, xmm2 movss [edx+eax-1*16+12], xmm0 jl loopPlane1 done: pop ebx } #else int i, j; for ( i = 0; i <= numIndexes - 12; i += 12 ) { ALIGN16( float d0[4]; ) ALIGN16( float d1[4]; ) ALIGN16( float d2[4]; ) ALIGN16( float d3[4]; ) ALIGN16( float d4[4]; ) ALIGN16( float d5[4]; ) ALIGN16( float n0[4]; ) ALIGN16( float n1[4]; ) ALIGN16( float n2[4]; ) for ( j = 0; j < 4; j++ ) { const idDrawVert *a, *b, *c; a = verts + indexes[i + j * 3 + 0]; b = verts + indexes[i + j * 3 + 1]; c = verts + indexes[i + j * 3 + 2]; d0[j] = b->xyz[0] - a->xyz[0]; d1[j] = b->xyz[1] - a->xyz[1]; d2[j] = b->xyz[2] - a->xyz[2]; d3[j] = c->xyz[0] - a->xyz[0]; d4[j] = c->xyz[1] - a->xyz[1]; d5[j] = c->xyz[2] - a->xyz[2]; } ALIGN16( float tmp[4]; ) n0[0] = d4[0] * d2[0]; n0[1] = d4[1] * d2[1]; n0[2] = d4[2] * d2[2]; n0[3] = d4[3] * d2[3]; n0[0] -= d5[0] * d1[0]; n0[1] -= d5[1] * d1[1]; n0[2] -= d5[2] * d1[2]; n0[3] -= d5[3] * d1[3]; n1[0] = d5[0] * d0[0]; n1[1] = d5[1] * d0[1]; n1[2] = d5[2] * d0[2]; n1[3] = d5[3] * d0[3]; n1[0] -= d3[0] * d2[0]; n1[1] -= d3[1] * d2[1]; n1[2] -= d3[2] * d2[2]; n1[3] -= d3[3] * d2[3]; n2[0] = d3[0] * d1[0]; n2[1] = d3[1] * d1[1]; n2[2] = d3[2] * d1[2]; n2[3] = d3[3] * d1[3]; n2[0] -= d4[0] * d0[0]; n2[1] -= d4[1] * d0[1]; n2[2] -= d4[2] * d0[2]; n2[3] -= d4[3] * d0[3]; tmp[0] = n0[0] * n0[0]; tmp[1] = n0[1] * n0[1]; tmp[2] = n0[2] * n0[2]; tmp[3] = n0[3] * n0[3]; tmp[0] += n1[0] * n1[0]; tmp[1] += n1[1] * n1[1]; tmp[2] += n1[2] * n1[2]; tmp[3] += n1[3] * n1[3]; tmp[0] += n2[0] * n2[0]; tmp[1] += n2[1] * n2[1]; tmp[2] += n2[2] * n2[2]; tmp[3] += n2[3] * n2[3]; tmp[0] = idMath::RSqrt( tmp[0] ); tmp[1] = idMath::RSqrt( tmp[1] ); tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); n0[0] *= tmp[0]; n0[1] *= tmp[1]; n0[2] *= tmp[2]; n0[3] *= tmp[3]; n1[0] *= tmp[0]; n1[1] *= tmp[1]; n1[2] *= tmp[2]; n1[3] *= tmp[3]; n2[0] *= tmp[0]; n2[1] *= tmp[1]; n2[2] *= tmp[2]; n2[3] *= tmp[3]; for ( j = 0; j < 4; j++ ) { const idDrawVert *a; a = verts + indexes[i + j * 3]; planes->Normal()[0] = n0[j]; planes->Normal()[1] = n1[j]; planes->Normal()[2] = n2[j]; planes->FitThroughPoint( a->xyz ); planes++; } } for ( ; i < numIndexes; i += 3 ) { const 
idDrawVert *a, *b, *c; float d0, d1, d2, d3, d4, d5; float n0, n1, n2; a = verts + indexes[i + 0]; b = verts + indexes[i + 1]; c = verts + indexes[i + 2]; d0 = b->xyz[0] - a->xyz[0]; d1 = b->xyz[1] - a->xyz[1]; d2 = b->xyz[2] - a->xyz[2]; d3 = c->xyz[0] - a->xyz[0]; d4 = c->xyz[1] - a->xyz[1]; d5 = c->xyz[2] - a->xyz[2]; float tmp; n0 = d4 * d2 - d5 * d1; n1 = d5 * d0 - d3 * d2; n2 = d3 * d1 - d4 * d0; tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 ); n0 *= tmp; n1 *= tmp; n2 *= tmp; planes->Normal()[0] = n0; planes->Normal()[1] = n1; planes->Normal()[2] = n2; planes->FitThroughPoint( a->xyz ); planes++; } #endif } /* ============ idSIMD_SSE::DeriveTriPlanes ============ */ void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const shadowCache_t *verts, const int numVerts, const vertIndex_t *indexes, const int numIndexes ) { #if defined( GL_INDEX_SHORT ) assert_sizeof( vertIndex_t, 2 ); int d, a; int n = numIndexes / 3; ALIGN16( float x0[4] ); ALIGN16( float x1[4] ); ALIGN16( float x2[4] ); __asm { push ebx mov eax, n shl eax, 4 mov esi, verts mov edi, indexes mov edx, planes add edx, eax neg eax mov d, edx add eax, 4*16 jge done4 loopPlane4: mov a, eax movzx ecx, word ptr [edi+0*6+0] shl ecx, SHADOWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+1*6+0] shl ebx, SHADOWVERT_SIZE_SHIFT movzx edx, word ptr [edi+2*6+0] shl edx, SHADOWVERT_SIZE_SHIFT movzx eax, word ptr [edi+3*6+0] shl eax, SHADOWVERT_SIZE_SHIFT movlps xmm4, [esi+ecx+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ movzx ecx, word ptr [edi+0*6+2] shl ecx, SHADOWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+1*6+2] shl ebx, SHADOWVERT_SIZE_SHIFT movzx edx, word ptr [edi+2*6+2] shl edx, SHADOWVERT_SIZE_SHIFT movzx eax, word ptr [edi+3*6+2] shl eax, SHADOWVERT_SIZE_SHIFT movaps x0, xmm3 movaps x1, xmm4 movaps x2, xmm5 movlps xmm1, [esi+ecx+0] /* xmm1 = 0, 1, X, X */ movss xmm2, [esi+ecx+8] /* xmm2 = 2, X, X, X */ movhps xmm1, [esi+ebx+0] /* xmm1 = 0, 1, 4, 5 */ movhps xmm2, [esi+ebx+8] /* xmm2 = 2, X, 6, X */ movlps xmm6, [esi+edx+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+8] /* xmm7 = 10, X, 14, X */ movaps xmm0, xmm1 /* xmm0 = 0, 1, 4, 5 */ shufps xmm0, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm0 = 0, 4, 8, 12 */ shufps xmm1, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm1 = 1, 5, 9, 13 */ shufps xmm2, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm2 = 2, 6, 10, 14 */ movzx ecx, word ptr [edi+0*6+4] shl ecx, SHADOWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+1*6+4] shl ebx, SHADOWVERT_SIZE_SHIFT movzx edx, word ptr [edi+2*6+4] shl edx, SHADOWVERT_SIZE_SHIFT movzx eax, word ptr [edi+3*6+4] shl eax, SHADOWVERT_SIZE_SHIFT subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 movlps xmm4, [esi+ecx+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+8] /* xmm7 =
10, X, X, X */ movhps xmm6, [esi+eax+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ mov eax, a mov edx, d add edi, 4*6 subps xmm3, x0 subps xmm4, x1 subps xmm5, x2 movaps xmm6, xmm4 mulps xmm6, xmm2 movaps xmm7, xmm5 mulps xmm7, xmm1 subps xmm6, xmm7 mulps xmm5, xmm0 mulps xmm2, xmm3 subps xmm5, xmm2 mulps xmm3, xmm1 mulps xmm4, xmm0 subps xmm3, xmm4 add eax, 4*16 movaps xmm0, xmm6 mulps xmm6, xmm6 movaps xmm1, xmm5 mulps xmm5, xmm5 movaps xmm2, xmm3 mulps xmm3, xmm3 addps xmm3, xmm5 addps xmm3, xmm6 rsqrtps xmm3, xmm3 mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 movaps xmm4, x0 movaps xmm5, x1 movaps xmm6, x2 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 addps xmm4, xmm5 addps xmm4, xmm6 xorps xmm4, SIMD_SP_signBit // transpose xmm0, xmm1, xmm2, xmm4 to memory movaps xmm7, xmm0 movaps xmm5, xmm2 unpcklps xmm0, xmm1 unpcklps xmm2, xmm4 movlps [edx+eax-8*16+0], xmm0 movlps [edx+eax-8*16+8], xmm2 movhps [edx+eax-7*16+0], xmm0 movhps [edx+eax-7*16+8], xmm2 unpckhps xmm7, xmm1 unpckhps xmm5, xmm4 movlps [edx+eax-6*16+0], xmm7 movlps [edx+eax-6*16+8], xmm5 movhps [edx+eax-5*16+0], xmm7 movhps [edx+eax-5*16+8], xmm5 jle loopPlane4 done4: sub eax, 4*16 jge done loopPlane1: movzx ecx, word ptr [edi+0] shl ecx, SHADOWVERT_SIZE_SHIFT movzx ebx, word ptr [edi+2] shl ebx, SHADOWVERT_SIZE_SHIFT movzx edx, word ptr [edi+4] shl edx, SHADOWVERT_SIZE_SHIFT movss xmm0, [esi+ebx+0] subss xmm0, [esi+ecx+0] movss xmm1, [esi+ebx+4] subss xmm1, [esi+ecx+4] movss xmm2, [esi+ebx+8] subss xmm2, [esi+ecx+8] movss xmm3, [esi+edx+0] subss xmm3, [esi+ecx+0] movss xmm4, [esi+edx+4] subss xmm4, [esi+ecx+4] movss xmm5, [esi+edx+8] subss xmm5, [esi+ecx+8] movss xmm6, xmm4 mulss xmm6, xmm2 movss xmm7, xmm5 mulss xmm7, xmm1 subss xmm6, xmm7 add edi, 1*6 mulss xmm5, xmm0 mulss xmm2, xmm3 subss xmm5, xmm2 mulss xmm3, xmm1 mulss xmm4, xmm0 subss xmm3, xmm4 mov edx, d movss xmm0, xmm6 mulss xmm6, xmm6 movss xmm1, xmm5 mulss xmm5, xmm5 movss xmm2, xmm3 mulss xmm3, xmm3 add eax, 1*16 addss xmm3, xmm5 addss xmm3, xmm6 rsqrtss xmm3, xmm3 mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 movss [edx+eax-1*16+0], xmm0 movss [edx+eax-1*16+4], xmm1 movss [edx+eax-1*16+8], xmm2 mulss xmm0, [esi+ecx+0] mulss xmm1, [esi+ecx+4] mulss xmm2, [esi+ecx+8] xorps xmm0, SIMD_SP_firstSignBit subss xmm0, xmm1 subss xmm0, xmm2 movss [edx+eax-1*16+12], xmm0 jl loopPlane1 done: pop ebx } #elif defined ( GL_INDEX_INT ) assert_sizeof( vertIndex_t, 4 ); int d, a; int n = numIndexes / 3; ALIGN16( float x0[4] ); ALIGN16( float x1[4] ); ALIGN16( float x2[4] ); __asm { push ebx mov eax, n shl eax, 4 mov esi, verts mov edi, indexes mov edx, planes add edx, eax neg eax mov d, edx add eax, 4*16 jge done4 loopPlane4: mov a, eax mov ecx, dword ptr [edi+0*12+0] shl ecx, SHADOWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+1*12+0] shl ebx, SHADOWVERT_SIZE_SHIFT mov edx, dword ptr [edi+2*12+0] shl edx, SHADOWVERT_SIZE_SHIFT mov eax, dword ptr [edi+3*12+0] shl eax, SHADOWVERT_SIZE_SHIFT movlps xmm4, [esi+ecx+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+8] /* xmm7 = 10, X, X, X */ movhps
xmm6, [esi+eax+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ mov ecx, dword ptr [edi+0*12+4] shl ecx, SHADOWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+1*12+4] shl ebx, SHADOWVERT_SIZE_SHIFT mov edx, dword ptr [edi+2*12+4] shl edx, SHADOWVERT_SIZE_SHIFT mov eax, dword ptr [edi+3*12+4] shl eax, SHADOWVERT_SIZE_SHIFT movaps x0, xmm3 movaps x1, xmm4 movaps x2, xmm5 movlps xmm1, [esi+ecx+0] /* xmm1 = 0, 1, X, X */ movss xmm2, [esi+ecx+8] /* xmm2 = 2, X, X, X */ movhps xmm1, [esi+ebx+0] /* xmm1 = 0, 1, 4, 5 */ movhps xmm2, [esi+ebx+8] /* xmm2 = 2, X, 6, X */ movlps xmm6, [esi+edx+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+8] /* xmm7 = 10, X, 14, X */ movaps xmm0, xmm1 /* xmm0 = 0, 1, 4, 5 */ shufps xmm0, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm0 = 0, 4, 8, 12 */ shufps xmm1, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm1 = 1, 5, 9, 13 */ shufps xmm2, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm2 = 2, 6, 10, 14 */ mov ecx, dword ptr [edi+0*12+8] shl ecx, SHADOWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+1*12+8] shl ebx, SHADOWVERT_SIZE_SHIFT mov edx, dword ptr [edi+2*12+8] shl edx, SHADOWVERT_SIZE_SHIFT mov eax, dword ptr [edi+3*12+8] shl eax, SHADOWVERT_SIZE_SHIFT subps xmm0, xmm3 subps xmm1, xmm4 subps xmm2, xmm5 movlps xmm4, [esi+ecx+0] /* xmm4 = 0, 1, X, X */ movss xmm5, [esi+ecx+8] /* xmm5 = 2, X, X, X */ movhps xmm4, [esi+ebx+0] /* xmm4 = 0, 1, 4, 5 */ movhps xmm5, [esi+ebx+8] /* xmm5 = 2, X, 6, X */ movlps xmm6, [esi+edx+0] /* xmm6 = 8, 9, X, X */ movss xmm7, [esi+edx+8] /* xmm7 = 10, X, X, X */ movhps xmm6, [esi+eax+0] /* xmm6 = 8, 9, 12, 13 */ movhps xmm7, [esi+eax+8] /* xmm7 = 10, X, 14, X */ movaps xmm3, xmm4 /* xmm3 = 0, 1, 4, 5 */ shufps xmm3, xmm6, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm3 = 0, 4, 8, 12 */ shufps xmm4, xmm6, R_SHUFFLE_PS( 1, 3, 1, 3 ) /* xmm4 = 1, 5, 9, 13 */ shufps xmm5, xmm7, R_SHUFFLE_PS( 0, 2, 0, 2 ) /* xmm5 = 2, 6, 10, 14 */ mov eax, a mov edx, d add edi, 4*12 subps xmm3, x0 subps xmm4, x1 subps xmm5, x2 movaps xmm6, xmm4 mulps xmm6, xmm2 movaps xmm7, xmm5 mulps xmm7, xmm1 subps xmm6, xmm7 mulps xmm5, xmm0 mulps xmm2, xmm3 subps xmm5, xmm2 mulps xmm3, xmm1 mulps xmm4, xmm0 subps xmm3, xmm4 add eax, 4*16 movaps xmm0, xmm6 mulps xmm6, xmm6 movaps xmm1, xmm5 mulps xmm5, xmm5 movaps xmm2, xmm3 mulps xmm3, xmm3 addps xmm3, xmm5 addps xmm3, xmm6 rsqrtps xmm3, xmm3 mulps xmm0, xmm3 mulps xmm1, xmm3 mulps xmm2, xmm3 movaps xmm4, x0 movaps xmm5, x1 movaps xmm6, x2 mulps xmm4, xmm0 mulps xmm5, xmm1 mulps xmm6, xmm2 addps xmm4, xmm5 addps xmm4, xmm6 xorps xmm4, SIMD_SP_signBit // transpose xmm0, xmm1, xmm2, xmm4 to memory movaps xmm7, xmm0 movaps xmm5, xmm2 unpcklps xmm0, xmm1 unpcklps xmm2, xmm4 movlps [edx+eax-8*16+0], xmm0 movlps [edx+eax-8*16+8], xmm2 movhps [edx+eax-7*16+0], xmm0 movhps [edx+eax-7*16+8], xmm2 unpckhps xmm7, xmm1 unpckhps xmm5, xmm4 movlps [edx+eax-6*16+0], xmm7 movlps [edx+eax-6*16+8], xmm5 movhps [edx+eax-5*16+0], xmm7 movhps [edx+eax-5*16+8], xmm5 jle loopPlane4 done4: sub eax, 4*16 jge done loopPlane1: mov ecx, dword ptr [edi+0] shl ecx, SHADOWVERT_SIZE_SHIFT mov ebx, dword ptr [edi+4] shl ebx, SHADOWVERT_SIZE_SHIFT mov edx, dword ptr [edi+8] shl edx, SHADOWVERT_SIZE_SHIFT movss xmm0,
[esi+ebx+0] subss xmm0, [esi+ecx+0] movss xmm1, [esi+ebx+4] subss xmm1, [esi+ecx+4] movss xmm2, [esi+ebx+8] subss xmm2, [esi+ecx+8] movss xmm3, [esi+edx+0] subss xmm3, [esi+ecx+0] movss xmm4, [esi+edx+4] subss xmm4, [esi+ecx+4] movss xmm5, [esi+edx+8] subss xmm5, [esi+ecx+8] movss xmm6, xmm4 mulss xmm6, xmm2 movss xmm7, xmm5 mulss xmm7, xmm1 subss xmm6, xmm7 add edi, 1*12 mulss xmm5, xmm0 mulss xmm2, xmm3 subss xmm5, xmm2 mulss xmm3, xmm1 mulss xmm4, xmm0 subss xmm3, xmm4 mov edx, d movss xmm0, xmm6 mulss xmm6, xmm6 movss xmm1, xmm5 mulss xmm5, xmm5 movss xmm2, xmm3 mulss xmm3, xmm3 add eax, 1*16 addss xmm3, xmm5 addss xmm3, xmm6 rsqrtss xmm3, xmm3 mulss xmm0, xmm3 mulss xmm1, xmm3 mulss xmm2, xmm3 movss [edx+eax-1*16+0], xmm0 movss [edx+eax-1*16+4], xmm1 movss [edx+eax-1*16+8], xmm2 mulss xmm0, [esi+ecx+0] mulss xmm1, [esi+ecx+4] mulss xmm2, [esi+ecx+8] xorps xmm0, SIMD_SP_firstSignBit subss xmm0, xmm1 subss xmm0, xmm2 movss [edx+eax-1*16+12], xmm0 jl loopPlane1 done: pop ebx } #else int i, j; for ( i = 0; i <= numIndexes - 12; i += 12 ) { ALIGN16( float d0[4]; ) ALIGN16( float d1[4]; ) ALIGN16( float d2[4]; ) ALIGN16( float d3[4]; ) ALIGN16( float d4[4]; ) ALIGN16( float d5[4]; ) ALIGN16( float n0[4]; ) ALIGN16( float n1[4]; ) ALIGN16( float n2[4]; ) for ( j = 0; j < 4; j++ ) { const shadowCache_t *a, *b, *c; a = verts + indexes[i + j * 3 + 0]; b = verts + indexes[i + j * 3 + 1]; c = verts + indexes[i + j * 3 + 2]; d0[j] = b->xyz[0] - a->xyz[0]; d1[j] = b->xyz[1] - a->xyz[1]; d2[j] = b->xyz[2] - a->xyz[2]; d3[j] = c->xyz[0] - a->xyz[0]; d4[j] = c->xyz[1] - a->xyz[1]; d5[j] = c->xyz[2] - a->xyz[2]; } ALIGN16( float tmp[4]; ) n0[0] = d4[0] * d2[0]; n0[1] = d4[1] * d2[1]; n0[2] = d4[2] * d2[2]; n0[3] = d4[3] * d2[3]; n0[0] -= d5[0] * d1[0]; n0[1] -= d5[1] * d1[1]; n0[2] -= d5[2] * d1[2]; n0[3] -= d5[3] * d1[3]; n1[0] = d5[0] * d0[0]; n1[1] = d5[1] * d0[1]; n1[2] = d5[2] * d0[2]; n1[3] = d5[3] * d0[3]; n1[0] -= d3[0] * d2[0]; n1[1] -= d3[1] * d2[1]; n1[2] -= d3[2] * d2[2]; n1[3] -= d3[3] * d2[3]; n2[0] = d3[0] * d1[0]; n2[1] = d3[1] * d1[1]; n2[2] = d3[2] * d1[2]; n2[3] = d3[3] * d1[3]; n2[0] -= d4[0] * d0[0]; n2[1] -= d4[1] * d0[1]; n2[2] -= d4[2] * d0[2]; n2[3] -= d4[3] * d0[3]; tmp[0] = n0[0] * n0[0]; tmp[1] = n0[1] * n0[1]; tmp[2] = n0[2] * n0[2]; tmp[3] = n0[3] * n0[3]; tmp[0] += n1[0] * n1[0]; tmp[1] += n1[1] * n1[1]; tmp[2] += n1[2] * n1[2]; tmp[3] += n1[3] * n1[3]; tmp[0] += n2[0] * n2[0]; tmp[1] += n2[1] * n2[1]; tmp[2] += n2[2] * n2[2]; tmp[3] += n2[3] * n2[3]; tmp[0] = idMath::RSqrt( tmp[0] ); tmp[1] = idMath::RSqrt( tmp[1] ); tmp[2] = idMath::RSqrt( tmp[2] ); tmp[3] = idMath::RSqrt( tmp[3] ); n0[0] *= tmp[0]; n0[1] *= tmp[1]; n0[2] *= tmp[2]; n0[3] *= tmp[3]; n1[0] *= tmp[0]; n1[1] *= tmp[1]; n1[2] *= tmp[2]; n1[3] *= tmp[3]; n2[0] *= tmp[0]; n2[1] *= tmp[1]; n2[2] *= tmp[2]; n2[3] *= tmp[3]; for ( j = 0; j < 4; j++ ) { const shadowCache_t *a; a = verts + indexes[i + j * 3]; planes->Normal()[0] = n0[j]; planes->Normal()[1] = n1[j]; planes->Normal()[2] = n2[j]; planes->FitThroughPoint( a->xyz.ToVec3() ); planes++; } } for ( ; i < numIndexes; i += 3 ) { const shadowCache_t *a, *b, *c; float d0, d1, d2, d3, d4, d5; float n0, n1, n2; a = verts + indexes[i + 0]; b = verts + indexes[i + 1]; c = verts + indexes[i + 2]; d0 = b->xyz[0] - a->xyz[0]; d1 = b->xyz[1] - a->xyz[1]; d2 = b->xyz[2] - a->xyz[2]; d3 = c->xyz[0] - a->xyz[0]; d4 = c->xyz[1] - a->xyz[1]; d5 = c->xyz[2] - a->xyz[2]; float tmp; n0 = d4 * d2 - d5 * d1; n1 = d5 * d0 - d3 * d2; n2 = d3 * d1 - d4 * d0; tmp = idMath::RSqrt( 
n0 * n0 + n1 * n1 + n2 * n2 ); n0 *= tmp; n1 *= tmp; n2 *= tmp; planes->Normal()[0] = n0; planes->Normal()[1] = n1; planes->Normal()[2] = n2; planes->FitThroughPoint( a->xyz.ToVec3() ); planes++; } #endif } /* ============ idSIMD_SSE::CalculateFacing ============ */ void VPCALL idSIMD_SSE::CalculateFacing( byte *facing, const idPlane *planes, const int numTriangles, const idVec4 &light ) { #if 1 assert_16_byte_aligned( planes ); __asm { mov esi, light mov edi, facing mov ecx, numTriangles shl ecx, 4 movlps xmm4, [esi+0] shufps xmm4, xmm4, R_SHUFFLE_PS( 0, 1, 0, 1 ) movlps xmm5, [esi+8] shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 1, 0, 1 ) xorps xmm7, xmm7 mov esi, planes add esi, ecx neg ecx cmp ecx, -16*4 jg start1 loop4: movlps xmm0, [esi+ecx+ 0] movlps xmm1, [esi+ecx+ 8] movhps xmm0, [esi+ecx+16] movhps xmm1, [esi+ecx+24] movlps xmm2, [esi+ecx+32] movlps xmm3, [esi+ecx+40] movhps xmm2, [esi+ecx+48] movhps xmm3, [esi+ecx+56] add edi, 1*4 add ecx, 16*4 mulps xmm0, xmm4 mulps xmm1, xmm5 addps xmm0, xmm1 // y1+w1 x1+z1 y0+w0 x0+z0 mulps xmm2, xmm4 mulps xmm3, xmm5 addps xmm2, xmm3 // y3+w3 x3+z3 y2+w2 x2+z2 movaps xmm6, xmm0 shufps xmm0, xmm2, R_SHUFFLE_PS( 0, 2, 0, 2 ) // x3+z3 x2+z2 x1+z1 x0+z0 shufps xmm6, xmm2, R_SHUFFLE_PS( 1, 3, 1, 3 ) // y3+w3 y2+w2 y1+w1 y0+w0 addps xmm6, xmm0 cmpnleps xmm6, SIMD_SP_zero andps xmm6, SIMD_DW_facing_mask movhlps xmm7, xmm6 orps xmm6, xmm7 movaps xmm7, xmm6 shufps xmm7, xmm7, R_SHUFFLE_PS( 1, 0, 1, 0 ) orps xmm6, xmm7 movss [edi-1*4], xmm6 jl loop4 start1: shufps xmm4, xmm5, R_SHUFFLE_PS( 0, 1, 0, 1 ) test ecx, ecx jz done loop1: movaps xmm0, [esi+ecx] add edi, 1 mulps xmm0, xmm4 movhlps xmm1, xmm0 addps xmm0, xmm1 movaps xmm1, xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 1, 0, 0 ) addss xmm0, xmm1 cmpnless xmm0, xmm7 movmskps eax, xmm0 and eax, 1 mov byte ptr [edi-1], al add ecx, 16 jl loop1 done: mov byte ptr [edi], 1 // for dangling edges to reference } #else int i; for ( i = 0; i < numTriangles; i++ ) { facing[i] = planes[i][0] * light.x + planes[i][1] * light.y + planes[i][2] * light.z + planes[i][3] * light.w > 0.0f; } facing[numTriangles] = 1; // for dangling edges to reference #endif } /* ============ idSIMD_SSE::CalculateCullBits ============ */ void VPCALL idSIMD_SSE::CalculateCullBits( byte *cullBits, const idDrawVert *verts, const int numVerts, const int frontBits, const idPlane lightPlanes[NUM_LIGHT_PLANES] ) { #if 1 int i; assert( NUM_LIGHT_PLANES <= sizeof( cullBits[0] ) * 8 ); memset( cullBits, 0, numVerts * sizeof( cullBits[0] ) ); float *planeSide = (float *) _alloca16( numVerts * sizeof( float ) ); for ( i = 0; i < NUM_LIGHT_PLANES; i++ ) { // if completely in front of this clipping plane if ( frontBits & ( 1 << i ) ) { continue; } Dot( planeSide, lightPlanes[i], verts, numVerts ); CmpLT( cullBits, i, planeSide, 0.0f, numVerts ); } #else int i, j; assert( NUM_LIGHT_PLANES <= sizeof( cullBits[0] ) * 8 ); memset( cullBits, 0, numVerts * sizeof( cullBits[0] ) ); for ( i = 0; i < NUM_LIGHT_PLANES; i++ ) { // if completely in front of this clipping plane if ( frontBits & ( 1 << i ) ) { continue; } const idPlane &plane = lightPlanes[i]; for ( j = 0; j < numVerts; j++ ) { int bit = plane[0] * verts[j].xyz.x + plane[1] * verts[j].xyz.y + plane[2] * verts[j].xyz.z + plane[3] < 0.0f; cullBits[j] |= bit << i; } } #endif } /* ============ idSIMD_SSE::CreateShadowCache ============ */ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) { #if 1 assert_16_byte_aligned( vertexCache );
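/*
	Layout note: each source vertex is written to the cache twice as a
	homogeneous idVec4, entry 2*i+0 = ( x, y, z, 1 ) and entry 2*i+1 = ( x, y, z, 0 );
	SIMD_SP_clearLast masks off w and SIMD_SP_lastOne ors in w = 1.0f. The w = 0
	copy is the one the shadow projection can extrude away from the light, which
	is why the function returns numVerts * 2. Scalar equivalent (see the #else
	branch below):

		vertexCache[i*2+0] = idVec4( v[0], v[1], v[2], 1.0f );
		vertexCache[i*2+1] = idVec4( v[0], v[1], v[2], 0.0f );
*/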
assert_16_byte_aligned( verts ); assert_16_byte_aligned( DRAWVERT_SIZE ); assert_16_byte_aligned( DRAWVERT_XYZ_OFFSET ); __asm { movaps xmm4, SIMD_SP_clearLast movaps xmm5, SIMD_SP_lastOne movaps xmm6, xmm4 movaps xmm7, xmm5 mov esi, verts mov edi, vertexCache mov eax, numVerts and eax, ~3 jz done4 shl eax, 5 add edi, eax neg eax loop4: prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] movaps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] andps xmm0, xmm4 movaps [edi+eax+1*16], xmm0 orps xmm0, xmm5 movaps [edi+eax+0*16], xmm0 movaps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] andps xmm1, xmm6 movaps [edi+eax+3*16], xmm1 orps xmm1, xmm7 movaps [edi+eax+2*16], xmm1 movaps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] andps xmm2, xmm4 movaps [edi+eax+5*16], xmm2 orps xmm2, xmm5 movaps [edi+eax+4*16], xmm2 movaps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] andps xmm3, xmm6 movaps [edi+eax+7*16], xmm3 orps xmm3, xmm7 movaps [edi+eax+6*16], xmm3 add esi, 4*DRAWVERT_SIZE add eax, 4*8*4 jl loop4 done4: mov eax, numVerts and eax, 3 jz done1 shl eax, 5 add edi, eax neg eax loop1: movaps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] andps xmm0, xmm4 movaps [edi+eax+1*16], xmm0 orps xmm0, xmm5 movaps [edi+eax+0*16], xmm0 add esi, DRAWVERT_SIZE add eax, 8*4 jl loop1 done1: } return numVerts * 2; #else for ( int i = 0; i < numVerts; i++ ) { const float *v = verts[i].xyz.ToFloatPtr(); vertexCache[i*2+0][0] = v[0]; vertexCache[i*2+0][1] = v[1]; vertexCache[i*2+0][2] = v[2]; vertexCache[i*2+0][3] = 1.0f; vertexCache[i*2+1][0] = v[0]; vertexCache[i*2+1][1] = v[1]; vertexCache[i*2+1][2] = v[2]; vertexCache[i*2+1][3] = 0.0f; } return numVerts * 2; #endif } /* ============ idSIMD_SSE::CreateShadowCache ============ */ int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, const shadowCache_t *verts, const int numVerts ) { #if 1 assert_16_byte_aligned( vertexCache ); assert_16_byte_aligned( verts ); assert_16_byte_aligned( SHADOWVERT_SIZE ); __asm { movaps xmm4, SIMD_SP_clearLast movaps xmm5, SIMD_SP_lastOne movaps xmm6, xmm4 movaps xmm7, xmm5 mov esi, verts mov edi, vertexCache mov eax, numVerts and eax, ~3 jz done4 shl eax, 5 add edi, eax neg eax loop4: prefetchnta [esi+4*SHADOWVERT_SIZE] movaps xmm0, [esi+0*SHADOWVERT_SIZE] andps xmm0, xmm4 movaps [edi+eax+1*16], xmm0 orps xmm0, xmm5 movaps [edi+eax+0*16], xmm0 movaps xmm1, [esi+1*SHADOWVERT_SIZE] andps xmm1, xmm6 movaps [edi+eax+3*16], xmm1 orps xmm1, xmm7 movaps [edi+eax+2*16], xmm1 movaps xmm2, [esi+2*SHADOWVERT_SIZE] andps xmm2, xmm4 movaps [edi+eax+5*16], xmm2 orps xmm2, xmm5 movaps [edi+eax+4*16], xmm2 movaps xmm3, [esi+3*SHADOWVERT_SIZE] andps xmm3, xmm6 movaps [edi+eax+7*16], xmm3 orps xmm3, xmm7 movaps [edi+eax+6*16], xmm3 add esi, 4*SHADOWVERT_SIZE add eax, 4*8*4 jl loop4 done4: mov eax, numVerts and eax, 3 jz done1 shl eax, 5 add edi, eax neg eax loop1: movaps xmm0, [esi+0*SHADOWVERT_SIZE] andps xmm0, xmm4 movaps [edi+eax+1*16], xmm0 orps xmm0, xmm5 movaps [edi+eax+0*16], xmm0 add esi, SHADOWVERT_SIZE add eax, 8*4 jl loop1 done1: } return numVerts * 2; #else for ( int i = 0; i < numVerts; i++ ) { const float *v = verts[i].xyz.ToFloatPtr(); vertexCache[i*2+0][0] = v[0]; vertexCache[i*2+0][1] = v[1]; vertexCache[i*2+0][2] = v[2]; vertexCache[i*2+0][3] = 1.0f; vertexCache[i*2+1][0] = v[0]; vertexCache[i*2+1][1] = v[1]; vertexCache[i*2+1][2] = v[2]; vertexCache[i*2+1][3] = 0.0f; } return numVerts * 2; #endif } #if 0 /* ============ idSIMD_SSE::ShadowVolume_CreateSilTriangles ============ */ #if defined ( GL_INDEX_SHORT ) int 
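/*
	Sketch of the branch-free silhouette emit this routine implements (see the
	scalar fallback at the end of the function): f1 and f2 are the facing flags
	(0 or 1) of the two triangles sharing a sil edge, and v1/v2 index the edge
	verts in the doubled shadow cache (even = near copy, odd = far copy).
	t = -( f1 ^ f2 ) & 6 is 6 when exactly one of the triangles faces the light
	and 0 otherwise; the six indexes of the extruded quad are always written,
	but the write pointer only advances when the edge is a true silhouette:

		byte t = -( f1 ^ f2 ) & 6;
		si[0] = v1;      si[1] = v2 + f1; si[2] = v2 + f2;
		si[3] = v1 + f2; si[4] = v1 + f1; si[5] = v2 + 1;
		si += t;
*/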
VPCALL idSIMD_SSE::ShadowVolume_CreateSilTriangles( vertIndex_t *shadowIndexes, const byte *facing, const silEdge_t *silEdges, const int numSilEdges ) { #if 1 assert_sizeof( vertIndex_t, 2 ); int num; __asm { push ebx mov eax, numSilEdges mov ebx, shadowIndexes mov esi, facing mov edi, silEdges shl eax, 3 jz done add edi, eax neg eax shr ebx, 2 add eax, 4*8 jg run1 loop4: movzx ecx, word ptr [edi+eax-4*8+0] movzx ecx, byte ptr [esi+ecx] movd mm2, ecx movzx edx, word ptr [edi+eax-4*8+2] movzx edx, byte ptr [esi+edx] pinsrw mm2, edx, 1 movd mm0, dword ptr [edi+eax-4*8+4] pshufw mm1, mm2, R_SHUFFLE_PS( 2, 0, 1, 1 ) xor ecx, edx pshufw mm0, mm0, R_SHUFFLE_PS( 0, 1, 1, 0 ) lea edx, [ecx*2+ecx] pxor mm0, mm1 add edx, ebx movd dword ptr [ebx*4+0], mm0 pshufw mm4, mm0, R_SHUFFLE_PS( 2, 3, 2, 3 ) movd dword ptr [ebx*4+2*2], mm4 pxor mm2, mm0 movd dword ptr [ebx*4+4*2], mm2 movzx ecx, word ptr [edi+eax-3*8+0] movzx ecx, byte ptr [esi+ecx] movd mm3, ecx movzx ebx, word ptr [edi+eax-3*8+2] movzx ebx, byte ptr [esi+ebx] pinsrw mm3, ebx, 1 movd mm0, dword ptr [edi+eax-3*8+4] pshufw mm1, mm3, R_SHUFFLE_PS( 2, 0, 1, 1 ) xor ecx, ebx pshufw mm0, mm0, R_SHUFFLE_PS( 0, 1, 1, 0 ) lea ebx, [ecx*2+ecx] pxor mm0, mm1 add ebx, edx movd dword ptr [edx*4+0], mm0 pshufw mm4, mm0, R_SHUFFLE_PS( 2, 3, 2, 3 ) movd dword ptr [edx*4+2*2], mm4 pxor mm3, mm0 movd dword ptr [edx*4+4*2], mm3 movzx ecx, word ptr [edi+eax-2*8+0] movzx ecx, byte ptr [esi+ecx] movd mm2, ecx movzx edx, word ptr [edi+eax-2*8+2] movzx edx, byte ptr [esi+edx] pinsrw mm2, edx, 1 movd mm0, dword ptr [edi+eax-2*8+4] pshufw mm1, mm2, R_SHUFFLE_PS( 2, 0, 1, 1 ) xor ecx, edx pshufw mm0, mm0, R_SHUFFLE_PS( 0, 1, 1, 0 ) lea edx, [ecx*2+ecx] pxor mm0, mm1 add edx, ebx movd dword ptr [ebx*4+0], mm0 pshufw mm4, mm0, R_SHUFFLE_PS( 2, 3, 2, 3 ) movd dword ptr [ebx*4+2*2], mm4 pxor mm2, mm0 movd dword ptr [ebx*4+4*2], mm2 movzx ecx, word ptr [edi+eax-1*8+0] movzx ecx, byte ptr [esi+ecx] movd mm3, ecx movzx ebx, word ptr [edi+eax-1*8+2] movzx ebx, byte ptr [esi+ebx] pinsrw mm3, ebx, 1 movd mm0, dword ptr [edi+eax-1*8+4] pshufw mm1, mm3, R_SHUFFLE_PS( 2, 0, 1, 1 ) xor ecx, ebx pshufw mm0, mm0, R_SHUFFLE_PS( 0, 1, 1, 0 ) lea ebx, [ecx*2+ecx] pxor mm0, mm1 add ebx, edx movd dword ptr [edx*4+0], mm0 pshufw mm4, mm0, R_SHUFFLE_PS( 2, 3, 2, 3 ) movd dword ptr [edx*4+2*2], mm4 pxor mm3, mm0 movd dword ptr [edx*4+4*2], mm3 add eax, 4*8 jle loop4 run1: sub eax, 4*8 jge done loop1: movzx ecx, word ptr [edi+eax+0] movzx ecx, byte ptr [esi+ecx] movd mm2, ecx movzx edx, word ptr [edi+eax+2] movzx edx, byte ptr [esi+edx] pinsrw mm2, edx, 1 movd mm0, dword ptr [edi+eax+4] pshufw mm1, mm2, R_SHUFFLE_PS( 2, 0, 1, 1 ) pshufw mm0, mm0, R_SHUFFLE_PS( 0, 1, 1, 0 ) pxor mm0, mm1 movd dword ptr [ebx*4+0], mm0 pshufw mm4, mm0, R_SHUFFLE_PS( 2, 3, 2, 3 ) movd dword ptr [ebx*4+2*2], mm4 pxor mm2, mm0 movd dword ptr [ebx*4+4*2], mm2 xor ecx, edx lea edx, [ecx*2+ecx] add ebx, edx add eax, 8 jl loop1 done: shl ebx, 2 mov num, ebx pop ebx } __asm emms return ( num - (int)shadowIndexes ) >> 1; #else const silEdge_t *sil, * end; vertIndex_t *si; si = shadowIndexes; end = silEdges + numSilEdges; for ( sil = silEdges; sil < end; sil++ ) { byte f1 = facing[sil->p1]; byte f2 = facing[sil->p2]; byte t = -( f1 ^ f2 ) & 6; int v1 = sil->v1; si[0] = v1; si[3] = v1 + f2; si[4] = v1 + f1; int v2 = sil->v2; si[1] = v2 + f1; si[2] = v2 + f2; si[5] = v2 + 1; si += t; } return si - shadowIndexes; #endif } #endif #if 0 #if defined ( GL_INDEX_SHORT ) ALIGN4_INIT4( unsigned short SIMD_W_capTris_c0, 0, 0, 0, 1 ); 
ALIGN4_INIT4( unsigned short SIMD_W_capTris_c1, 1, 1, 0, 0 ); ALIGN4_INIT4( unsigned short SIMD_W_capTris_c2, 0, 1, 0, 0 ); int VPCALL idSIMD_SSE::ShadowVolume_CreateCapTriangles( vertIndex_t *shadowIndexes, const byte *facing, const vertIndex_t *indexes, const int numIndexes ) { #if 1 assert_sizeof( vertIndex_t, 2 ); int num = numIndexes / 3; __asm { push ebx mov eax, numIndexes mov ebx, shadowIndexes mov esi, facing mov edi, indexes shl eax, 1 jz done add edi, eax mov eax, num add esi, eax neg eax shr ebx, 3 movq mm6, SIMD_W_capTris_c0 movq mm7, SIMD_W_capTris_c1 movq mm5, SIMD_W_capTris_c2 add eax, 2 lea edx, [eax*2+eax] jg run1 loop4: movq mm0, [edi+edx*2-4*3*2+0] // xmm0 = 0, 1, 2, 3 paddw mm0, mm0 pshufw mm1, mm0, R_SHUFFLE_PS( 2, 1, 0, 0 ) // xmm1 = 2, 1, 0, 0 movzx ecx, byte ptr [esi+eax-4] pshufw mm2, mm0, R_SHUFFLE_PS( 1, 2, 1, 2 ) // xmm2 = 1, 2, 1, 2 sub ecx, 1 pxor mm1, mm6 and ecx, 3 movd dword ptr [ebx*8+0*2], xmm1 add ecx, ebx movhps dword ptr [ebx*8+2*2], xmm1 pxor xmm2, xmm7 movd dword ptr [ebx*8+4*2], xmm2 movq mm3, [edi+edx*4-3*3*4+4] // xmm3 = 4, 5, 6, 7 paddw mm3, mm3 shufps xmm0, xmm3, R_SHUFFLE_PS( 3, 3, 1, 0 ) // xmm0 = 3 3, 5, 4 movzx ebx, byte ptr [esi+eax-3] movq mm2, mm3 // xmm2 = 4, 5, 6, 7 sub ebx, 1 pxor mm0, mm5 and ebx, 3 movhps dword ptr [ecx*8+0*4], xmm0 add ebx, ecx movd dword ptr [ecx*8+2*4], mm0 pxor mm2, mm7 movd dword ptr [ecx*8+4*4], mm2 movq mm0, [edi+edx*4-1*3*4-4] // xmm0 = 8, 9, 10, 11 paddw mm0, mm0 shufps xmm3, xmm0, R_SHUFFLE_PS( 2, 3, 0, 1 ) // xmm3 = 6, 7, 8, 9 pshufw mm1, mm3, R_SHUFFLE_PS( 2, 1, 0, 0 ) // xmm1 = 8, 7, 6, 6 movzx ecx, byte ptr [esi+eax-2] pshufw mm2, mm3, R_SHUFFLE_PS( 1, 2, 1, 2 ) // xmm2 = 7, 8, 7, 8 sub ecx, 1 pxor mm1, mm6 and ecx, 3 movd dword ptr [ebx*8+0*4], mm1 add ecx, ebx movhps dword ptr [ebx*8+2*4], xmm1 pxor xmm2, xmm7 movd dword ptr [ebx*8+4*4], mm2 pshufw mm1, mm0, R_SHUFFLE_PS( 3, 2, 1, 1 ) movzx ebx, byte ptr [esi+eax-1] pshufw mm2, mm0, R_SHUFFLE_PS( 2, 3, 2, 3 ) sub ebx, 1 pxor mm1, mm6 and ebx, 3 movd dword ptr [ecx*8+0*4], mm1 add ebx, ecx movhps dword ptr [ecx*8+2*4], xmm1 pxor mm2, mm7 movd dword ptr [ecx*8+4*4], mm2 add edx, 3*4 add eax, 4 jle loop4 run1: sub eax, 4 jge done loop1: lea edx, [eax*2+eax] movq mm0, [edi+edx*4+0] paddw mm0, mm0 pshufw mm1, mm0, R_SHUFFLE_PS( 2, 1, 0, 0 ) pshufw mm2, mm0, R_SHUFFLE_PS( 1, 2, 1, 2 ) pxor mm1, mm6 movd dword ptr [ebx*8+0*4], mm1 pxor mm2, mm7 movhps qword ptr [ebx*8+2*4], xmm1 movzx ecx, byte ptr [esi+eax] movd dword ptr [ebx*8+4*4], mm2 sub ecx, 1 and ecx, 3 add ebx, ecx add eax, 1 jl loop1 done: shl ebx, 3 mov num, ebx pop ebx } __asm emms; return ( num - (int)shadowIndexes ) >> 2; #else int i, j; vertIndex_t *si; si = shadowIndexes; for ( i = 0, j = 0; i < numIndexes - 3*4; i += 4*3, j += 4 ) { byte t0 = ( facing[j+0] - 1 ) & 6; int i0 = indexes[i+0*3+0] << 1; int i1 = indexes[i+0*3+1] << 1; int i2 = indexes[i+0*3+2] << 1; si[0] = i2; si[1] = i1; si[2] = i0; si[3] = i0 + 1; si[4] = i1 + 1; si[5] = i2 + 1; si += t0; byte t1 = ( facing[j+1] - 1 ) & 6; int i3 = indexes[i+1*3+0] << 1; int i4 = indexes[i+1*3+1] << 1; int i5 = indexes[i+1*3+2] << 1; si[0] = i5; si[1] = i4; si[2] = i3; si[3] = i3 + 1; si[4] = i4 + 1; si[5] = i5 + 1; si += t1; byte t2 = ( facing[j+2] - 1 ) & 6; int i6 = indexes[i+2*3+0] << 1; int i7 = indexes[i+2*3+1] << 1; int i8 = indexes[i+2*3+2] << 1; si[0] = i8; si[1] = i7; si[2] = i6; si[3] = i6 + 1; si[4] = i7 + 1; si[5] = i8 + 1; si += t2; byte t3 = ( facing[j+3] - 1 ) & 6; int i9 = indexes[i+3*3+0] << 1; int i10 = indexes[i+3*3+1] << 
1; int i11 = indexes[i+3*3+2] << 1; si[0] = i11; si[1] = i10; si[2] = i9; si[3] = i9 + 1; si[4] = i10 + 1; si[5] = i11 + 1; si += t3; } for ( ; i < numIndexes; i += 3, j++ ) { byte t = ( facing[j] - 1 ) & 6; int i0 = indexes[i+0] << 1; int i1 = indexes[i+1] << 1; int i2 = indexes[i+2] << 1; si[0] = i2; si[1] = i1; si[2] = i0; si[3] = i0 + 1; si[4] = i1 + 1; si[5] = i2 + 1; si += t; } return si - shadowIndexes; #endif } #endif #endif #endif /* ============ SSE_UpSample11kHzMonoPCMTo44kHz ============ */ static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) { __asm { mov esi, src mov edi, dest mov eax, numSamples and eax, ~1 jz done2 shl eax, 1 add esi, eax neg eax align 16 loop2: add edi, 2*4*4 movsx ecx, word ptr [esi+eax+0] cvtsi2ss xmm0, ecx shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi-2*4*4+0], xmm0 movhps [edi-2*4*4+8], xmm0 movsx edx, word ptr [esi+eax+2] cvtsi2ss xmm1, edx shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi-1*4*4+0], xmm1 movhps [edi-1*4*4+8], xmm1 add eax, 2*2 jl loop2 done2: mov eax, numSamples and eax, 1 jz done movsx ecx, word ptr [esi] cvtsi2ss xmm0, ecx shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi+0], xmm0 movhps [edi+8], xmm0 done: } } /* ============ SSE_UpSample11kHzStereoPCMTo44kHz ============ */ static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) { __asm { mov esi, src mov edi, dest mov eax, numSamples test eax, ~1 jz done2 shl eax, 1 add esi, eax neg eax align 16 loop2: add edi, 8*4 movsx ecx, word ptr [esi+eax+0] cvtsi2ss xmm0, ecx movsx edx, word ptr [esi+eax+2] cvtsi2ss xmm1, edx unpcklps xmm0, xmm1 movlps [edi-8*4+0], xmm0 movlps [edi-8*4+8], xmm0 movlps [edi-4*4+0], xmm0 movlps [edi-4*4+8], xmm0 add eax, 2*2 jl loop2 done2: } } /* ============ SSE_UpSample22kHzMonoPCMTo44kHz ============ */ static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) { __asm { mov esi, src mov edi, dest mov eax, numSamples and eax, ~1 jz done2 shl eax, 1 add esi, eax neg eax align 16 loop2: add edi, 4*4 movsx ecx, word ptr [esi+eax+0] cvtsi2ss xmm0, ecx movsx edx, word ptr [esi+eax+2] cvtsi2ss xmm1, edx shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi-4*4+0], xmm0 movhps [edi-4*4+8], xmm0 add eax, 2*2 jl loop2 done2: mov eax, numSamples and eax, 1 jz done movsx ecx, word ptr [esi] cvtsi2ss xmm0, ecx shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi], xmm0 done: } } /* ============ SSE_UpSample22kHzStereoPCMTo44kHz ============ */ static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) { __asm { mov esi, src mov edi, dest mov eax, numSamples test eax, ~1 jz done2 shl eax, 1 add esi, eax neg eax align 16 loop2: add edi, 4*4 movsx ecx, word ptr [esi+eax+0] cvtsi2ss xmm0, ecx movss [edi-4*4], xmm0 movss [edi-2*4], xmm0 movsx edx, word ptr [esi+eax+2] cvtsi2ss xmm1, edx movss [edi-3*4], xmm1 movss [edi-1*4], xmm1 add eax, 2*2 jl loop2 done2: } } /* ============ SSE_UpSample44kHzMonoPCMTo44kHz ============ */ static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) { __asm { mov esi, src mov edi, dest mov eax, numSamples and eax, ~1 jz done2 shl eax, 1 add esi, eax neg eax align 16 loop2: add edi, 2*4 movsx ecx, word ptr [esi+eax+0] cvtsi2ss xmm0, ecx movss [edi-2*4], xmm0 movsx edx, word ptr [esi+eax+2] cvtsi2ss xmm1, edx movss [edi-1*4], xmm1 add eax, 2*2 jl loop2 done2: mov eax, numSamples and eax, 1 jz 
done movsx ecx, word ptr [esi] cvtsi2ss xmm0, ecx movss [edi], xmm0 done: } } /* ============ idSIMD_SSE::UpSamplePCMTo44kHz Duplicate samples for 44kHz output. ============ */ void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) { if ( kHz == 11025 ) { if ( numChannels == 1 ) { SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples ); } else { SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples ); } } else if ( kHz == 22050 ) { if ( numChannels == 1 ) { SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples ); } else { SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples ); } } else if ( kHz == 44100 ) { SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples ); } else { assert( 0 ); } } /* ============ SSE_UpSample11kHzMonoOGGTo44kHz ============ */ static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) { float constant = 32768.0f; __asm { mov esi, src mov edi, dest movss xmm7, constant shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov eax, numSamples and eax, ~1 jz done2 shl eax, 2 add esi, eax neg eax align 16 loop2: add edi, 2*16 movss xmm0, [esi+eax+0] mulss xmm0, xmm7 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi-32], xmm0 movlps [edi-24], xmm0 movss xmm1, [esi+eax+4] mulss xmm1, xmm7 shufps xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi-16], xmm1 movlps [edi- 8], xmm1 add eax, 2*4 jl loop2 done2: mov eax, numSamples and eax, 1 jz done movss xmm0, [esi] mulss xmm0, xmm7 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi+0], xmm0 movlps [edi+8], xmm0 done: } } /* ============ SSE_UpSample11kHzStereoOGGTo44kHz ============ */ static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) { float constant = 32768.0f; __asm { mov esi, src mov ecx, [esi+0] mov edx, [esi+4] mov edi, dest movss xmm7, constant shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov eax, numSamples and eax, ~1 jz done2 shl eax, 1 add ecx, eax add edx, eax neg eax align 16 loop2: add edi, 4*16 movlps xmm0, [ecx+eax] movlps xmm1, [edx+eax] unpcklps xmm0, xmm1 mulps xmm0, xmm7 movlps [edi-8*8], xmm0 movlps [edi-7*8], xmm0 movlps [edi-6*8], xmm0 movlps [edi-5*8], xmm0 movhps [edi-4*8], xmm0 movhps [edi-3*8], xmm0 movhps [edi-2*8], xmm0 movhps [edi-1*8], xmm0 add eax, 2*4 jl loop2 done2: mov eax, numSamples and eax, 1 jz done movss xmm0, [ecx] movss xmm1, [edx] unpcklps xmm0, xmm1 mulps xmm0, xmm7 movlps [edi+0*8], xmm0 movlps [edi+1*8], xmm0 movlps [edi+2*8], xmm0 movlps [edi+3*8], xmm0 done: } } /* ============ SSE_UpSample22kHzMonoOGGTo44kHz ============ */ static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) { float constant = 32768.0f; __asm { mov esi, src mov edi, dest movss xmm7, constant shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov eax, numSamples and eax, ~1 jz done2 shl eax, 2 add esi, eax neg eax align 16 loop2: add edi, 2*8 movss xmm0, [esi+eax+0] movss xmm1, [esi+eax+4] shufps xmm0, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 ) mulps xmm0, xmm7 movlps [edi-16], xmm0 movhps [edi- 8], xmm0 add eax, 2*4 jl loop2 done2: mov eax, numSamples and eax, 1 jz done movss xmm0, [esi] mulss xmm0, xmm7 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 0, 0 ) movlps [edi+0], xmm0 done: } } /* ============ SSE_UpSample22kHzStereoOGGTo44kHz ============ */ static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) { float constant = 32768.0f; __asm { mov 
esi, src mov ecx, [esi+0] mov edx, [esi+4] mov edi, dest movss xmm7, constant shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov eax, numSamples and eax, ~1 jz done2 shl eax, 1 add ecx, eax add edx, eax neg eax align 16 loop2: add edi, 2*16 movlps xmm0, [ecx+eax] movlps xmm1, [edx+eax] unpcklps xmm0, xmm1 mulps xmm0, xmm7 movlps [edi-4*8], xmm0 movlps [edi-3*8], xmm0 movhps [edi-2*8], xmm0 movhps [edi-1*8], xmm0 add eax, 2*4 jl loop2 done2: mov eax, numSamples and eax, 1 jz done movss xmm0, [ecx] movss xmm1, [edx] unpcklps xmm0, xmm1 mulps xmm0, xmm7 movlps [edi+0*8], xmm0 movlps [edi+1*8], xmm0 done: } } /* ============ SSE_UpSample44kHzMonoOGGTo44kHz ============ */ static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) { float constant = 32768.0f; KFLOAT_CA( mul, dest, src, constant, numSamples ) } /* ============ SSE_UpSample44kHzStereoOGGTo44kHz ============ */ static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) { float constant = 32768.0f; __asm { mov esi, src mov ecx, [esi+0] mov edx, [esi+4] mov edi, dest movss xmm7, constant shufps xmm7, xmm7, R_SHUFFLE_PS( 0, 0, 0, 0 ) mov eax, numSamples and eax, ~1 jz done2 shl eax, 1 add ecx, eax add edx, eax neg eax align 16 loop2: add edi, 16 movlps xmm0, [ecx+eax] movlps xmm1, [edx+eax] unpcklps xmm0, xmm1 mulps xmm0, xmm7 movlps [edi-2*8], xmm0 movhps [edi-1*8], xmm0 add eax, 2*4 jl loop2 done2: mov eax, numSamples and eax, 1 jz done movss xmm0, [ecx] movss xmm1, [edx] unpcklps xmm0, xmm1 mulps xmm0, xmm7 movlps [edi+0*8], xmm0 done: } } /* ============ idSIMD_SSE::UpSampleOGGTo44kHz Duplicate samples for 44kHz output. ============ */ void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) { if ( kHz == 11025 ) { if ( numChannels == 1 ) { SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples ); } else { SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples ); } } else if ( kHz == 22050 ) { if ( numChannels == 1 ) { SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples ); } else { SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples ); } } else if ( kHz == 44100 ) { if ( numChannels == 1 ) { SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples ); } else { SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples ); } } else { assert( 0 ); } } /* ============ idSIMD_SSE::MixSoundTwoSpeakerMono ============ */ void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) { #if 1 ALIGN16( float incs[2]; ) assert_16_byte_aligned( mixBuffer ); assert_16_byte_aligned( samples ); assert( numSamples == MIXBUFFER_SAMPLES ); incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; __asm { mov eax, MIXBUFFER_SAMPLES mov edi, mixBuffer mov esi, samples shl eax, 2 add esi, eax neg eax mov ecx, lastV movlps xmm6, [ecx] xorps xmm7, xmm7 movhps xmm7, incs shufps xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 ) addps xmm6, xmm7 shufps xmm7, xmm7, R_SHUFFLE_PS( 2, 3, 2, 3 ) addps xmm7, xmm7 loop16: add edi, 4*4*4 movaps xmm0, [esi+eax+0*4*4] movaps xmm1, xmm0 shufps xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 ) mulps xmm0, xmm6 addps xmm0, [edi-4*4*4] addps xmm6, xmm7 movaps [edi-4*4*4], xmm0 shufps xmm1, xmm1, R_SHUFFLE_PS( 2, 2, 3, 3 ) mulps xmm1, xmm6 addps xmm1, [edi-3*4*4] addps xmm6, xmm7 movaps [edi-3*4*4], xmm1 movaps 
/*
============
idSIMD_SSE::MixSoundTwoSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1

	ALIGN16( float incs[2]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 2
		add		esi, eax
		neg		eax
		mov		ecx, lastV
		movlps	xmm6, [ecx]
		xorps	xmm7, xmm7
		movhps	xmm7, incs
		shufps	xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 )
		addps	xmm6, xmm7
		shufps	xmm7, xmm7, R_SHUFFLE_PS( 2, 3, 2, 3 )
		addps	xmm7, xmm7
	loop16:
		add		edi, 4*4*4
		movaps	xmm0, [esi+eax+0*4*4]
		movaps	xmm1, xmm0
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 0, 0, 1, 1 )
		mulps	xmm0, xmm6
		addps	xmm0, [edi-4*4*4]
		addps	xmm6, xmm7
		movaps	[edi-4*4*4], xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 2, 2, 3, 3 )
		mulps	xmm1, xmm6
		addps	xmm1, [edi-3*4*4]
		addps	xmm6, xmm7
		movaps	[edi-3*4*4], xmm1
		movaps	xmm2, [esi+eax+1*4*4]
		movaps	xmm3, xmm2
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 1, 1 )
		mulps	xmm2, xmm6
		addps	xmm2, [edi-2*4*4]
		addps	xmm6, xmm7
		movaps	[edi-2*4*4], xmm2
		shufps	xmm3, xmm3, R_SHUFFLE_PS( 2, 2, 3, 3 )
		mulps	xmm3, xmm6
		addps	xmm3, [edi-1*4*4]
		addps	xmm6, xmm7
		movaps	[edi-1*4*4], xmm3
		add		eax, 2*4*4
		jl		loop16
	}

#else

	int i;
	float incL;
	float incR;
	float sL0, sL1, sL2, sL3;
	float sR0, sR1, sR2, sR3;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;
	sL2 = lastV[0] + 2 * incL;
	sR2 = lastV[1] + 2 * incR;
	sL3 = lastV[0] + 3 * incL;
	sR3 = lastV[1] + 3 * incR;

	incL *= 4;
	incR *= 4;

	// each mono sample i produces the stereo pair mixBuffer[i*2+0..1];
	// the original indexing wrote mixBuffer[i*4+0..3] twice per iteration,
	// which double-mixed four slots and skipped the other four
	for ( i = 0; i < MIXBUFFER_SAMPLES; i += 4 ) {
		mixBuffer[i*2+0] += samples[i+0] * sL0;
		mixBuffer[i*2+1] += samples[i+0] * sR0;
		mixBuffer[i*2+2] += samples[i+1] * sL1;
		mixBuffer[i*2+3] += samples[i+1] * sR1;
		mixBuffer[i*2+4] += samples[i+2] * sL2;
		mixBuffer[i*2+5] += samples[i+2] * sR2;
		mixBuffer[i*2+6] += samples[i+3] * sL3;
		mixBuffer[i*2+7] += samples[i+3] * sR3;
		sL0 += incL; sR0 += incR;
		sL1 += incL; sR1 += incR;
		sL2 += incL; sR2 += incR;
		sL3 += incL; sR3 += incR;
	}

#endif
}

/*
============
idSIMD_SSE::MixSoundTwoSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1

	ALIGN16( float incs[2]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 3
		add		esi, eax
		neg		eax
		mov		ecx, lastV
		movlps	xmm6, [ecx]
		xorps	xmm7, xmm7
		movhps	xmm7, incs
		shufps	xmm6, xmm6, R_SHUFFLE_PS( 0, 1, 0, 1 )
		addps	xmm6, xmm7
		shufps	xmm7, xmm7, R_SHUFFLE_PS( 2, 3, 2, 3 )
		addps	xmm7, xmm7
	loop16:
		add		edi, 4*4*4
		movaps	xmm0, [esi+eax+0*4*4]
		mulps	xmm0, xmm6
		addps	xmm0, [edi-4*4*4]
		addps	xmm6, xmm7
		movaps	[edi-4*4*4], xmm0
		movaps	xmm2, [esi+eax+1*4*4]
		mulps	xmm2, xmm6
		addps	xmm2, [edi-3*4*4]
		addps	xmm6, xmm7
		movaps	[edi-3*4*4], xmm2
		movaps	xmm3, [esi+eax+2*4*4]
		mulps	xmm3, xmm6
		addps	xmm3, [edi-2*4*4]
		addps	xmm6, xmm7
		movaps	[edi-2*4*4], xmm3
		movaps	xmm4, [esi+eax+3*4*4]
		mulps	xmm4, xmm6
		addps	xmm4, [edi-1*4*4]
		addps	xmm6, xmm7
		movaps	[edi-1*4*4], xmm4
		add		eax, 4*4*4
		jl		loop16
	}

#else

	int i;
	float incL;
	float incR;
	float sL0, sL1;
	float sR0, sR1;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sR0 = lastV[1];
	sL1 = lastV[0] + incL;
	sR1 = lastV[1] + incR;

	incL *= 2;
	incR *= 2;

	for ( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*2+0] += samples[i*2+0] * sL0;
		mixBuffer[i*2+1] += samples[i*2+1] * sR0;
		mixBuffer[i*2+2] += samples[i*2+2] * sL1;
		mixBuffer[i*2+3] += samples[i*2+3] * sR1;
		sL0 += incL; sR0 += incR;
		sL1 += incL; sR1 += incR;
	}

#endif
}
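/*
	All of the MixSound* routines below use the same volume ramp: the gain applied
	to output frame i of channel c is lastV[c] + i * ( currentV[c] - lastV[c] ) / MIXBUFFER_SAMPLES,
	so the volume moves linearly from lastV to currentV over the buffer. The SSE
	loops keep the gains for consecutive frames in separate registers and pre-double
	the increments, which removes the serial dependency between frames. A compiled-out
	scalar sketch of that two-gain pipelining (hypothetical names, single channel,
	for illustration only):
*/
#if 0
static void RampedMix_Sketch( float *out, const float *in, const int numFrames, float gEven, float gOdd, const float inc ) {
	// gEven starts at the last volume, gOdd one increment ahead;
	// both step by 2 * inc, so the two gain chains are independent
	const float inc2 = 2.0f * inc;
	for ( int i = 0; i < numFrames; i += 2 ) {
		out[i+0] += in[i+0] * gEven;
		out[i+1] += in[i+1] * gOdd;
		gEven += inc2;
		gOdd += inc2;
	}
}
#endif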
/*
============
idSIMD_SSE::MixSoundFourSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundFourSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	ALIGN16( float incs[4]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 2
		add		esi, eax
		neg		eax
		mov		ecx, lastV
		movlps	xmm2, [ecx+ 0]
		movhps	xmm2, [ecx+16]
		movaps	xmm3, xmm2
		movlps	xmm4, incs
		movhps	xmm4, incs+8
		addps	xmm3, xmm4
		addps	xmm4, xmm4
		/*
			xmm2: lastV[0], lastV[1], lastV[4], lastV[5]
			xmm3: lastV[0] + inc[0], lastV[1] + inc[1], lastV[4] + inc[2], lastV[5] + inc[3]
			xmm4: 2 * inc[0], 2 * inc[1], 2 * inc[2], 2 * inc[3]
		*/
	loop16:
		add		edi, 4*4*4
		movaps	xmm0, [esi+eax+0*4*4]
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )
		mulps	xmm1, xmm2
		addps	xmm1, [edi-4*4*4]
		addps	xmm2, xmm4
		movaps	[edi-4*4*4], xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 )
		mulps	xmm1, xmm3
		addps	xmm1, [edi-3*4*4]
		addps	xmm3, xmm4
		movaps	[edi-3*4*4], xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 2, 2, 2, 2 )
		mulps	xmm1, xmm2
		addps	xmm1, [edi-2*4*4]
		addps	xmm2, xmm4
		movaps	[edi-2*4*4], xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 3, 3, 3, 3 )
		mulps	xmm0, xmm3
		addps	xmm0, [edi-1*4*4]
		addps	xmm3, xmm4
		movaps	[edi-1*4*4], xmm0
		add		eax, 4*4
		jl		loop16
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7;
	float incL0, incL1, incL2, incL3;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[4];
	sL3 = lastV[5];
	sL4 = lastV[0] + incL0;
	sL5 = lastV[1] + incL1;
	sL6 = lastV[4] + incL2;
	sL7 = lastV[5] + incL3;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;

	// loop to MIXBUFFER_SAMPLES; the original bound of MIXBUFFER_SAMPLES - 4
	// left the last four samples unmixed
	for ( i = 0; i < MIXBUFFER_SAMPLES; i += 4 ) {
		mixBuffer[i*4+ 0] += samples[i+0] * sL0;
		mixBuffer[i*4+ 1] += samples[i+0] * sL1;
		mixBuffer[i*4+ 2] += samples[i+0] * sL2;
		mixBuffer[i*4+ 3] += samples[i+0] * sL3;
		mixBuffer[i*4+ 4] += samples[i+1] * sL4;
		mixBuffer[i*4+ 5] += samples[i+1] * sL5;
		mixBuffer[i*4+ 6] += samples[i+1] * sL6;
		mixBuffer[i*4+ 7] += samples[i+1] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL0; sL5 += incL1; sL6 += incL2; sL7 += incL3;
		mixBuffer[i*4+ 8] += samples[i+2] * sL0;
		mixBuffer[i*4+ 9] += samples[i+2] * sL1;
		mixBuffer[i*4+10] += samples[i+2] * sL2;
		mixBuffer[i*4+11] += samples[i+2] * sL3;
		mixBuffer[i*4+12] += samples[i+3] * sL4;
		mixBuffer[i*4+13] += samples[i+3] * sL5;
		mixBuffer[i*4+14] += samples[i+3] * sL6;
		mixBuffer[i*4+15] += samples[i+3] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL0; sL5 += incL1; sL6 += incL2; sL7 += incL3;
	}

#endif
}
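/*
	Note that the four speaker mix takes six entry volume arrays but only uses
	entries 0, 1, 4 and 5 — front left/right and back left/right — which is why
	incs[2] and incs[3] above are built from currentV[4] and currentV[5] rather
	than currentV[2] and currentV[3].
*/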
/*
============
idSIMD_SSE::MixSoundFourSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundFourSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	ALIGN16( float incs[4]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );
	compile_time_assert( SPEAKER_RIGHT == 1 );
	compile_time_assert( SPEAKER_BACKRIGHT == 5 );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 3
		add		esi, eax
		neg		eax
		mov		ecx, lastV
		movlps	xmm5, [ecx+ 0]
		movhps	xmm5, [ecx+16]
		movaps	xmm6, xmm5
		movlps	xmm7, incs
		movhps	xmm7, incs+8
		addps	xmm6, xmm7
		addps	xmm7, xmm7
		/*
			xmm5: lastV[0], lastV[1], lastV[4], lastV[5]
			xmm6: lastV[0] + incs[0], lastV[1] + incs[1], lastV[4] + incs[2], lastV[5] + incs[3]
			xmm7: 2 * incs[0], 2 * incs[1], 2 * incs[2], 2 * incs[3]
		*/
	loop16:
		add		edi, 4*4*4
		movaps	xmm0, [esi+eax+0*4*4]
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 1, 0, 1 )
		mulps	xmm1, xmm5
		addps	xmm1, [edi-4*4*4]
		addps	xmm5, xmm7
		movaps	[edi-4*4*4], xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 2, 3, 2, 3 )
		mulps	xmm0, xmm6
		addps	xmm0, [edi-3*4*4]
		addps	xmm6, xmm7
		movaps	[edi-3*4*4], xmm0
		movaps	xmm2, [esi+eax+1*4*4]
		movaps	xmm3, xmm2
		shufps	xmm3, xmm3, R_SHUFFLE_PS( 0, 1, 0, 1 )
		mulps	xmm3, xmm5
		addps	xmm3, [edi-2*4*4]
		addps	xmm5, xmm7
		movaps	[edi-2*4*4], xmm3
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 2, 3, 2, 3 )
		mulps	xmm2, xmm6
		addps	xmm2, [edi-1*4*4]
		addps	xmm6, xmm7
		movaps	[edi-1*4*4], xmm2
		add		eax, 2*4*4
		jl		loop16
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7;
	float incL0, incL1, incL2, incL3;

	assert( numSamples == MIXBUFFER_SAMPLES );
	compile_time_assert( SPEAKER_RIGHT == 1 );
	compile_time_assert( SPEAKER_BACKRIGHT == 5 );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[4];
	sL3 = lastV[5];
	sL4 = lastV[0] + incL0;
	sL5 = lastV[1] + incL1;
	sL6 = lastV[4] + incL2;
	sL7 = lastV[5] + incL3;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;

	for ( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
		mixBuffer[i*4+ 0] += samples[i*2+0+0] * sL0;
		mixBuffer[i*4+ 1] += samples[i*2+0+1] * sL1;
		mixBuffer[i*4+ 2] += samples[i*2+0+0] * sL2;
		mixBuffer[i*4+ 3] += samples[i*2+0+1] * sL3;
		mixBuffer[i*4+ 4] += samples[i*2+2+0] * sL4;
		mixBuffer[i*4+ 5] += samples[i*2+2+1] * sL5;
		mixBuffer[i*4+ 6] += samples[i*2+2+0] * sL6;
		mixBuffer[i*4+ 7] += samples[i*2+2+1] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL0; sL5 += incL1; sL6 += incL2; sL7 += incL3;
		mixBuffer[i*4+ 8] += samples[i*2+4+0] * sL0;
		mixBuffer[i*4+ 9] += samples[i*2+4+1] * sL1;
		mixBuffer[i*4+10] += samples[i*2+4+0] * sL2;
		mixBuffer[i*4+11] += samples[i*2+4+1] * sL3;
		mixBuffer[i*4+12] += samples[i*2+6+0] * sL4;
		mixBuffer[i*4+13] += samples[i*2+6+1] * sL5;
		mixBuffer[i*4+14] += samples[i*2+6+0] * sL6;
		mixBuffer[i*4+15] += samples[i*2+6+1] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL0; sL5 += incL1; sL6 += incL2; sL7 += incL3;
	}

#endif
}
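/*
	The stereo variants differ from their mono counterparts in two ways: the input
	pointer advances eight bytes per frame instead of four ( 'shl eax, 3' versus
	'shl eax, 2' ), and the shuffles select the left or right input channel per
	speaker instead of broadcasting a single mono sample across all of them.
*/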
/*
============
idSIMD_SSE::MixSoundSixSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	ALIGN16( float incs[6]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 2
		add		esi, eax
		neg		eax
		mov		ecx, lastV
		movlps	xmm2, [ecx+ 0]
		movhps	xmm2, [ecx+ 8]
		movlps	xmm3, [ecx+16]
		movaps	xmm4, xmm2
		shufps	xmm3, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 )
		shufps	xmm4, xmm3, R_SHUFFLE_PS( 2, 3, 0, 1 )
		xorps	xmm5, xmm5
		movhps	xmm5, incs
		movlps	xmm7, incs+8
		movhps	xmm7, incs+16
		addps	xmm3, xmm5
		addps	xmm4, xmm7
		shufps	xmm5, xmm7, R_SHUFFLE_PS( 2, 3, 0, 1 )
		movaps	xmm6, xmm7
		shufps	xmm6, xmm5, R_SHUFFLE_PS( 2, 3, 0, 1 )
		addps	xmm5, xmm5
		addps	xmm6, xmm6
		addps	xmm7, xmm7
	loop24:
		add		edi, 6*16
		movaps	xmm0, [esi+eax]
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )
		mulps	xmm1, xmm2
		addps	xmm1, [edi-6*16]
		addps	xmm2, xmm5
		movaps	[edi-6*16], xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 1, 1 )
		mulps	xmm1, xmm3
		addps	xmm1, [edi-5*16]
		addps	xmm3, xmm6
		movaps	[edi-5*16], xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 )
		mulps	xmm1, xmm4
		addps	xmm1, [edi-4*16]
		addps	xmm4, xmm7
		movaps	[edi-4*16], xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 2, 2, 2, 2 )
		mulps	xmm1, xmm2
		addps	xmm1, [edi-3*16]
		addps	xmm2, xmm5
		movaps	[edi-3*16], xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 2, 2, 3, 3 )
		mulps	xmm1, xmm3
		addps	xmm1, [edi-2*16]
		addps	xmm3, xmm6
		movaps	[edi-2*16], xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 3, 3, 3, 3 )
		mulps	xmm0, xmm4
		addps	xmm0, [edi-1*16]
		addps	xmm4, xmm7
		movaps	[edi-1*16], xmm0
		add		eax, 4*4
		jl		loop24
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];
	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for ( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
		mixBuffer[i*6+ 0] += samples[i+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i+0] * sL1;
		mixBuffer[i*6+ 2] += samples[i+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i+0] * sL3;
		mixBuffer[i*6+ 4] += samples[i+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i+0] * sL5;
		mixBuffer[i*6+ 6] += samples[i+1] * sL6;
		mixBuffer[i*6+ 7] += samples[i+1] * sL7;
		mixBuffer[i*6+ 8] += samples[i+1] * sL8;
		mixBuffer[i*6+ 9] += samples[i+1] * sL9;
		mixBuffer[i*6+10] += samples[i+1] * sL10;
		mixBuffer[i*6+11] += samples[i+1] * sL11;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; sL4 += incL4; sL5 += incL5;
		sL6 += incL0; sL7 += incL1; sL8 += incL2; sL9 += incL3; sL10 += incL4; sL11 += incL5;
		mixBuffer[i*6+12] += samples[i+2] * sL0;
		mixBuffer[i*6+13] += samples[i+2] * sL1;
		mixBuffer[i*6+14] += samples[i+2] * sL2;
		mixBuffer[i*6+15] += samples[i+2] * sL3;
		mixBuffer[i*6+16] += samples[i+2] * sL4;
		mixBuffer[i*6+17] += samples[i+2] * sL5;
		mixBuffer[i*6+18] += samples[i+3] * sL6;
		mixBuffer[i*6+19] += samples[i+3] * sL7;
		mixBuffer[i*6+20] += samples[i+3] * sL8;
		mixBuffer[i*6+21] += samples[i+3] * sL9;
		mixBuffer[i*6+22] += samples[i+3] * sL10;
		mixBuffer[i*6+23] += samples[i+3] * sL11;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; sL4 += incL4; sL5 += incL5;
		sL6 += incL0; sL7 += incL1; sL8 += incL2; sL9 += incL3; sL10 += incL4; sL11 += incL5;
	}

#endif
}
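/*
	Six channel gains do not fit a single four-wide register, so the loop above
	rotates three gain registers ( xmm2, xmm3, xmm4 ) covering twelve floats — two
	full 5.1 frames — together with three matching doubled increments ( xmm5, xmm6,
	xmm7 ). Each iteration consumes four mono input samples and writes 24 floats
	( 6*16 bytes ) of output.
*/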
/*
============
idSIMD_SSE::MixSoundSixSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1

	ALIGN16( float incs[6]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );
	compile_time_assert( SPEAKER_RIGHT == 1 );
	compile_time_assert( SPEAKER_BACKRIGHT == 5 );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 3
		add		esi, eax
		neg		eax
		mov		ecx, lastV				// ecx = lastV
		movlps	xmm2, [ecx+ 0]
		movhps	xmm2, [ecx+ 8]			// xmm2 = lastV[0], lastV[1], lastV[2], lastV[3]
		movlps	xmm3, [ecx+16]			// xmm3 = lastV[4], lastV[5], ???, ???
		movaps	xmm4, xmm2				// xmm4 = xmm2
		shufps	xmm3, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 )	// xmm3 = lastV[4], lastV[5], lastV[0], lastV[1]
		shufps	xmm4, xmm3, R_SHUFFLE_PS( 2, 3, 0, 1 )	// xmm4 = lastV[2], lastV[3], lastV[4], lastV[5]
		xorps	xmm5, xmm5				// xmm5 = 0
		movhps	xmm5, incs				// xmm5 = 0, 0, incs[0], incs[1]
		movlps	xmm7, incs+ 8
		movhps	xmm7, incs+16			// xmm7 = incs[2], incs[3], incs[4], incs[5]
		addps	xmm3, xmm5				// xmm3 = lastV[4], lastV[5], lastV[0] + incs[0], lastV[1] + incs[1]
		addps	xmm4, xmm7				// xmm4 = lastV[2] + incs[2], lastV[3] + incs[3], lastV[4] + incs[4], lastV[5] + incs[5]
		shufps	xmm5, xmm7, R_SHUFFLE_PS( 2, 3, 0, 1 )	// xmm5 = incs[0], incs[1], incs[2], incs[3]
		movaps	xmm6, xmm7				// xmm6 = xmm7
		shufps	xmm6, xmm5, R_SHUFFLE_PS( 2, 3, 0, 1 )	// xmm6 = incs[4], incs[5], incs[0], incs[1]
		addps	xmm5, xmm5				// xmm5 = 2 * incs[0], 2 * incs[1], 2 * incs[2], 2 * incs[3]
		addps	xmm6, xmm6				// xmm6 = 2 * incs[4], 2 * incs[5], 2 * incs[0], 2 * incs[1]
		addps	xmm7, xmm7				// xmm7 = 2 * incs[2], 2 * incs[3], 2 * incs[4], 2 * incs[5]
		/*
			xmm2: lastV[0], lastV[1], lastV[2], lastV[3]
			xmm3: lastV[4], lastV[5], lastV[0] + incs[0], lastV[1] + incs[1]
			xmm4: lastV[2] + incs[2], lastV[3] + incs[3], lastV[4] + incs[4], lastV[5] + incs[5]
			xmm5: 2 * incs[0], 2 * incs[1], 2 * incs[2], 2 * incs[3]
			xmm6: 2 * incs[4], 2 * incs[5], 2 * incs[0], 2 * incs[1]
			xmm7: 2 * incs[2], 2 * incs[3], 2 * incs[4], 2 * incs[5]
		*/
	loop12:
		add		edi, 3*16
		movaps	xmm0, [esi+eax+0]							// xmm0 = s[0], s[1], s[2], s[3]: two stereo frames, indexed back from the buffer end via the negative counter
		movaps	xmm1, xmm0									// xmm1 = copy of the four loaded samples
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 1, 0, 0 )		// xmm1 = s[0], s[1], s[0], s[0]
		mulps	xmm1, xmm2									// xmm1 = s[0] * v[0], s[1] * v[1], s[0] * v[2], s[0] * v[3]
		addps	xmm1, [edi-3*16]
		addps	xmm2, xmm5
		movaps	[edi-3*16], xmm1
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 1, 2, 3 )
		mulps	xmm1, xmm3
		addps	xmm1, [edi-2*16]
		addps	xmm3, xmm6
		movaps	[edi-2*16], xmm1
		add		eax, 4*4
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 2, 2, 2, 3 )
		mulps	xmm0, xmm4
		addps	xmm0, [edi-1*16]
		addps	xmm4, xmm7
		movaps	[edi-1*16], xmm0
		jl		loop12
		emms
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
	float incL0, incL1, incL2, incL3, incL4, incL5;

	assert( numSamples == MIXBUFFER_SAMPLES );
	compile_time_assert( SPEAKER_RIGHT == 1 );
	compile_time_assert( SPEAKER_BACKRIGHT == 5 );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];
	sL6 = lastV[0] + incL0;
	sL7 = lastV[1] + incL1;
	sL8 = lastV[2] + incL2;
	sL9 = lastV[3] + incL3;
	sL10 = lastV[4] + incL4;
	sL11 = lastV[5] + incL5;

	incL0 *= 2;
	incL1 *= 2;
	incL2 *= 2;
	incL3 *= 2;
	incL4 *= 2;
	incL5 *= 2;

	for ( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
		mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
		mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
		mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
		mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
		mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
		mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
		mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
		mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
		mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
		mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
		mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
		mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; sL4 += incL4; sL5 += incL5;
		sL6 += incL0; sL7 += incL1; sL8 += incL2; sL9 += incL3; sL10 += incL4; sL11 += incL5;
	}

#endif
}
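/*
	In the stereo to 5.1 mapping above, the left input sample feeds output channels
	0, 2, 3 and 4 and the right input sample feeds channels 1 and 5 ( SPEAKER_RIGHT
	and SPEAKER_BACKRIGHT, per the compile_time_asserts ); the reference (#else)
	path's sample indices spell this out.
*/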
/*
============
idSIMD_SSE::MixSoundEightSpeakerMono
============
*/
void VPCALL idSIMD_SSE::MixSoundEightSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[8], const float currentV[8] ) {
#if 1

	ALIGN16( float incs[8]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
	incs[6] = ( currentV[6] - lastV[6] ) / MIXBUFFER_SAMPLES;
	incs[7] = ( currentV[7] - lastV[7] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 2
		add		esi, eax
		neg		eax
		mov		ecx, lastV
		movlps	xmm4, [ecx+ 0]
		movhps	xmm4, [ecx+ 8]
		movlps	xmm5, [ecx+16]
		movhps	xmm5, [ecx+24]
		movlps	xmm6, incs
		movhps	xmm6, incs+8
		movlps	xmm7, incs+16
		movhps	xmm7, incs+24
		/*
			xmm4: lastV[0], lastV[1], lastV[2], lastV[3]
			xmm5: lastV[4], lastV[5], lastV[6], lastV[7]
			xmm6: inc[0], inc[1], inc[2], inc[3]
			xmm7: inc[4], inc[5], inc[6], inc[7]
		*/
	loop32:
		add		edi, 8*4*4
		movaps	xmm0, [esi+eax+0*4*4]
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 0, 0, 0 )
		movaps	xmm2, xmm1
		mulps	xmm1, xmm4
		addps	xmm1, [edi-8*4*4]
		addps	xmm4, xmm6
		movaps	[edi-8*4*4], xmm1
		mulps	xmm2, xmm5
		addps	xmm2, [edi-7*4*4]
		addps	xmm5, xmm7
		movaps	[edi-7*4*4], xmm2
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 1, 1, 1, 1 )
		movaps	xmm2, xmm1
		mulps	xmm1, xmm4
		addps	xmm1, [edi-6*4*4]
		addps	xmm4, xmm6
		movaps	[edi-6*4*4], xmm1
		mulps	xmm2, xmm5
		addps	xmm2, [edi-5*4*4]
		addps	xmm5, xmm7
		movaps	[edi-5*4*4], xmm2
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 2, 2, 2, 2 )
		movaps	xmm2, xmm1
		mulps	xmm1, xmm4
		addps	xmm1, [edi-4*4*4]
		addps	xmm4, xmm6
		movaps	[edi-4*4*4], xmm1
		mulps	xmm2, xmm5
		addps	xmm2, [edi-3*4*4]
		addps	xmm5, xmm7
		movaps	[edi-3*4*4], xmm2
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 3, 3, 3, 3 )
		movaps	xmm2, xmm0
		mulps	xmm0, xmm4
		addps	xmm0, [edi-2*4*4]
		addps	xmm4, xmm6
		movaps	[edi-2*4*4], xmm0
		mulps	xmm2, xmm5
		addps	xmm2, [edi-1*4*4]
		addps	xmm5, xmm7
		movaps	[edi-1*4*4], xmm2
		add		eax, 4*4
		jl		loop32
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7;
	float incL0, incL1, incL2, incL3, incL4, incL5, incL6, incL7;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
	incL6 = ( currentV[6] - lastV[6] ) / MIXBUFFER_SAMPLES;
	incL7 = ( currentV[7] - lastV[7] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];
	sL6 = lastV[6];
	sL7 = lastV[7];

	// loop to MIXBUFFER_SAMPLES; the original bound of MIXBUFFER_SAMPLES - 4
	// left the last four samples unmixed
	for ( i = 0; i < MIXBUFFER_SAMPLES; i += 4 ) {
		mixBuffer[i*8+ 0] += samples[i+0] * sL0;
		mixBuffer[i*8+ 1] += samples[i+0] * sL1;
		mixBuffer[i*8+ 2] += samples[i+0] * sL2;
		mixBuffer[i*8+ 3] += samples[i+0] * sL3;
		mixBuffer[i*8+ 4] += samples[i+0] * sL4;
		mixBuffer[i*8+ 5] += samples[i+0] * sL5;
		mixBuffer[i*8+ 6] += samples[i+0] * sL6;
		mixBuffer[i*8+ 7] += samples[i+0] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7;
		mixBuffer[i*8+ 8] += samples[i+1] * sL0;
		mixBuffer[i*8+ 9] += samples[i+1] * sL1;
		mixBuffer[i*8+10] += samples[i+1] * sL2;
		mixBuffer[i*8+11] += samples[i+1] * sL3;
		mixBuffer[i*8+12] += samples[i+1] * sL4;
		mixBuffer[i*8+13] += samples[i+1] * sL5;
		mixBuffer[i*8+14] += samples[i+1] * sL6;
		mixBuffer[i*8+15] += samples[i+1] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7;
		mixBuffer[i*8+16] += samples[i+2] * sL0;
		mixBuffer[i*8+17] += samples[i+2] * sL1;
		mixBuffer[i*8+18] += samples[i+2] * sL2;
		mixBuffer[i*8+19] += samples[i+2] * sL3;
		mixBuffer[i*8+20] += samples[i+2] * sL4;
		mixBuffer[i*8+21] += samples[i+2] * sL5;
		mixBuffer[i*8+22] += samples[i+2] * sL6;
		mixBuffer[i*8+23] += samples[i+2] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7;
		mixBuffer[i*8+24] += samples[i+3] * sL0;
		mixBuffer[i*8+25] += samples[i+3] * sL1;
		mixBuffer[i*8+26] += samples[i+3] * sL2;
		mixBuffer[i*8+27] += samples[i+3] * sL3;
		mixBuffer[i*8+28] += samples[i+3] * sL4;
		mixBuffer[i*8+29] += samples[i+3] * sL5;
		mixBuffer[i*8+30] += samples[i+3] * sL6;
		mixBuffer[i*8+31] += samples[i+3] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7;
	}

#endif
}
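/*
	The 7.1 mono loop holds the eight channel gains in two registers — xmm4 for
	channels 0-3 and xmm5 for channels 4-7 — and broadcasts one input sample across
	both, so each iteration of four input samples produces four full eight channel
	output frames ( 8*4*4 bytes ).
*/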
/*
============
idSIMD_SSE::MixSoundEightSpeakerStereo
============
*/
void VPCALL idSIMD_SSE::MixSoundEightSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[8], const float currentV[8] ) {
#if 1

	ALIGN16( float incs[8]; )

	assert_16_byte_aligned( mixBuffer );
	assert_16_byte_aligned( samples );
	assert( numSamples == MIXBUFFER_SAMPLES );
	compile_time_assert( SPEAKER_RIGHT == 1 );
	compile_time_assert( SPEAKER_BACKRIGHT == 5 );
	compile_time_assert( SPEAKER_SIDERIGHT == 7 );

	incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
	incs[6] = ( currentV[6] - lastV[6] ) / MIXBUFFER_SAMPLES;
	incs[7] = ( currentV[7] - lastV[7] ) / MIXBUFFER_SAMPLES;

	__asm {
		mov		eax, MIXBUFFER_SAMPLES
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 3
		add		esi, eax
		neg		eax
		mov		ecx, lastV
		movlps	xmm4, [ecx+ 0]
		movhps	xmm4, [ecx+ 8]
		movlps	xmm5, [ecx+16]
		movhps	xmm5, [ecx+24]
		movlps	xmm6, incs
		movhps	xmm6, incs+8
		movlps	xmm7, incs+16
		movhps	xmm7, incs+24
		/*
			xmm4: lastV[0], lastV[1], lastV[2], lastV[3]
			xmm5: lastV[4], lastV[5], lastV[6], lastV[7]
			xmm6: inc[0], inc[1], inc[2], inc[3]
			xmm7: inc[4], inc[5], inc[6], inc[7]
		*/
	loop16:
		add		edi, 4*4*4
		movaps	xmm0, [esi+eax+0*4*4]
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 0, 1, 0, 0 )
		movaps	xmm2, xmm1
		mulps	xmm1, xmm4
		addps	xmm1, [edi-4*4*4]
		addps	xmm4, xmm6
		movaps	[edi-4*4*4], xmm1
		movaps	xmm2, xmm0
		shufps	xmm2, xmm2, R_SHUFFLE_PS( 0, 1, 0, 1 )
		mulps	xmm2, xmm5
		addps	xmm2, [edi-3*4*4]
		addps	xmm5, xmm7
		movaps	[edi-3*4*4], xmm2
		movaps	xmm1, xmm0
		shufps	xmm1, xmm1, R_SHUFFLE_PS( 2, 3, 2, 2 )
		movaps	xmm2, xmm1
		mulps	xmm1, xmm4
		addps	xmm1, [edi-2*4*4]
		addps	xmm4, xmm6
		movaps	[edi-2*4*4], xmm1
		shufps	xmm0, xmm0, R_SHUFFLE_PS( 2, 3, 2, 3 )
		mulps	xmm0, xmm5
		addps	xmm0, [edi-1*4*4]
		addps	xmm5, xmm7
		movaps	[edi-1*4*4], xmm0
		add		eax, 4*4
		jl		loop16
	}

#else

	int i;
	float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7;
	float incL0, incL1, incL2, incL3, incL4, incL5, incL6, incL7;

	assert( numSamples == MIXBUFFER_SAMPLES );

	incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
	incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
	incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
	incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
	incL6 = ( currentV[6] - lastV[6] ) / MIXBUFFER_SAMPLES;
	incL7 = ( currentV[7] - lastV[7] ) / MIXBUFFER_SAMPLES;

	sL0 = lastV[0];
	sL1 = lastV[1];
	sL2 = lastV[2];
	sL3 = lastV[3];
	sL4 = lastV[4];
	sL5 = lastV[5];
	sL6 = lastV[6];
	sL7 = lastV[7];

	// loop to MIXBUFFER_SAMPLES; the original bound of MIXBUFFER_SAMPLES - 2
	// left the last two frames unmixed
	for ( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
		mixBuffer[i*8+ 0] += samples[i*2+0+0] * sL0;
		mixBuffer[i*8+ 1] += samples[i*2+0+1] * sL1;
		mixBuffer[i*8+ 2] += samples[i*2+0+0] * sL2;
		mixBuffer[i*8+ 3] += samples[i*2+0+0] * sL3;
		mixBuffer[i*8+ 4] += samples[i*2+0+0] * sL4;
		mixBuffer[i*8+ 5] += samples[i*2+0+1] * sL5;
		mixBuffer[i*8+ 6] += samples[i*2+0+0] * sL6;
		mixBuffer[i*8+ 7] += samples[i*2+0+1] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7;
		mixBuffer[i*8+ 8] += samples[i*2+2+0] * sL0;
		mixBuffer[i*8+ 9] += samples[i*2+2+1] * sL1;
		mixBuffer[i*8+10] += samples[i*2+2+0] * sL2;
		mixBuffer[i*8+11] += samples[i*2+2+0] * sL3;
		mixBuffer[i*8+12] += samples[i*2+2+0] * sL4;
		mixBuffer[i*8+13] += samples[i*2+2+1] * sL5;
		mixBuffer[i*8+14] += samples[i*2+2+0] * sL6;
		mixBuffer[i*8+15] += samples[i*2+2+1] * sL7;
		sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3;
		sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7;
	}

#endif
}
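/*
	For stereo into 7.1 the right input sample feeds channels 1, 5 and 7
	( SPEAKER_RIGHT, SPEAKER_BACKRIGHT and SPEAKER_SIDERIGHT, per the
	compile_time_asserts ) and the left input sample feeds the remaining five
	channels, as the reference (#else) path's indices show.
*/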
/*
============
idSIMD_SSE::MixedSoundToSamples
============
*/
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
#if 1

	assert_16_byte_aligned( samples );
	assert_16_byte_aligned( mixBuffer );
	assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );

	__asm {
		mov		eax, numSamples
		mov		edi, mixBuffer
		mov		esi, samples
		shl		eax, 2
		add		edi, eax
		neg		eax
	loop16:
		movaps	xmm0, [edi+eax+0*16]
		movaps	xmm2, [edi+eax+1*16]
		movaps	xmm4, [edi+eax+2*16]
		movaps	xmm6, [edi+eax+3*16]
		add		esi, 4*4*2
		movhlps	xmm1, xmm0
		movhlps	xmm3, xmm2
		movhlps	xmm5, xmm4
		movhlps	xmm7, xmm6
		prefetchnta	[edi+eax+64]
		cvtps2pi	mm0, xmm0
		cvtps2pi	mm2, xmm2
		cvtps2pi	mm4, xmm4
		cvtps2pi	mm6, xmm6
		prefetchnta	[edi+eax+128]
		cvtps2pi	mm1, xmm1
		cvtps2pi	mm3, xmm3
		cvtps2pi	mm5, xmm5
		cvtps2pi	mm7, xmm7
		add		eax, 4*16
		packssdw	mm0, mm1
		packssdw	mm2, mm3
		packssdw	mm4, mm5
		packssdw	mm6, mm7
		movq	[esi-4*4*2], mm0
		movq	[esi-3*4*2], mm2
		movq	[esi-2*4*2], mm4
		movq	[esi-1*4*2], mm6
		jl		loop16
		emms
	}

#else

	for ( int i = 0; i < numSamples; i++ ) {
		if ( mixBuffer[i] <= -32768.0f ) {
			samples[i] = -32768;
		} else if ( mixBuffer[i] >= 32767.0f ) {
			samples[i] = 32767;
		} else {
			samples[i] = (short) mixBuffer[i];
		}
	}

#endif
}
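/*
	The cvtps2pi / packssdw pair above clamps for free: cvtps2pi converts pairs of
	floats to signed 32 bit integers in the MMX registers ( hence the trailing emms ),
	and packssdw packs them down to 16 bits with signed saturation, pinning values
	outside [-32768, 32767] to the limits — the same clamping the reference (#else)
	path does explicitly. One subtle difference: cvtps2pi rounds to nearest under the
	default MXCSR mode, while the (short) cast in the reference path truncates toward
	zero, so the two paths can differ by one LSB.
*/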
#endif /* ID_WIN_X86_ASM */