// quadrilateralcowboy/idlib/math/Simd_SSE.cpp

/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
//===============================================================
//
//	SSE implementation of idSIMDProcessor				MrE
//
//===============================================================
#if defined(MACOS_X) && defined(__i386__)
#include <xmmintrin.h>
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)
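// These offsets assume the idDrawVert layout:
// idVec3 xyz; idVec2 st; idVec3 normal; idVec3 tangents[2]; byte color[4];
// which totals 60 bytes.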
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
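// R_SHUFFLEPS takes its selectors in "register order" (x picks the lowest
// element), i.e. R_SHUFFLEPS( 0, 1, 2, 3 ) == SHUFFLEPS( 3, 2, 1, 0 ) == 0xE4,
// the identity shuffle.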
/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName( void ) const {
return "MMX & SSE";
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
// 0, 1, 2
// 3, 4, 5
// 6, 7, 8
// 9, 10, 11
/*
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
*/
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // Declare 8 xmm registers.
int count_l4 = count; // count_l4 = eax
int count_l1 = count; // count_l1 = edx
char *constant_p = (char *)&constant; // constant_p = edi
char *src_p = (char *) src; // src_p = esi
char *dst_p = (char *) dst; // dst_p = ecx
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
/*
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
*/
count_l4 = count_l4 & ~3;
xmm4 = _mm_load_ss((float *) (constant_p));
xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm5 = _mm_load_ss((float *) (constant_p + 4));
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm6 = _mm_load_ss((float *) (constant_p + 8));
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm7 = _mm_load_ss((float *) (constant_p + 12));
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
/*
jz startVert1
*/
if(count_l4 != 0) {
/*
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
*/
count_l4 = count_l4 * DRAWVERT_SIZE;
src_p = src_p + count_l4;
count_l4 = -count_l4;
/*
loopVert4:
*/
do {
/*
movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
movaps xmm1, xmm0 // 3, X, 0, 1
*/
xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, X, X
xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 2, X, X, X
xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, 0, 1
xmm1 = xmm0; // 3, X, 0, 1
/*
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
*/
xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4, 5, 0, 1
xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )); // 2, X, 4, 5
/*
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
*/
xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, X, X
xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, 6, 7
xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )); // 0, 3, 6, 9
/*
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
*/
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 10, 11, 6, 7
xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )); // 1, 4, 7, 10
/*
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
*/
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 10, 11, 8, X
xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )); // 2, 5, 8, 11
/*
add ecx, 16
add eax, 4*DRAWVERT_SIZE
*/
dst_p = dst_p + 16;
count_l4 = count_l4 + 4*DRAWVERT_SIZE;
/*
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
*/
xmm0 = _mm_mul_ps(xmm0, xmm4);
xmm1 = _mm_mul_ps(xmm1, xmm5);
xmm2 = _mm_mul_ps(xmm2, xmm6);
xmm0 = _mm_add_ps(xmm0, xmm7);
xmm0 = _mm_add_ps(xmm0, xmm1);
xmm0 = _mm_add_ps(xmm0, xmm2);
/*
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
*/
_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
} while(count_l4 < 0);
}
/*
startVert1:
and edx, 3
jz done
*/
count_l1 = count_l1 & 3;
if(count_l1 != 0) {
/*
loopVert1:
movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
addss xmm0, xmm7
add ecx, 4
addss xmm0, xmm1
add eax, DRAWVERT_SIZE
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
*/
do {
xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
xmm0 = _mm_mul_ss(xmm0, xmm4);
xmm1 = _mm_mul_ss(xmm1, xmm5);
xmm2 = _mm_mul_ss(xmm2, xmm6);
xmm0 = _mm_add_ss(xmm0, xmm7);
dst_p = dst_p + 4;
xmm0 = _mm_add_ss(xmm0, xmm1);
count_l4 = count_l4 + DRAWVERT_SIZE;
xmm0 = _mm_add_ss(xmm0, xmm2);
count_l1 = count_l1 - 1;
_mm_store_ss((float *) (dst_p-4), xmm0);
} while( count_l1 != 0);
}
/*
done:
*/
}
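/*
A scalar sketch of what the routine above computes; the SSE path transposes
four idDrawVerts at a time so it can evaluate four dot products per
iteration:

	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant.Normal() * src[i].xyz + constant[3];
	}
*/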
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
char *indexes_p;
char *src_p;
int count_l;
int edx;
char *min_p;
char *max_p;
/*
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
*/
xmm0 = _mm_load_ss(&idMath::INFINITY);
// xorps xmm1, xmm1 would read an uninitialized variable as an intrinsic, so zero via xmm0 ^ xmm0 instead.
xmm1 = _mm_xor_ps(xmm0, xmm0);
xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm1 = _mm_sub_ps(xmm1, xmm0);
xmm2 = xmm0;
xmm3 = xmm1;
/*
mov edi, indexes
mov esi, src
mov eax, count
and eax, ~3
jz done4
*/
indexes_p = (char *) indexes;
src_p = (char *) src;
count_l = count;
count_l = count_l & ~3;
if(count_l != 0) {
/*
shl eax, 2
add edi, eax
neg eax
*/
count_l = count_l << 2;
indexes_p = indexes_p + count_l;
count_l = -count_l;
/*
loop4:
// prefetchnta [edi+128]
// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
*/
do {
/*
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
*/
edx = *((int*)(indexes_p+count_l+0));
edx = edx * DRAWVERT_SIZE;
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm4);
xmm1 = _mm_max_ps(xmm1, xmm4);
/*
mov edx, [edi+eax+4]
imul edx, DRAWVERT_SIZE
movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm5
maxps xmm3, xmm5
*/
edx = *((int*)(indexes_p+count_l+4));
edx = edx * DRAWVERT_SIZE;
xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
xmm2 = _mm_min_ps(xmm2, xmm5);
xmm3 = _mm_max_ps(xmm3, xmm5);
/*
mov edx, [edi+eax+8]
imul edx, DRAWVERT_SIZE
movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm6
maxps xmm1, xmm6
*/
edx = *((int*)(indexes_p+count_l+8));
edx = edx * DRAWVERT_SIZE;
xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm6);
xmm1 = _mm_max_ps(xmm1, xmm6);
/*
mov edx, [edi+eax+12]
imul edx, DRAWVERT_SIZE
movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm7
maxps xmm3, xmm7
*/
edx = *((int*)(indexes_p+count_l+12));
edx = edx * DRAWVERT_SIZE;
xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
xmm2 = _mm_min_ps(xmm2, xmm7);
xmm3 = _mm_max_ps(xmm3, xmm7);
/*
add eax, 4*4
jl loop4
*/
count_l = count_l + 4*4;
} while (count_l < 0);
}
/*
done4:
mov eax, count
and eax, 3
jz done1
*/
count_l = count;
count_l = count_l & 3;
if(count_l != 0) {
/*
shl eax, 2
add edi, eax
neg eax
*/
count_l = count_l << 2;
indexes_p = indexes_p + count_l;
count_l = -count_l;
/*
loop1:
*/
do{
/*
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE;
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
*/
edx = *((int*)(indexes_p+count_l+0));
edx = edx * DRAWVERT_SIZE;
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
xmm0 = _mm_min_ps(xmm0, xmm4);
xmm1 = _mm_max_ps(xmm1, xmm4);
/*
add eax, 4
jl loop1
*/
count_l = count_l + 4;
} while (count_l < 0);
}
/*
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
*/
xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
xmm0 = _mm_min_ps(xmm0, xmm2);
xmm1 = _mm_max_ps(xmm1, xmm3);
min_p = (char *) &min;
_mm_storeh_pi((__m64 *)(min_p), xmm0);
_mm_store_ss((float *)(min_p+8), xmm0);
max_p = (char *) &max;
_mm_storeh_pi((__m64 *)(max_p), xmm1);
_mm_store_ss((float *)(max_p+8), xmm1);
}
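/*
A scalar sketch of the routine above. The SSE path keeps two min/max
accumulator pairs (xmm0/xmm1 and xmm2/xmm3) whose lanes hold the vertex
components in different rotations, then merges them with the final shuffles:

	min.Set( idMath::INFINITY, idMath::INFINITY, idMath::INFINITY );
	max = -min;
	for ( int i = 0; i < count; i++ ) {
		const idVec3 &v = src[indexes[i]].xyz;
		if ( v.x < min.x ) { min.x = v.x; } if ( v.x > max.x ) { max.x = v.x; }
		// ...and likewise for y and z
	}
*/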
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
int count_l4;
int count_l1;
char *constant_p;
char *src_p;
char *dst_p;
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
/*
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
*/
count_l4 = count;
constant_p = (char *) &constant;
count_l1 = count_l4;
src_p = (char *) src;
dst_p = (char *) dst;
count_l4 = count_l4 & ~3;
/*
movss xmm5, [edi+0]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
*/
xmm5 = _mm_load_ss((float *) (constant_p+0));
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm6 = _mm_load_ss((float *) (constant_p+4));
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
xmm7 = _mm_load_ss((float *) (constant_p+8));
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
/*
jz startVert1
*/
if(count_l4 != 0) {
/*
imul eax, 16
add esi, eax
neg eax
*/
count_l4 = count_l4 * 16;
src_p = src_p + count_l4;
count_l4 = -count_l4;
/*
loopVert4:
*/
do {
/*
movlps xmm1, [esi+eax+ 0]
movlps xmm3, [esi+eax+ 8]
movhps xmm1, [esi+eax+16]
movhps xmm3, [esi+eax+24]
movlps xmm2, [esi+eax+32]
movlps xmm4, [esi+eax+40]
movhps xmm2, [esi+eax+48]
movhps xmm4, [esi+eax+56]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm2, xmm3
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
*/
xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
xmm0 = xmm1;
xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
xmm2 = xmm3;
xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
/*
add ecx, 16
add eax, 4*16
*/
dst_p = dst_p + 16;
count_l4 = count_l4 + 4*16;
/*
mulps xmm0, xmm5
mulps xmm1, xmm6
mulps xmm2, xmm7
addps xmm0, xmm3
addps xmm0, xmm1
addps xmm0, xmm2
*/
xmm0 = _mm_mul_ps(xmm0, xmm5);
xmm1 = _mm_mul_ps(xmm1, xmm6);
xmm2 = _mm_mul_ps(xmm2, xmm7);
xmm0 = _mm_add_ps(xmm0, xmm3);
xmm0 = _mm_add_ps(xmm0, xmm1);
xmm0 = _mm_add_ps(xmm0, xmm2);
/*
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
*/
_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
} while (count_l4 < 0);
}
/*
startVert1:
and edx, 3
jz done
*/
count_l1 = count_l1 & 3;
if(count_l1 != 0) {
/*
loopVert1:
*/
do {
/*
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm5
mulss xmm1, xmm6
mulss xmm2, xmm7
addss xmm0, [esi+eax+12]
add ecx, 4
addss xmm0, xmm1
add eax, 16
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
*/
xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
xmm0 = _mm_mul_ss(xmm0, xmm5);
xmm1 = _mm_mul_ss(xmm1, xmm6);
xmm2 = _mm_mul_ss(xmm2, xmm7);
xmm0 = _mm_add_ss(xmm0, xmm3);
dst_p = dst_p + 4;
xmm0 = _mm_add_ss(xmm0, xmm1);
count_l4 = count_l4 + 16;
xmm0 = _mm_add_ss(xmm0, xmm2);
count_l1 = count_l1 - 1;
_mm_store_ss((float *) (dst_p-4), xmm0);
} while (count_l1 != 0);
}
/*
done:
*/
}
#elif defined(_WIN32)
#include <xmmintrin.h>
#define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
#define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
// transpose a 4x4 matrix loaded into 4 xmm registers (reg4 is temporary)
#define TRANSPOSE_4x4( reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg2 /* reg4 = 8, 9, 10, 11 */ \
__asm unpcklps reg2, reg3 /* reg2 = 8, 12, 9, 13 */ \
__asm unpckhps reg4, reg3 /* reg4 = 10, 14, 11, 15 */ \
__asm movaps reg3, reg0 /* reg3 = 0, 1, 2, 3 */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 4, 1, 5 */ \
__asm unpckhps reg3, reg1 /* reg3 = 2, 6, 3, 7 */ \
__asm movaps reg1, reg0 /* reg1 = 0, 4, 1, 5 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg0 = 0, 4, 8, 12 */ \
__asm shufps reg1, reg2, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg1 = 1, 5, 9, 13 */ \
__asm movaps reg2, reg3 /* reg2 = 2, 6, 3, 7 */ \
__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 1, 0, 1 ) /* reg2 = 2, 6, 10, 14 */ \
__asm shufps reg3, reg4, R_SHUFFLEPS( 2, 3, 2, 3 ) /* reg3 = 3, 7, 11, 15 */
// transpose a 4x4 matrix from memory into 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_FROM_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
__asm movlps reg3, [address+ 8] /* reg3 = 2, 3, X, X */ \
__asm movhps reg1, [address+16] /* reg1 = 0, 1, 4, 5 */ \
__asm movhps reg3, [address+24] /* reg3 = 2, 3, 6, 7 */ \
__asm movlps reg2, [address+32] /* reg2 = 8, 9, X, X */ \
__asm movlps reg4, [address+40] /* reg4 = 10, 11, X, X */ \
__asm movhps reg2, [address+48] /* reg2 = 8, 9, 12, 13 */ \
__asm movhps reg4, [address+56] /* reg4 = 10, 11, 14, 15 */ \
__asm movaps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg0 = 0, 4, 8, 12 */ \
__asm shufps reg1, reg2, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg1 = 1, 5, 9, 13 */ \
__asm movaps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
__asm shufps reg2, reg4, R_SHUFFLEPS( 0, 2, 0, 2 ) /* reg2 = 2, 6, 10, 14 */ \
__asm shufps reg3, reg4, R_SHUFFLEPS( 1, 3, 1, 3 ) /* reg3 = 3, 7, 11, 15 */
// transpose a 4x4 matrix to memory from 4 xmm registers (reg4 is temporary)
#define TRANPOSE_4x4_TO_MEMORY( address, reg0, reg1, reg2, reg3, reg4 ) \
__asm movaps reg4, reg0 /* reg4 = 0, 4, 8, 12 */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 4, 5 */ \
__asm unpckhps reg4, reg1 /* reg4 = 8, 9, 12, 13 */ \
__asm movaps reg1, reg2 /* reg1 = 2, 6, 10, 14 */ \
__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 6, 7 */ \
__asm unpckhps reg1, reg3 /* reg1 = 10, 11, 14, 15 */ \
__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
__asm movhps [address+16], reg0 /* mem1 = 4, 5, X, X */ \
__asm movhps [address+24], reg2 /* mem1 = 4, 5, 6, 7 */ \
__asm movlps [address+32], reg4 /* mem2 = 8, 9, X, X */ \
__asm movlps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */ \
__asm movhps [address+48], reg4 /* mem3 = 12, 13, X, X */ \
__asm movhps [address+56], reg1 /* mem3 = 12, 13, 14, 15 */
// transpose a 4x3 matrix loaded into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3( reg0, reg1, reg2, reg3 ) \
__asm movaps reg3, reg2 /* reg3 = 8, 9, 10, 11 */ \
__asm shufps reg3, reg1, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg3 = 10, 11, 4, 5 */ \
__asm shufps reg2, reg0, R_SHUFFLEPS( 0, 1, 2, 3 ) /* reg2 = 8, 9, 2, 3 */ \
__asm shufps reg1, reg0, R_SHUFFLEPS( 2, 3, 0, 1 ) /* reg1 = 6, 7, 0, 1 */ \
__asm movaps reg0, reg1 /* reg0 = 6, 7, 0, 1 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg0 = 0, 6, 3, 9 */ \
__asm shufps reg1, reg3, R_SHUFFLEPS( 3, 1, 2, 0 ) /* reg1 = 1, 7, 4, 10 */ \
__asm shufps reg2, reg3, R_SHUFFLEPS( 2, 0, 3, 1 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix from memory into 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_FROM_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movlps reg1, [address+ 0] /* reg1 = 0, 1, X, X */ \
__asm movlps reg2, [address+ 8] /* reg2 = 2, 3, X, X */ \
__asm movlps reg3, [address+16] /* reg3 = 4, 5, X, X */ \
__asm movhps reg1, [address+24] /* reg1 = 0, 1, 6, 7 */ \
__asm movhps reg2, [address+32] /* reg2 = 2, 3, 8, 9 */ \
__asm movhps reg3, [address+40] /* reg3 = 4, 5, 10, 11 */ \
__asm movaps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
__asm shufps reg0, reg2, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg0 = 0, 6, 3, 9 */ \
__asm shufps reg1, reg3, R_SHUFFLEPS( 1, 3, 0, 2 ) /* reg1 = 1, 7, 4, 10 */ \
__asm shufps reg2, reg3, R_SHUFFLEPS( 0, 2, 1, 3 ) /* reg2 = 2, 8, 5, 11 */
// transpose a 4x3 matrix to memory from 3 xmm registers (reg3 is temporary)
#define TRANSPOSE_4x3_TO_MEMORY( address, reg0, reg1, reg2, reg3 ) \
__asm movhlps reg3, reg0 /* reg3 = 3, 9, X, X */ \
__asm unpcklps reg0, reg1 /* reg0 = 0, 1, 6, 7 */ \
__asm unpckhps reg1, reg2 /* reg1 = 4, 5, 10, 11 */ \
__asm unpcklps reg2, reg3 /* reg2 = 2, 3, 8, 9 */ \
__asm movlps [address+ 0], reg0 /* mem0 = 0, 1, X, X */ \
__asm movlps [address+ 8], reg2 /* mem0 = 0, 1, 2, 3 */ \
__asm movlps [address+16], reg1 /* mem1 = 4, 5, X, X */ \
__asm movhps [address+24], reg0 /* mem1 = 4, 5, 6, 7 */ \
__asm movhps [address+32], reg2 /* mem2 = 8, 9, X, X */ \
__asm movhps [address+40], reg1 /* mem2 = 8, 9, 10, 11 */
// with alignment
#define KFLOATINITS( SRC0, COUNT, PRE, POST ) KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD( DST, COUNT, PRE, POST ) KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )\
__asm mov ecx,DST \
__asm shr ecx,2 \
__asm mov ebx,COUNT \
__asm neg ecx \
__asm mov edx,SRC0 \
__asm and ecx,3 \
__asm mov esi,SRC1 \
__asm sub ebx,ecx \
__asm jge noUnderFlow \
__asm xor ebx,ebx \
__asm mov ecx,COUNT \
__asm noUnderFlow: \
__asm mov PRE,ecx \
__asm mov eax,ebx \
__asm mov edi,DST \
__asm and eax,8-1 \
__asm mov POST,eax \
__asm and ebx,0xfffffff8 \
__asm jle done \
__asm shl ebx,2 \
__asm lea ecx,[ecx*4+ebx] \
__asm neg ebx \
__asm add edx,ecx \
__asm add esi,ecx \
__asm add edi,ecx \
__asm mov eax,edx \
__asm or eax,esi
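// After KFLOATINITDSS: PRE holds the element count needed to bring DST up to
// 16-byte alignment, POST the leftover count after the unrolled 8-float main
// loop, ebx the negated byte index for that loop, and eax the OR of the
// source pointers so callers can test source alignment with "and eax,15".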
// without alignment (pre==0)
#define KFLOATINITS_NA( SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD_NA( DST, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS_NA( DST, SRC0, COUNT, PRE, POST ) KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )\
__asm mov eax,COUNT \
__asm mov PRE,0 \
__asm and eax,8-1 \
__asm mov ebx,COUNT \
__asm mov POST,eax \
__asm and ebx,0xfffffff8 \
__asm je done \
__asm shl ebx,2 \
__asm mov edx,SRC0 \
__asm mov esi,SRC1 \
__asm mov edi,DST \
__asm add edx,ebx \
__asm add esi,ebx \
__asm add edi,ebx \
__asm mov eax,edx \
__asm or eax,esi \
__asm or eax,edi \
__asm neg ebx \
/*
when OPER is called:
edx = s0
esi = s1
edi = d
ebx = index*4
xmm0 & xmm1 must not be trashed
*/
#define KMOVDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
__asm movss DST,xmm2
#define KMOVDS4( DST, SRC0 ) \
__asm movups xmm2,SRC0 \
__asm movups DST,xmm2
#define KMINDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
__asm minss DST,xmm2
#define KMAXDS1( DST, SRC0 ) \
__asm movss xmm2,SRC0 \
__asm maxss DST,xmm2
// general ALU operation
#define KALUDSS1( OP, DST, SRC0, SRC1 ) \
__asm movss xmm2,SRC0 \
__asm OP##ss xmm2,SRC1 \
__asm movss DST,xmm2
#define KALUDSS4( OP, DST, SRC0, SRC1 ) \
__asm movups xmm2,SRC0 \
__asm movups xmm3,SRC1 \
__asm OP##ps xmm2,xmm3 \
__asm movups DST,xmm2
#define KADDDSS1( DST, SRC0, SRC1 ) KALUDSS1( add, DST,SRC0,SRC1 )
#define KADDDSS4( DST, SRC0, SRC1 ) KALUDSS4( add, DST,SRC0,SRC1 )
#define KSUBDSS1( DST, SRC0, SRC1 ) KALUDSS1( sub, DST,SRC0,SRC1 )
#define KSUBDSS4( DST, SRC0, SRC1 ) KALUDSS4( sub, DST,SRC0,SRC1 )
#define KMULDSS1( DST, SRC0, SRC1 ) KALUDSS1( mul, DST,SRC0,SRC1 )
#define KMULDSS4( DST, SRC0, SRC1 ) KALUDSS4( mul, DST,SRC0,SRC1 )
#define KDIVDSS1( DST, SRC0, SRC1 ) \
__asm movss xmm2,SRC1 \
__asm rcpss xmm3,xmm2 \
__asm mulss xmm2,xmm3 \
__asm mulss xmm2,xmm3 \
__asm addss xmm3,xmm3 \
__asm subss xmm3,xmm2 \
__asm mulss xmm3,SRC0 \
__asm movss DST,xmm3
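// rcpss is only accurate to ~12 bits; the mul/mul/add/sub sequence above is
// one Newton-Raphson step, r' = 2*r - x*r*r, refining 1/x to near full
// single precision before the final multiply by SRC0.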
#define KDIVDSS4( DST, SRC0, SRC1 ) \
__asm movups xmm2,SRC1 \
__asm rcpps xmm3,xmm2 \
__asm mulps xmm2,xmm3 \
__asm mulps xmm2,xmm3 \
__asm addps xmm3,xmm3 \
__asm subps xmm3,xmm2 \
__asm movups xmm2,SRC0 \
__asm mulps xmm3,xmm2 \
__asm movups DST,xmm3
#define KF2IDS1( SRC0 ) \
__asm movss xmm2,SRC0 \
__asm cvttps2pi mm2,xmm2 \
__asm movd [edi+ebx],mm2
#define KF2IDS4( SRC0 ) \
__asm movups xmm2,SRC0 \
__asm cvttps2pi mm2,xmm2 \
__asm movq [edi+ebx+0],mm2 \
__asm shufps xmm2,xmm2,SHUFFLEPS(1,0,3,2) \
__asm cvttps2pi mm2,xmm2 \
__asm movq [edi+ebx+8],mm2
#define KISQRTDS1( DST,SRC0 ) \
__asm movss xmm2,SRC0 \
__asm rsqrtss xmm3,xmm2 \
__asm mulss xmm2,xmm3 \
__asm mulss xmm2,xmm3 \
__asm subss xmm2,xmm1 \
__asm mulss xmm3,xmm0 \
__asm mulss xmm3,xmm2 \
__asm movss DST,xmm3
#define KISQRTDS4( DST,SRC0 ) \
__asm movups xmm2,SRC0 \
__asm rsqrtps xmm3,xmm2 \
__asm mulps xmm2,xmm3 \
__asm mulps xmm2,xmm3 \
__asm subps xmm2,xmm1 \
__asm mulps xmm3,xmm0 \
__asm mulps xmm3,xmm2 \
__asm movups DST,xmm3
// this is used in vector4 implementation to shift constant V4
#define KANDREGDSV( DST, SRC0, VALUE ) \
__asm mov DST,SRC0 \
__asm and DST,VALUE
// this is used in vector4 code to operate with float arrays as sources
#define KEXPANDFLOAT( DST, SRC ) \
__asm movss DST,SRC \
__asm shufps DST,DST,0
#define KADDDS1( DST,SRC ) KADDDSS1( DST,DST,SRC )
#define KADDDS4( DST,SRC ) KADDDSS4( DST,DST,SRC )
#define KSUBDS1( DST,SRC ) KSUBDSS1( DST,DST,SRC )
#define KSUBDS4( DST,SRC ) KSUBDSS4( DST,DST,SRC )
#define KMULDS1( DST,SRC ) KMULDSS1( DST,DST,SRC )
#define KMULDS4( DST,SRC ) KMULDSS4( DST,DST,SRC )
#define KDIVDS1( DST,SRC ) KDIVDSS1( DST,DST,SRC )
#define KDIVDS4( DST,SRC ) KDIVDSS4( DST,DST,SRC )
// handles pre & post leftovers
#define KFLOATOPER( OPER, OPER4, COUNT ) \
__asm mov ecx,pre \
__asm mov ebx,COUNT \
__asm cmp ebx,ecx \
__asm cmovl ecx,COUNT \
__asm test ecx,ecx \
__asm je preDone \
__asm xor ebx,ebx \
__asm lpPre: \
OPER \
__asm add ebx,4 \
__asm dec ecx \
__asm jg lpPre \
__asm preDone: \
__asm mov ecx,post \
__asm mov ebx,COUNT \
__asm sub ebx,ecx \
__asm shl ebx,2 \
__asm cmp ecx,4 \
__asm jl post4Done \
OPER4 \
__asm sub ecx,4 \
__asm add ebx,4*4 \
__asm post4Done: \
__asm test ecx,ecx \
__asm je postDone \
__asm lpPost: \
OPER \
__asm add ebx,4 \
__asm dec ecx \
__asm jg lpPost \
__asm postDone:
// operate on a constant and a float array
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT ) \
int pre,post; \
__asm movss xmm0,CONSTANT \
__asm shufps xmm0,xmm0,0 \
KFLOATINITDS( DST, SRC, COUNT, pre, post ) \
__asm and eax,15 \
__asm jne lpNA \
__asm jmp lpA \
__asm align 16 \
__asm lpA: \
__asm prefetchnta [edx+ebx+64] \
__asm movaps xmm1,xmm0 \
__asm movaps xmm2,xmm0 \
__asm ALUOP##ps xmm1,[edx+ebx] \
__asm ALUOP##ps xmm2,[edx+ebx+16] \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpA \
__asm jmp done \
__asm align 16 \
__asm lpNA: \
__asm prefetchnta [edx+ebx+64] \
__asm movaps xmm1,xmm0 \
__asm movaps xmm2,xmm0 \
__asm movups xmm3,[edx+ebx] \
__asm movups xmm4,[edx+ebx+16] \
__asm ALUOP##ps xmm1,xmm3 \
__asm ALUOP##ps xmm2,xmm4 \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpNA \
__asm done: \
__asm mov edx,SRC \
__asm mov edi,DST \
KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), \
KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )
// operate on two float arrays
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT ) \
int pre,post; \
KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post ) \
__asm and eax,15 \
__asm jne lpNA \
__asm jmp lpA \
__asm align 16 \
__asm lpA: \
__asm movaps xmm1,[edx+ebx] \
__asm movaps xmm2,[edx+ebx+16] \
__asm ALUOP##ps xmm1,[esi+ebx] \
__asm ALUOP##ps xmm2,[esi+ebx+16] \
__asm prefetchnta [edx+ebx+64] \
__asm prefetchnta [esi+ebx+64] \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpA \
__asm jmp done \
__asm align 16 \
__asm lpNA: \
__asm movups xmm1,[edx+ebx] \
__asm movups xmm2,[edx+ebx+16] \
__asm movups xmm3,[esi+ebx] \
__asm movups xmm4,[esi+ebx+16] \
__asm prefetchnta [edx+ebx+64] \
__asm prefetchnta [esi+ebx+64] \
__asm ALUOP##ps xmm1,xmm3 \
__asm ALUOP##ps xmm2,xmm4 \
__asm movaps [edi+ebx],xmm1 \
__asm movaps [edi+ebx+16],xmm2 \
__asm add ebx,16*2 \
__asm jl lpNA \
__asm done: \
__asm mov edx,SRC0 \
__asm mov esi,SRC1 \
__asm mov edi,DST \
KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), \
KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
#define DRAWVERT_SIZE 60
#define DRAWVERT_XYZ_OFFSET (0*4)
#define DRAWVERT_ST_OFFSET (3*4)
#define DRAWVERT_NORMAL_OFFSET (5*4)
#define DRAWVERT_TANGENT0_OFFSET (8*4)
#define DRAWVERT_TANGENT1_OFFSET (11*4)
#define DRAWVERT_COLOR_OFFSET (14*4)
#define JOINTQUAT_SIZE (7*4)
#define JOINTMAT_SIZE (4*3*4)
#define JOINTWEIGHT_SIZE (4*4)
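// JOINTQUAT_SIZE covers an idJointQuat ( idQuat q; idVec3 t; ) = 7 floats;
// JOINTMAT_SIZE an idJointMat ( 3 rows of 4 floats ) = 48 bytes.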
#define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
#define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
ALIGN4_INIT1( unsigned long SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );
ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
ALIGN4_INIT1( unsigned long SIMD_SP_not, 0xFFFFFFFF );
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f );
ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );
ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f );
ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f );
ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );
ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f );
ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f );
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );
ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );
/*
============
SSE_InvSqrt
============
*/
float SSE_InvSqrt( float x ) {
float y;
__asm {
movss xmm0, x
rsqrtss xmm1, xmm0
mulss xmm0, xmm1
mulss xmm0, xmm1
subss xmm0, SIMD_SP_rsqrt_c0
mulss xmm1, SIMD_SP_rsqrt_c1
mulss xmm0, xmm1
movss y, xmm0
}
return y;
}
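// rsqrtss gives only a ~12-bit estimate; the code above applies one
// Newton-Raphson step, y' = y * 0.5f * ( 3.0f - x*y*y ), computed as
// ( x*y*y - 3.0f ) * ( y * -0.5f ), hence SIMD_SP_rsqrt_c0 = 3.0f and
// SIMD_SP_rsqrt_c1 = -0.5f.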
/*
============
SSE_InvSqrt4
============
*/
void SSE_InvSqrt4( float x[4] ) {
__asm {
mov edi, x
movaps xmm0, [edi]
rsqrtps xmm1, xmm0
mulps xmm0, xmm1
mulps xmm0, xmm1
subps xmm0, SIMD_SP_rsqrt_c0
mulps xmm1, SIMD_SP_rsqrt_c1
mulps xmm0, xmm1
movaps [edi], xmm0
}
}
/*
============
SSE_SinZeroHalfPI
The angle must be between zero and half PI.
============
*/
float SSE_SinZeroHalfPI( float a ) {
#if 1
float t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
__asm {
movss xmm0, a
movss xmm1, xmm0
mulss xmm1, xmm1
movss xmm2, SIMD_SP_sin_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
movss t, xmm2
}
return t;
#else
float s, t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
s = a * a;
t = -2.39e-08f;
t *= s;
t += 2.7526e-06f;
t *= s;
t += -1.98409e-04f;
t *= s;
t += 8.3333315e-03f;
t *= s;
t += -1.666666664e-01f;
t *= s;
t += 1.0f;
t *= a;
return t;
#endif
}
/*
============
SSE_Sin4ZeroHalfPI
The angle must be between zero and half PI.
============
*/
void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
__asm {
mov edi, a
mov esi, s
movaps xmm0, [edi]
movaps xmm1, xmm0
mulps xmm1, xmm1
movaps xmm2, SIMD_SP_sin_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
movaps [esi], xmm2
}
}
/*
============
SSE_Sin
============
*/
float SSE_Sin( float a ) {
#if 1
float t;
__asm {
movss xmm1, a
movss xmm2, xmm1
movss xmm3, xmm1
mulss xmm2, SIMD_SP_oneOverTwoPI
cvttss2si ecx, xmm2
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
cvtsi2ss xmm2, ecx
subss xmm2, xmm3
mulss xmm2, SIMD_SP_twoPI
subss xmm1, xmm2
movss xmm0, SIMD_SP_PI // xmm0 = PI
subss xmm0, xmm1 // xmm0 = PI - a
movss xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movss xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movss xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movss xmm1, xmm0
mulss xmm1, xmm1
movss xmm2, SIMD_SP_sin_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_sin_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
movss t, xmm2
}
return t;
#else
float s, t;
if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
}
a = idMath::PI - a;
if ( fabs( a ) >= idMath::HALF_PI ) {
a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
}
s = a * a;
t = -2.39e-08f;
t *= s;
t += 2.7526e-06f;
t *= s;
t += -1.98409e-04f;
t *= s;
t += 8.3333315e-03f;
t *= s;
t += -1.666666664e-01f;
t *= s;
t += 1.0f;
t *= a;
return t;
#endif
}
/*
============
SSE_Sin4
============
*/
void SSE_Sin4( float a[4], float s[4] ) {
__asm {
mov edi, a
mov esi, s
movaps xmm1, [edi]
movaps xmm2, xmm1
mulps xmm2, SIMD_SP_oneOverTwoPI
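// SSE1 has no full-width packed float-to-int truncation (cvttps2pi only
// handles two lanes via MMX), so each lane is truncated and converted back
// through cvttss2si/cvtsi2ss, rotating lanes with shufps in between.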
movhlps xmm3, xmm2
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
movaps xmm3, xmm1
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
subps xmm2, xmm3
mulps xmm2, SIMD_SP_twoPI
subps xmm1, xmm2
movaps xmm0, SIMD_SP_PI // xmm0 = PI
subps xmm0, xmm1 // xmm0 = PI - a
movaps xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movaps xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movaps xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movaps xmm1, xmm0
mulps xmm1, xmm1
movaps xmm2, SIMD_SP_sin_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_sin_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
movaps [esi], xmm2
}
}
/*
============
SSE_CosZeroHalfPI
The angle must be between zero and half PI.
============
*/
float SSE_CosZeroHalfPI( float a ) {
#if 1
float t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
__asm {
movss xmm0, a
mulss xmm0, xmm0
movss xmm1, SIMD_SP_cos_c0
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c1
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c2
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c3
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c4
mulss xmm1, xmm0
addss xmm1, SIMD_SP_one
movss t, xmm1
}
return t;
#else
float s, t;
assert( a >= 0.0f && a <= idMath::HALF_PI );
s = a * a;
t = -2.605e-07f;
t *= s;
t += 2.47609e-05f;
t *= s;
t += -1.3888397e-03f;
t *= s;
t += 4.16666418e-02f;
t *= s;
t += -4.999999963e-01f;
t *= s;
t += 1.0f;
return t;
#endif
}
/*
============
SSE_Cos4ZeroHalfPI
The angle must be between zero and half PI.
============
*/
void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
__asm {
mov edi, a
mov esi, c
movaps xmm0, [edi]
mulps xmm0, xmm0
movaps xmm1, SIMD_SP_cos_c0
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c1
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c2
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c3
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c4
mulps xmm1, xmm0
addps xmm1, SIMD_SP_one
movaps [esi], xmm1
}
}
/*
============
SSE_Cos
============
*/
float SSE_Cos( float a ) {
#if 1
float t;
__asm {
movss xmm1, a
movss xmm2, xmm1
movss xmm3, xmm1
mulss xmm2, SIMD_SP_oneOverTwoPI
cvttss2si ecx, xmm2
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
cvtsi2ss xmm2, ecx
subss xmm2, xmm3
mulss xmm2, SIMD_SP_twoPI
subss xmm1, xmm2
movss xmm0, SIMD_SP_PI // xmm0 = PI
subss xmm0, xmm1 // xmm0 = PI - a
movss xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movss xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movss xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
mulss xmm0, xmm0
movss xmm1, SIMD_SP_cos_c0
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c1
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c2
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c3
mulss xmm1, xmm0
addss xmm1, SIMD_SP_cos_c4
mulss xmm1, xmm0
addss xmm1, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask
xorps xmm1, xmm2
movss t, xmm1
}
return t;
#else
float d, s, t;
if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
}
a = idMath::PI - a;
if ( fabs( a ) >= idMath::HALF_PI ) {
a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
d = 1.0f;
} else {
d = -1.0f;
}
s = a * a;
t = -2.605e-07f;
t *= s;
t += 2.47609e-05f;
t *= s;
t += -1.3888397e-03f;
t *= s;
t += 4.16666418e-02f;
t *= s;
t += -4.999999963e-01f;
t *= s;
t += 1.0f;
t *= d;
return t;
#endif
}
/*
============
SSE_Cos4
============
*/
void SSE_Cos4( float a[4], float c[4] ) {
__asm {
mov edi, a
mov esi, c
movaps xmm1, [edi]
movaps xmm2, xmm1
mulps xmm2, SIMD_SP_oneOverTwoPI
movhlps xmm3, xmm2
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
movaps xmm3, xmm1
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
subps xmm2, xmm3
mulps xmm2, SIMD_SP_twoPI
subps xmm1, xmm2
movaps xmm0, SIMD_SP_PI // xmm0 = PI
subps xmm0, xmm1 // xmm0 = PI - a
movaps xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movaps xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movaps xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
mulps xmm0, xmm0
movaps xmm1, SIMD_SP_cos_c0
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c1
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c2
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c3
mulps xmm1, xmm0
addps xmm1, SIMD_SP_cos_c4
mulps xmm1, xmm0
addps xmm1, SIMD_SP_one
xorps xmm2, SIMD_SP_signBitMask
xorps xmm1, xmm2
movaps [esi], xmm1
}
}
/*
============
SSE_SinCos
============
*/
void SSE_SinCos( float a, float &s, float &c ) {
__asm {
mov edi, s
mov esi, c
movss xmm1, a
movss xmm2, xmm1
movss xmm3, xmm1
mulss xmm2, SIMD_SP_oneOverTwoPI
cvttss2si ecx, xmm2
cmpltss xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
cvtsi2ss xmm2, ecx
subss xmm2, xmm3
mulss xmm2, SIMD_SP_twoPI
subss xmm1, xmm2
movss xmm0, SIMD_SP_PI // xmm0 = PI
subss xmm0, xmm1 // xmm0 = PI - a
movss xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movss xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movss xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movss xmm1, xmm0
mulss xmm1, xmm1
movss xmm3, SIMD_SP_sin_c0
movss xmm4, SIMD_SP_cos_c0
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c1
addss xmm4, SIMD_SP_cos_c1
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c2
addss xmm4, SIMD_SP_cos_c2
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c3
addss xmm4, SIMD_SP_cos_c3
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_sin_c4
addss xmm4, SIMD_SP_cos_c4
mulss xmm3, xmm1
mulss xmm4, xmm1
addss xmm3, SIMD_SP_one
addss xmm4, SIMD_SP_one
mulss xmm3, xmm0
xorps xmm2, SIMD_SP_signBitMask
xorps xmm4, xmm2
movss [edi], xmm3
movss [esi], xmm4
}
}
/*
============
SSE_SinCos4
============
*/
void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
__asm {
mov eax, a
mov edi, s
mov esi, c
movaps xmm1, [eax]
movaps xmm2, xmm1
mulps xmm2, SIMD_SP_oneOverTwoPI
movhlps xmm3, xmm2
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
cvttss2si ecx, xmm2
cvtsi2ss xmm2, ecx
cvttss2si edx, xmm3
cvtsi2ss xmm3, edx
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
movaps xmm3, xmm1
cmpltps xmm3, SIMD_SP_zero
andps xmm3, SIMD_SP_one
subps xmm2, xmm3
mulps xmm2, SIMD_SP_twoPI
subps xmm1, xmm2
movaps xmm0, SIMD_SP_PI // xmm0 = PI
subps xmm0, xmm1 // xmm0 = PI - a
movaps xmm1, xmm0 // xmm1 = PI - a
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
movaps xmm2, xmm0 // xmm2 = PI - a
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
movaps xmm3, SIMD_SP_PI // xmm3 = PI
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
xorps xmm0, xmm2
addps xmm0, xmm3
movaps xmm1, xmm0
mulps xmm1, xmm1
movaps xmm3, SIMD_SP_sin_c0
movaps xmm4, SIMD_SP_cos_c0
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c1
addps xmm4, SIMD_SP_cos_c1
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c2
addps xmm4, SIMD_SP_cos_c2
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c3
addps xmm4, SIMD_SP_cos_c3
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_sin_c4
addps xmm4, SIMD_SP_cos_c4
mulps xmm3, xmm1
mulps xmm4, xmm1
addps xmm3, SIMD_SP_one
addps xmm4, SIMD_SP_one
mulps xmm3, xmm0
xorps xmm2, SIMD_SP_signBitMask
xorps xmm4, xmm2
movaps [edi], xmm3
movaps [esi], xmm4
}
}
/*
============
SSE_ATanPositive
Both 'x' and 'y' must be positive.
============
*/
float SSE_ATanPositive( float y, float x ) {
#if 1
float t;
assert( y >= 0.0f && x >= 0.0f );
__asm {
movss xmm0, x
movss xmm3, xmm0
movss xmm1, y
minss xmm0, xmm1
maxss xmm1, xmm3
cmpeqss xmm3, xmm0
rcpss xmm2, xmm1
mulss xmm1, xmm2
mulss xmm1, xmm2
addss xmm2, xmm2
subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulss xmm0, xmm2 // xmm0 = x / y or y / x
movss xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
movss xmm1, xmm0
mulss xmm1, xmm1 // xmm1 = s
movss xmm2, SIMD_SP_atan_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c5
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c6
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c7
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
addss xmm2, xmm3
movss t, xmm2
}
return t;
#else
float a, d, s, t;
assert( y >= 0.0f && x >= 0.0f );
if ( y > x ) {
a = -x / y;
d = idMath::HALF_PI;
} else {
a = y / x;
d = 0.0f;
}
s = a * a;
t = 0.0028662257f;
t *= s;
t += -0.0161657367f;
t *= s;
t += 0.0429096138f;
t *= s;
t += -0.0752896400f;
t *= s;
t += 0.1065626393f;
t *= s;
t += -0.1420889944f;
t *= s;
t += 0.1999355085f;
t *= s;
t += -0.3333314528f;
t *= s;
t += 1.0f;
t *= a;
t += d;
return t;
#endif
}
/*
============
SSE_ATan4Positive
Both 'x' and 'y' must be positive.
============
*/
void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
__asm {
mov esi, x
mov edi, y
mov edx, at
movaps xmm0, [esi]
movaps xmm3, xmm0
movaps xmm1, [edi]
minps xmm0, xmm1
maxps xmm1, xmm3
cmpeqps xmm3, xmm0
rcpps xmm2, xmm1
mulps xmm1, xmm2
mulps xmm1, xmm2
addps xmm2, xmm2
subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulps xmm0, xmm2 // xmm0 = x / y or y / x
movaps xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
movaps xmm1, xmm0
mulps xmm1, xmm1 // xmm1 = s
movaps xmm2, SIMD_SP_atan_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c5
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c6
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c7
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
addps xmm2, xmm3
movaps [edx], xmm2
}
}
/*
============
SSE_ATan
============
*/
float SSE_ATan( float y, float x ) {
#if 1
float t;
__asm {
movss xmm0, x
movss xmm3, xmm0
movss xmm4, xmm0
andps xmm0, SIMD_SP_absMask
movss xmm1, y
xorps xmm4, xmm1
andps xmm1, SIMD_SP_absMask
andps xmm4, SIMD_SP_signBitMask
minss xmm0, xmm1
maxss xmm1, xmm3
cmpeqss xmm3, xmm0
rcpss xmm2, xmm1
mulss xmm1, xmm2
mulss xmm1, xmm2
addss xmm2, xmm2
subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulss xmm0, xmm2 // xmm0 = x / y or y / x
xorps xmm0, xmm4
movss xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
movss xmm1, xmm0
mulss xmm1, xmm1 // xmm1 = s
movss xmm2, SIMD_SP_atan_c0
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c1
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c2
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c3
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c4
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c5
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c6
mulss xmm2, xmm1
addss xmm2, SIMD_SP_atan_c7
mulss xmm2, xmm1
addss xmm2, SIMD_SP_one
mulss xmm2, xmm0
addss xmm2, xmm3
movss t, xmm2
}
return t;
#else
float a, d, s, t;
if ( fabs( y ) > fabs( x ) ) {
a = -x / y;
d = idMath::HALF_PI;
*((unsigned long *)&d) ^= ( *((unsigned long *)&x) ^ *((unsigned long *)&y) ) & (1<<31);
} else {
a = y / x;
d = 0.0f;
}
s = a * a;
t = 0.0028662257f;
t *= s;
t += -0.0161657367f;
t *= s;
t += 0.0429096138f;
t *= s;
t += -0.0752896400f;
t *= s;
t += 0.1065626393f;
t *= s;
t += -0.1420889944f;
t *= s;
t += 0.1999355085f;
t *= s;
t += -0.3333314528f;
t *= s;
t += 1.0f;
t *= a;
t += d;
return t;
#endif
}
/*
============
SSE_ATan4
============
*/
void SSE_ATan4( float y[4], float x[4], float at[4] ) {
__asm {
mov esi, x
mov edi, y
mov edx, at
movaps xmm0, [esi]
movaps xmm3, xmm0
movaps xmm4, xmm0
andps xmm0, SIMD_SP_absMask
movaps xmm1, [edi]
xorps xmm4, xmm1
andps xmm1, SIMD_SP_absMask
andps xmm4, SIMD_SP_signBitMask
minps xmm0, xmm1
maxps xmm1, xmm3
cmpeqps xmm3, xmm0
rcpps xmm2, xmm1
mulps xmm1, xmm2
mulps xmm1, xmm2
addps xmm2, xmm2
subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
mulps xmm0, xmm2 // xmm0 = x / y or y / x
xorps xmm0, xmm4
movaps xmm1, xmm3
andps xmm1, SIMD_SP_signBitMask
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
movaps xmm1, xmm0
mulps xmm1, xmm1 // xmm1 = s
movaps xmm2, SIMD_SP_atan_c0
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c1
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c2
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c3
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c4
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c5
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c6
mulps xmm2, xmm1
addps xmm2, SIMD_SP_atan_c7
mulps xmm2, xmm1
addps xmm2, SIMD_SP_one
mulps xmm2, xmm0
addps xmm2, xmm3
movaps [edx], xmm2
}
}
/*
============
SSE_TestTrigonometry
============
*/
void SSE_TestTrigonometry( void ) {
int i;
float a, s1, s2, c1, c2;
for ( i = 0; i < 100; i++ ) {
a = i * idMath::HALF_PI / 100.0f;
s1 = sin( a );
s2 = SSE_SinZeroHalfPI( a );
if ( fabs( s1 - s2 ) > 1e-7f ) {
assert( 0 );
}
c1 = cos( a );
c2 = SSE_CosZeroHalfPI( a );
if ( fabs( c1 - c2 ) > 1e-7f ) {
assert( 0 );
}
}
for ( i = -200; i < 200; i++ ) {
a = i * idMath::TWO_PI / 100.0f;
s1 = sin( a );
s2 = SSE_Sin( a );
if ( fabs( s1 - s2 ) > 1e-6f ) {
assert( 0 );
}
c1 = cos( a );
c2 = SSE_Cos( a );
if ( fabs( c1 - c2 ) > 1e-6f ) {
assert( 0 );
}
SSE_SinCos( a, s2, c2 );
if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
assert( 0 );
}
}
}
/*
============
idSIMD_SSE::GetName
============
*/
const char * idSIMD_SSE::GetName( void ) const {
return "MMX & SSE";
}
/*
============
idSIMD_SSE::Add
dst[i] = constant + src[i];
============
*/
void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( add, dst, src, constant, count )
}
/*
============
idSIMD_SSE::Add
dst[i] = src0[i] + src1[i];
============
*/
void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) {
KFLOAT_AA( add, dst, src0, src1, count )
}
/*
============
idSIMD_SSE::Sub
dst[i] = constant - src[i];
============
*/
void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( sub, dst, src, constant, count )
}
/*
============
idSIMD_SSE::Sub
dst[i] = src0[i] - src1[i];
============
*/
void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) {
KFLOAT_AA( sub, dst, src0, src1, count )
}
/*
============
idSIMD_SSE::Mul
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
KFLOAT_CA( mul, dst, src, constant, count )
}
/*
============
idSIMD_SSE::Mul
dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) {
KFLOAT_AA( mul, dst, src0, src1, count )
}
/*
============
idSIMD_SSE::Div
dst[i] = constant / src[i];
============
*/
void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
int pre, post;
// 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
__asm
{
movss xmm1,constant
shufps xmm1,xmm1,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [edx+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
mulps xmm4,xmm1
mulps xmm5,xmm1
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [edx+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
mulps xmm4,xmm1
mulps xmm5,xmm1
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpNA
done:
mov edx,src
mov edi,dst
KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
}
}
/*
============
idSIMD_SSE::Div
dst[i] = src0[i] / src1[i];
============
*/
void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
int pre,post;
// 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
__asm
{
KFLOATINITDSS( dst, src0, src1, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[esi+ebx]
movaps xmm3,[esi+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [esi+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
mulps xmm4,[edx+ebx]
mulps xmm5,[edx+ebx+16]
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[esi+ebx]
movups xmm3,[esi+ebx+16]
rcpps xmm4,xmm2
rcpps xmm5,xmm3
prefetchnta [esi+ebx+64]
mulps xmm2,xmm4
mulps xmm2,xmm4
mulps xmm3,xmm5
mulps xmm3,xmm5
addps xmm4,xmm4
addps xmm5,xmm5
subps xmm4,xmm2
subps xmm5,xmm3
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
mulps xmm4,xmm2
mulps xmm5,xmm3
movaps [edi+ebx],xmm4
movaps [edi+ebx+16],xmm5
add ebx,16*2
jl lpNA
done:
mov edx,src0
mov esi,src1
mov edi,dst
KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
}
}
/*
============
Simd_MulAdd
assumes count >= 7
============
*/
static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
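// Strategy: handle the elements before the first aligned boundary and any
// tail elements with x87 (fld/fmul/fadd), and run the aligned bulk of the
// array through movaps (16-byte aligned) or movlps/movhps (8-byte) SSE loops.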
__asm mov esi, dst
__asm mov edi, src
__asm mov eax, count
__asm shl eax, 2
__asm mov ecx, esi
__asm mov edx, eax
__asm or ecx, edi
__asm fld constant
__asm and ecx, 15
__asm jz SimdMulAdd16
__asm and ecx, 3
__asm jnz SimdMulAdd8
__asm mov ecx, esi
__asm xor ecx, edi
__asm and ecx, 15
__asm jnz MulAdd8
__asm mov ecx, esi
__asm and ecx, 15
__asm neg ecx
__asm add ecx, 16
__asm sub eax, ecx
__asm add edi, ecx
__asm add esi, ecx
__asm neg ecx
__asm mov edx, eax
__asm loopPreMulAdd16:
__asm fld st
__asm fmul dword ptr [edi+ecx]
__asm fadd dword ptr [esi+ecx]
__asm fstp dword ptr [esi+ecx]
__asm add ecx, 4
__asm jl loopPreMulAdd16
__asm SimdMulAdd16:
__asm and eax, ~15
__asm movss xmm1, constant
__asm shufps xmm1, xmm1, 0x00
__asm add esi, eax
__asm add edi, eax
__asm neg eax
__asm align 16
__asm loopMulAdd16:
__asm movaps xmm0, [edi+eax]
__asm mulps xmm0, xmm1
__asm addps xmm0, [esi+eax]
__asm movaps [esi+eax], xmm0
__asm add eax, 16
__asm jl loopMulAdd16
__asm jmp postMulAdd
__asm MulAdd8:
__asm mov ecx, esi
__asm and ecx, 7
__asm jz SimdMulAdd8
__asm sub eax, ecx
__asm add esi, ecx
__asm add edi, ecx
__asm neg ecx
__asm mov edx, eax
__asm loopPreMulAdd8:
__asm fld st
__asm fmul dword ptr [edi+ecx]
__asm fadd dword ptr [esi+ecx]
__asm fstp dword ptr [esi+ecx]
__asm add ecx, 4
__asm jl loopPreMulAdd8
__asm SimdMulAdd8:
__asm and eax, ~15
__asm movss xmm1, constant
__asm shufps xmm1, xmm1, 0x00
__asm add esi, eax
__asm add edi, eax
__asm neg eax
__asm align 16
__asm loopMulAdd8:
__asm movlps xmm0, [edi+eax]
__asm movhps xmm0, [edi+eax+8]
__asm mulps xmm0, xmm1
__asm movlps xmm2, [esi+eax]
__asm movhps xmm2, [esi+eax+8]
__asm addps xmm0, xmm2
__asm movlps [esi+eax], xmm0
__asm movhps [esi+eax+8], xmm0
__asm add eax, 16
__asm jl loopMulAdd8
__asm jmp postMulAdd
__asm postMulAdd:
__asm and edx, 15
__asm jz MulAddDone
__asm add esi, edx
__asm add edi, edx
__asm neg edx
__asm loopPostMulAdd:
__asm fld st
__asm fmul dword ptr [edi+edx]
__asm fadd dword ptr [esi+edx]
__asm fstp dword ptr [esi+edx]
__asm add edx, 4
__asm jl loopPostMulAdd
__asm MulAddDone:
__asm fstp st
}
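/*
	Simd_MulAdd dispatches on the combined alignment of dst and src: when both
	share the same 16 byte phase an x87 head loop brings them onto a boundary
	and the main loop uses movaps, otherwise it falls back to unaligned-safe
	movlps/movhps pairs, with x87 loops mopping up the head and tail. A sketch
	of the head computation (illustrative helper, not used by the engine):
*/
static int Simd_RefBytesToAlign16( const void *ptr ) {
	// bytes needed to advance a misaligned ptr to the next 16 byte boundary,
	// equivalent to the "and ecx, 15 / neg ecx / add ecx, 16" sequence above
	return ( 16 - ( ((int)ptr) & 15 ) ) & 15;
}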
#define MULADD_FEW( OPER ) \
switch( count ) { \
case 0: \
return; \
case 1: \
dst[0] OPER c * src[0]; \
return; \
case 2: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; \
return; \
case 3: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; \
return; \
case 4: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
return; \
case 5: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; \
return; \
case 6: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; \
return; \
case 7: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; \
return; \
case 8: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
return; \
case 9: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; \
return; \
case 10: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; \
return; \
case 11: \
dst[0] OPER c * src[0]; dst[1] OPER c * src[1]; dst[2] OPER c * src[2]; dst[3] OPER c * src[3]; \
dst[4] OPER c * src[4]; dst[5] OPER c * src[5]; dst[6] OPER c * src[6]; dst[7] OPER c * src[7]; \
dst[8] OPER c * src[8]; dst[9] OPER c * src[9]; dst[10] OPER c * src[10]; \
return; \
}
/*
============
idSIMD_SSE::MulAdd
dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
float c = constant;
MULADD_FEW( += )
Simd_MulAdd( dst, constant, src, count );
}
/*
============
idSIMD_SSE::MulAdd
dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
for ( int i = 0; i < count; i++ ) {
dst[i] += src0[i] * src1[i];
}
}
/*
============
idSIMD_SSE::MulSub
dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
float c = constant;
MULADD_FEW( -= )
Simd_MulAdd( dst, -constant, src, count );
}
/*
============
idSIMD_SSE::MulSub
dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
for ( int i = 0; i < count; i++ ) {
dst[i] -= src0[i] * src1[i];
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
__asm
{
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
jz done4
imul eax, 12
add esi, eax
neg eax
loop4:
movlps xmm1, [esi+eax+ 0]
movlps xmm2, [esi+eax+ 8]
movlps xmm3, [esi+eax+16]
movhps xmm1, [esi+eax+24]
movhps xmm2, [esi+eax+32]
movhps xmm3, [esi+eax+40]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
add ecx, 16
add eax, 4*12
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loop4
done4:
and edx, 3
jz done1
loop1:
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
add ecx, 4
addss xmm0, xmm1
add eax, 12
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loop1
done1:
}
}
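/*
	The loop4 body above transposes four packed idVec3 (array-of-structures)
	into three SoA registers, ( x0 x1 x2 x3 ), ( y0 y1 y2 y3 ) and
	( z0 z1 z2 z3 ), so a single mulps/addps chain yields four dot products
	per iteration. Scalar reference for one such iteration (illustrative
	helper only, not used by the engine):
*/
static void Simd_RefDot4Vec3( float dst4[4], const idVec3 &c, const idVec3 *v ) {
	for ( int i = 0; i < 4; i++ ) {
		dst4[i] = c.x * v[i].x + c.y * v[i].y + c.z * v[i].z;
	}
}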
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
__asm {
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm5, [edi+0]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
jz startVert1
imul eax, 16
add esi, eax
neg eax
loopVert4:
movlps xmm1, [esi+eax+ 0]
movlps xmm3, [esi+eax+ 8]
movhps xmm1, [esi+eax+16]
movhps xmm3, [esi+eax+24]
movlps xmm2, [esi+eax+32]
movlps xmm4, [esi+eax+40]
movhps xmm2, [esi+eax+48]
movhps xmm4, [esi+eax+56]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm2, xmm3
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
add ecx, 16
add eax, 4*16
mulps xmm0, xmm5
mulps xmm1, xmm6
mulps xmm2, xmm7
addps xmm0, xmm3
addps xmm0, xmm1
addps xmm0, xmm2
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
startVert1:
and edx, 3
jz done
loopVert1:
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm5
mulss xmm1, xmm6
mulss xmm2, xmm7
addss xmm0, [esi+eax+12]
add ecx, 4
addss xmm0, xmm1
add eax, 16
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
done:
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
// 0, 1, 2
// 3, 4, 5
// 6, 7, 8
// 9, 10, 11
__asm {
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
jz startVert1
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert4:
movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
movaps xmm1, xmm0 // 3, X, 0, 1
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
add ecx, 16
add eax, 4*DRAWVERT_SIZE
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
startVert1:
and edx, 3
jz done
loopVert1:
movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
add ecx, 4
addss xmm0, xmm1
add eax, DRAWVERT_SIZE
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
done:
}
}
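/*
	Unlike the idVec3 array case, idDrawVert xyz triples sit DRAWVERT_SIZE
	bytes apart, so the loopVert4 body above first gathers twelve strided
	floats with movss/movhps before transposing them. The per-vertex
	addressing it uses, written out in C (illustrative helper only):
*/
static const float *Simd_RefDrawVertXyz( const idDrawVert *base, int i ) {
	// address of vertex i's xyz triple, matching the
	// [esi+eax+i*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET] operands above
	return (const float *)( (const char *)base + i * DRAWVERT_SIZE + DRAWVERT_XYZ_OFFSET );
}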
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i] + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
__asm
{
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
jz done4
imul eax, 12
add esi, eax
neg eax
loop4:
movlps xmm1, [esi+eax+ 0]
movlps xmm2, [esi+eax+ 8]
movlps xmm3, [esi+eax+16]
movhps xmm1, [esi+eax+24]
movhps xmm2, [esi+eax+32]
movhps xmm3, [esi+eax+40]
movaps xmm0, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
add ecx, 16
add eax, 4*12
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loop4
done4:
and edx, 3
jz done1
loop1:
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm2, [esi+eax+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
addss xmm0, xmm7
add ecx, 4
addss xmm0, xmm1
add eax, 12
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loop1
done1:
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
#define SINGLE_OP(SRC, DEST) \
__asm movlps xmm0,[SRC] \
__asm movlps xmm1,[SRC+8] \
__asm mulps xmm0,xmm4 \
__asm mulps xmm1,xmm5 \
__asm addps xmm0,xmm1 \
__asm movaps xmm1,xmm0 \
__asm shufps xmm1,xmm1,SHUFFLEPS(1,1,1,1) \
__asm addss xmm0,xmm1 \
__asm movss [DEST],xmm0 \
__asm add SRC,16 \
__asm add DEST,4
#define DUAL_OP(SRC, DEST) \
__asm movlps xmm0,[SRC] \
__asm movlps xmm1,[SRC+8] \
__asm movhps xmm0,[SRC+16] \
__asm movhps xmm1,[SRC+24] \
__asm mulps xmm0,xmm4 \
__asm mulps xmm1,xmm5 \
__asm addps xmm0,xmm1 \
__asm shufps xmm1,xmm0,SHUFFLEPS(2,0,1,0) \
__asm shufps xmm0,xmm0,SHUFFLEPS(3,1,2,0) \
__asm addps xmm0,xmm1 \
__asm movhps [DEST],xmm0 \
__asm add SRC,32 \
__asm add DEST,8
__asm {
mov edx, dst
mov eax, src
mov ebx, constant
mov ecx, count
movlps xmm4, [ebx]
shufps xmm4, xmm4, SHUFFLEPS(1,0,1,0)
movlps xmm5, [ebx+8]
shufps xmm5, xmm5, SHUFFLEPS(1,0,1,0)
xorps xmm0, xmm0
xorps xmm1, xmm1
_lpAlignDest:
test edx, 0x0f
jz _destAligned
SINGLE_OP(eax,edx)
dec ecx
jnz _lpAlignDest
jmp _vpExit
_destAligned:
push ecx
cmp ecx, 4
jl _post
and ecx, ~3
shl ecx, 2
lea eax, [eax+ecx*4]
add edx, ecx
neg ecx
movlps xmm0, [eax+ecx*4]
movhps xmm0, [eax+ecx*4+16]
movlps xmm2, [eax+ecx*4+32]
movhps xmm2, [eax+ecx*4+48]
jmp _lpStart
align 16
_lp:
prefetchnta [eax+ecx*4+128]
addps xmm1, xmm0
movlps xmm0, [eax+ecx*4]
movhps xmm0, [eax+ecx*4+16]
movlps xmm2, [eax+ecx*4+32]
movhps xmm2, [eax+ecx*4+48]
movaps [edx+ecx-16],xmm1
_lpStart:
movlps xmm1, [eax+ecx*4+8]
movhps xmm1, [eax+ecx*4+24]
movlps xmm3, [eax+ecx*4+40]
movhps xmm3, [eax+ecx*4+56]
add ecx, 16
mulps xmm1, xmm5
mulps xmm2, xmm4
mulps xmm3, xmm5
addps xmm2, xmm3 // y3+w3 x3+z3 y2+w2 x2+z2
mulps xmm0, xmm4
addps xmm0, xmm1 // y1+w1 x1+z1 y0+w0 x0+z0
movaps xmm1, xmm0
shufps xmm0, xmm2, SHUFFLEPS(2,0,2,0) // x3+z3 x2+z2 x1+z1 x0+z0
shufps xmm1, xmm2, SHUFFLEPS(3,1,3,1) // y3+w3 y2+w2 y1+w1 y0+w0
js _lp
addps xmm1, xmm0
movaps [edx+ecx-16], xmm1
_post:
pop ecx
and ecx, 0x3
cmp ecx, 2
jl _post1
DUAL_OP(eax,edx)
sub ecx, 2
_post1:
cmp ecx, 1
jne _vpExit
SINGLE_OP(eax,edx)
_vpExit:
}
#undef DUAL_OP
#undef SINGLE_OP
}
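/*
	The plane/plane version never fully transposes: xmm4 holds ( a b a b ) and
	xmm5 holds ( c d c d ), so each mulps/addps pair produces per-plane lane
	sums ( x*a + z*c ) and ( y*b + w*d ), and one final shuffle+add folds the
	two lanes into the full four-term dot. Scalar reference for a single plane
	(illustrative helper only):
*/
static float Simd_RefPlanePlaneDot( const idPlane &p0, const idPlane &p1 ) {
	// same pairing as SINGLE_OP/DUAL_OP above
	return ( p0[0] * p1[0] + p0[2] * p1[2] ) + ( p0[1] * p1[1] + p0[3] * p1[3] );
}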
/*
============
idSIMD_SSE::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
// 0, 1, 2
// 3, 4, 5
// 6, 7, 8
// 9, 10, 11
__asm {
mov eax, count
mov edi, constant
mov edx, eax
mov esi, src
mov ecx, dst
and eax, ~3
movss xmm4, [edi+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [edi+4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [edi+8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [edi+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
jz startVert1
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert4:
movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
movaps xmm1, xmm0 // 3, X, 0, 1
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
add ecx, 16
add eax, 4*DRAWVERT_SIZE
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
addps xmm0, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
movlps [ecx-16+0], xmm0
movhps [ecx-16+8], xmm0
jl loopVert4
startVert1:
and edx, 3
jz done
loopVert1:
movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulss xmm0, xmm4
mulss xmm1, xmm5
mulss xmm2, xmm6
addss xmm0, xmm7
add ecx, 4
addss xmm0, xmm1
add eax, DRAWVERT_SIZE
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loopVert1
done:
}
}
/*
============
idSIMD_SSE::Dot
dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
__asm
{
mov eax, count
mov edi, src0
mov edx, eax
mov esi, src1
mov ecx, dst
and eax, ~3
jz done4
imul eax, 12
add edi, eax
add esi, eax
neg eax
loop4:
movlps xmm0, [esi+eax] // 0, 1, X, X
movlps xmm3, [edi+eax] // 0, 1, X, X
movlps xmm1, [esi+eax+8] // 2, 3, X, X
movlps xmm4, [edi+eax+8] // 2, 3, X, X
movhps xmm0, [esi+eax+24] // 0, 1, 6, 7
movhps xmm3, [edi+eax+24] // 0, 1, 6, 7
movhps xmm1, [esi+eax+32] // 2, 3, 8, 9
movhps xmm4, [edi+eax+32] // 2, 3, 8, 9
movlps xmm2, [esi+eax+16] // 4, 5, X, X
movlps xmm5, [edi+eax+16] // 4, 5, X, X
movhps xmm2, [esi+eax+40] // 4, 5, 10, 11
movhps xmm5, [edi+eax+40] // 4, 5, 10, 11
add ecx, 16
add eax, 48
mulps xmm0, xmm3
mulps xmm1, xmm4
mulps xmm2, xmm5
movaps xmm7, xmm0
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 ) // 0, 6, 3, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 ) // 1, 7, 4, 10
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 ) // 2, 8, 5, 11
addps xmm7, xmm0
addps xmm7, xmm1
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )
movlps [ecx-16+0], xmm7
movhps [ecx-16+8], xmm7
jl loop4
done4:
and edx, 3
jz done1
loop1:
movss xmm0, [esi+eax+0]
movss xmm3, [edi+eax+0]
movss xmm1, [esi+eax+4]
movss xmm4, [edi+eax+4]
movss xmm2, [esi+eax+8]
movss xmm5, [edi+eax+8]
mulss xmm0, xmm3
mulss xmm1, xmm4
mulss xmm2, xmm5
add ecx, 4
addss xmm0, xmm1
add eax, 12
addss xmm0, xmm2
dec edx
movss [ecx-4], xmm0
jnz loop1
done1:
}
}
/*
============
idSIMD_SSE::Dot
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
============
*/
void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
switch( count ) {
case 0:
dot = 0.0f;
return;
case 1:
dot = src1[0] * src2[0];
return;
case 2:
dot = src1[0] * src2[0] + src1[1] * src2[1];
return;
case 3:
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
return;
default:
__asm {
mov ecx, src1
mov edx, src2
mov eax, ecx
or eax, edx
and eax, 15
jz alignedDot
// unaligned
mov eax, count
shr eax, 2
shl eax, 4
add ecx, eax
add edx, eax
neg eax
movups xmm0, [ecx+eax]
movups xmm1, [edx+eax]
mulps xmm0, xmm1
add eax, 16
jz doneDot
loopUnalignedDot:
movups xmm1, [ecx+eax]
movups xmm2, [edx+eax]
mulps xmm1, xmm2
addps xmm0, xmm1
add eax, 16
jl loopUnalignedDot
jmp doneDot
// aligned
alignedDot:
mov eax, count
shr eax, 2
shl eax, 4
add ecx, eax
add edx, eax
neg eax
movaps xmm0, [ecx+eax]
movaps xmm1, [edx+eax]
mulps xmm0, xmm1
add eax, 16
jz doneDot
loopAlignedDot:
movaps xmm1, [ecx+eax]
movaps xmm2, [edx+eax]
mulps xmm1, xmm2
addps xmm0, xmm1
add eax, 16
jl loopAlignedDot
doneDot:
}
switch( count & 3 ) {
case 1:
__asm {
movss xmm1, [ecx]
movss xmm2, [edx]
mulss xmm1, xmm2
addss xmm0, xmm1
}
break;
case 2:
__asm {
xorps xmm2, xmm2
movlps xmm1, [ecx]
movlps xmm2, [edx]
mulps xmm1, xmm2
addps xmm0, xmm1
}
break;
case 3:
__asm {
movss xmm1, [ecx]
movhps xmm1, [ecx+4]
movss xmm2, [edx]
movhps xmm2, [edx+4]
mulps xmm1, xmm2
addps xmm0, xmm1
}
break;
}
__asm {
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
mov eax, dot
movss [eax], xmm0
}
return;
}
}
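/*
	The epilogue above is a standard horizontal sum of the four partial sums
	in xmm0: movhlps/addps folds ( s0 s1 s2 s3 ) into ( s0+s2, s1+s3 ) and a
	shufps/addss adds the remaining pair. Scalar equivalent (illustrative
	helper only):
*/
static float Simd_RefHorizontalSum4( const float s[4] ) {
	return ( s[0] + s[2] ) + ( s[1] + s[3] );
}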
//
// cmpeqps == Equal
// cmpneqps != Not Equal
// cmpltps < Less Than
// cmpnltps >= Not Less Than
// cmpnleps > Not Less Or Equal
//
#define FLIP not al
#define NOFLIP
#define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
int i, cnt, pre, post; \
float *aligned; \
\
/* if the float array is not aligned on a 4 byte boundary */ \
if ( ((int) SRC0) & 3 ) { \
/* unaligned memory access */ \
pre = 0; \
cnt = COUNT >> 2; \
post = COUNT - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, SRC0 \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm mov ecx, 0x01010101 \
__asm loopNA: \
__asm movups xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, ecx \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopNA \
__asm pop ebx \
} \
else { \
/* aligned memory access */ \
aligned = (float *) ((((int) SRC0) + 15) & ~15); \
if ( (int)aligned > ((int)src0) + COUNT ) { \
pre = COUNT; \
post = 0; \
} \
else { \
pre = aligned - SRC0; \
cnt = (COUNT - pre) >> 2; \
post = COUNT - pre - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, aligned \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm add edi, pre \
__asm mov ecx, 0x01010101 \
__asm loopA: \
__asm movaps xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, ecx \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopA \
__asm pop ebx \
} \
} \
doneCmp: \
double c = constant; \
for ( i = 0; i < pre; i++ ) { \
dst[i] = src0[i] CMP c; \
} \
for ( i = count - post; i < count; i++ ) { \
dst[i] = src0[i] CMP c; \
}
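/*
	The mov/shr/shl/and sequence in both compare macros expands the 4 bit
	movmskps mask into four 0/1 bytes in one register: "mov ah, al / shr ah, 1"
	plants bit 1 in bit 8, the shifted copy plants bits 2 and 3 in bits 16 and
	24, and the final mask keeps exactly one bit per byte. A C sketch of the
	same expansion (illustrative helper; byte i of the result is lane i's
	compare result):
*/
static int Simd_RefMaskToBytes( int mask ) {
	unsigned int ax  = ( mask & 0xFF ) | ( ( ( mask >> 1 ) & 0xFF ) << 8 );	// mov ah, al / shr ah, 1
	unsigned int ebx = ( ax << 14 ) & 0xFFFF0000;							// mov bx, ax / shl ebx, 14
	ebx |= ax & 0xFFFF;														// mov bx, ax
	return (int)( ebx & 0x01010101 );										// and ebx, 0x01010101
}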
#define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
int i, cnt, pre, post; \
float *aligned; \
\
/* if the float array is not aligned on a 4 byte boundary */ \
if ( ((int) SRC0) & 3 ) { \
/* unaligned memory access */ \
pre = 0; \
cnt = COUNT >> 2; \
post = COUNT - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, SRC0 \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm mov cl, bitNum \
__asm loopNA: \
__asm movups xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, 0x01010101 \
__asm shl ebx, cl \
__asm or ebx, dword ptr [edi] \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopNA \
__asm pop ebx \
} \
else { \
/* aligned memory access */ \
aligned = (float *) ((((int) SRC0) + 15) & ~15); \
if ( (int)aligned > ((int)src0) + COUNT ) { \
pre = COUNT; \
post = 0; \
} \
else { \
pre = aligned - SRC0; \
cnt = (COUNT - pre) >> 2; \
post = COUNT - pre - (cnt<<2); \
__asm mov edx, cnt \
__asm test edx, edx \
__asm je doneCmp \
__asm push ebx \
__asm neg edx \
__asm mov esi, aligned \
__asm prefetchnta [esi+64] \
__asm movss xmm1, CONSTANT \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mov edi, DST \
__asm add edi, pre \
__asm mov cl, bitNum \
__asm loopA: \
__asm movaps xmm0, [esi] \
__asm prefetchnta [esi+128] \
__asm CMPSIMD xmm0, xmm1 \
__asm movmskps eax, xmm0 \
__asm DOFLIP \
__asm mov ah, al \
__asm shr ah, 1 \
__asm mov bx, ax \
__asm shl ebx, 14 \
__asm mov bx, ax \
__asm and ebx, 0x01010101 \
__asm shl ebx, cl \
__asm or ebx, dword ptr [edi] \
__asm mov dword ptr [edi], ebx \
__asm add esi, 16 \
__asm add edi, 4 \
__asm inc edx \
__asm jl loopA \
__asm pop ebx \
} \
} \
doneCmp: \
float c = constant; \
for ( i = 0; i < pre; i++ ) { \
dst[i] |= ( src0[i] CMP c ) << BITNUM; \
} \
for ( i = count - post; i < count; i++ ) { \
dst[i] |= ( src0[i] CMP c ) << BITNUM; \
}
/*
============
idSIMD_SSE::CmpGT
dst[i] = src0[i] > constant;
============
*/
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpGT
dst[i] |= ( src0[i] > constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpGE
dst[i] = src0[i] >= constant;
============
*/
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpGE
dst[i] |= ( src0[i] >= constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpLT
dst[i] = src0[i] < constant;
============
*/
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpLT
dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
}
/*
============
idSIMD_SSE::CmpLE
dst[i] = src0[i] <= constant;
============
*/
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
}
/*
============
idSIMD_SSE::CmpLE
dst[i] |= ( src0[i] <= constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
}
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
int i, pre, post;
min = idMath::INFINITY; max = -idMath::INFINITY;
__asm
{
push ebx
mov eax, min
mov ebx, max
movss xmm0, [eax]
movss xmm1, [ebx]
shufps xmm0, xmm0, 0
shufps xmm1, xmm1, 0
KFLOATINITS( src, count, pre, post )
and eax, 15
jz lpA
jmp lpNA
align 16
lpNA:
movups xmm2, [edx+ebx]
movups xmm3, [edx+ebx+16]
minps xmm0, xmm2
maxps xmm1, xmm2
prefetchnta [edx+ebx+64]
minps xmm0, xmm3
maxps xmm1, xmm3
add ebx, 16*2
jl lpNA
jmp done2
lpA:
movaps xmm2, [edx+ebx]
movaps xmm3, [edx+ebx+16]
minps xmm0, xmm2
maxps xmm1, xmm2
prefetchnta [edx+ebx+64]
minps xmm0, xmm3
maxps xmm1, xmm3
add ebx, 16*2
jl lpA
jmp done2
align 16
done2:
movaps xmm2, xmm0
movaps xmm3, xmm1
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
minss xmm0, xmm2
maxss xmm1, xmm3
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
minss xmm0, xmm2
maxss xmm1, xmm3
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
minss xmm0, xmm2
maxss xmm1, xmm3
mov eax, min
mov ebx, max
movss [eax], xmm0
movss [ebx], xmm1
done:
pop ebx
}
for ( i = 0; i < pre; i++ ) {
float tmp = src[i];
if ( tmp > max ) {
max = tmp;
}
if ( tmp < min ) {
min = tmp;
}
}
for ( i = count - post; i < count; i++ ) {
float tmp = src[i];
if ( tmp > max ) {
max = tmp;
}
if ( tmp < min ) {
min = tmp;
}
}
}
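/*
	The done2 block above reduces the four running lanes by rotating the
	accumulator one lane per step ( R_SHUFFLEPS( 1, 2, 3, 0 ) ) and folding
	with minss/maxss, leaving the final min and max in lane 0 after three
	steps. Scalar reference (illustrative helper only):
*/
static void Simd_RefMinMax4( float &mn, float &mx, const float lanes[4] ) {
	mn = lanes[0];
	mx = lanes[0];
	for ( int i = 1; i < 4; i++ ) {
		if ( lanes[i] < mn ) { mn = lanes[i]; }
		if ( lanes[i] > mx ) { mx = lanes[i]; }
	}
}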
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
__asm {
mov eax, count
test eax, eax
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
jz done
mov ecx, eax
and ecx, 1
mov esi, src
jz startLoop
movlps xmm2, [esi]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
dec eax
add esi, 2*4
minps xmm0, xmm2
maxps xmm1, xmm2
startLoop:
test eax, eax
jz done // guard: no pairs left (e.g. count == 1), avoid reading past the array
imul eax, 2*4
add esi, eax
neg eax
loopVert:
movlps xmm2, [esi+eax]
movhps xmm2, [esi+eax+8]
add eax, 4*4
minps xmm0, xmm2
maxps xmm1, xmm2
jl loopVert
done:
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
minps xmm0, xmm2
mov esi, min
movlps [esi], xmm0
movaps xmm3, xmm1
shufps xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
maxps xmm1, xmm3
mov edi, max
movlps [edi], xmm1
}
}
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
__asm {
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
mov esi, src
mov eax, count
and eax, ~3
jz done4
imul eax, 12
add esi, eax
neg eax
loop4:
// prefetchnta [esi+4*12]
movss xmm4, [esi+eax+0*12+8]
movhps xmm4, [esi+eax+0*12+0]
minps xmm0, xmm4
maxps xmm1, xmm4
movss xmm5, [esi+eax+1*12+0]
movhps xmm5, [esi+eax+1*12+4]
minps xmm2, xmm5
maxps xmm3, xmm5
movss xmm6, [esi+eax+2*12+8]
movhps xmm6, [esi+eax+2*12+0]
minps xmm0, xmm6
maxps xmm1, xmm6
movss xmm7, [esi+eax+3*12+0]
movhps xmm7, [esi+eax+3*12+4]
minps xmm2, xmm7
maxps xmm3, xmm7
add eax, 4*12
jl loop4
done4:
mov eax, count
and eax, 3
jz done1
imul eax, 12
add esi, eax
neg eax
loop1:
movss xmm4, [esi+eax+0*12+8]
movhps xmm4, [esi+eax+0*12+0]
minps xmm0, xmm4
maxps xmm1, xmm4
add eax, 12
jl loop1
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
}
}
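/*
	Note the alternating layout above: even verts enter xmm0/xmm1 as
	( z, -, x, y ) while odd verts enter xmm2/xmm3 as ( x, -, y, z ), which is
	why done1 rotates the second accumulator pair with R_SHUFFLEPS( 3, 1, 0, 2 )
	before the final minps/maxps, and why movhps/movss store ( x, y ) and z
	separately. What the whole routine computes, in plain C (illustrative
	helper only):
*/
static void Simd_RefMinMaxVec3( idVec3 &mn, idVec3 &mx, const idVec3 *v, const int n ) {
	mn.Set( idMath::INFINITY, idMath::INFINITY, idMath::INFINITY );
	mx = -mn;
	for ( int i = 0; i < n; i++ ) {
		if ( v[i].x < mn.x ) { mn.x = v[i].x; }
		if ( v[i].x > mx.x ) { mx.x = v[i].x; }
		if ( v[i].y < mn.y ) { mn.y = v[i].y; }
		if ( v[i].y > mx.y ) { mx.y = v[i].y; }
		if ( v[i].z < mn.z ) { mn.z = v[i].z; }
		if ( v[i].z > mx.z ) { mx.z = v[i].z; }
	}
}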
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
mov esi, src
mov eax, count
and eax, ~3
jz done4
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loop4:
// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm5
maxps xmm3, xmm5
movss xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm6
maxps xmm1, xmm6
movss xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm7
maxps xmm3, xmm7
add eax, 4*DRAWVERT_SIZE
jl loop4
done4:
mov eax, count
and eax, 3
jz done1
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loop1:
movss xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
add eax, DRAWVERT_SIZE
jl loop1
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
}
}
/*
============
idSIMD_SSE::MinMax
============
*/
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
movss xmm0, idMath::INFINITY
xorps xmm1, xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
subps xmm1, xmm0
movaps xmm2, xmm0
movaps xmm3, xmm1
mov edi, indexes
mov esi, src
mov eax, count
and eax, ~3
jz done4
shl eax, 2
add edi, eax
neg eax
loop4:
// prefetchnta [edi+128]
// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
mov edx, [edi+eax+4]
imul edx, DRAWVERT_SIZE
movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm5
maxps xmm3, xmm5
mov edx, [edi+eax+8]
imul edx, DRAWVERT_SIZE
movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm6
maxps xmm1, xmm6
mov edx, [edi+eax+12]
imul edx, DRAWVERT_SIZE
movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
minps xmm2, xmm7
maxps xmm3, xmm7
add eax, 4*4
jl loop4
done4:
mov eax, count
and eax, 3
jz done1
shl eax, 2
add edi, eax
neg eax
loop1:
mov edx, [edi+eax+0]
imul edx, DRAWVERT_SIZE
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
minps xmm0, xmm4
maxps xmm1, xmm4
add eax, 4
jl loop1
done1:
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
minps xmm0, xmm2
maxps xmm1, xmm3
mov esi, min
movhps [esi], xmm0
movss [esi+8], xmm0
mov edi, max
movhps [edi], xmm1
movss [edi+8], xmm1
}
}
/*
============
idSIMD_SSE::Clamp
============
*/
void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
int i, pre, post;
__asm
{
movss xmm0,min
movss xmm1,max
shufps xmm0,xmm0,0
shufps xmm1,xmm1,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
maxps xmm2,xmm0
maxps xmm3,xmm0
prefetchnta [edx+ebx+64]
minps xmm2,xmm1
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
maxps xmm2,xmm0
maxps xmm3,xmm0
prefetchnta [edx+ebx+64]
minps xmm2,xmm1
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpNA
done:
}
for ( i = 0; i < pre; i++ ) {
if ( src[i] < min )
dst[i] = min;
else if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
for( i = count - post; i < count; i++ ) {
if ( src[i] < min )
dst[i] = min;
else if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
}
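/*
	Clamp() is one maxps against the lower bound followed by one minps against
	the upper bound per register; because minps is applied last, the upper
	bound wins if min > max is ever passed in. Scalar equivalent of one lane
	(illustrative helper only):
*/
static float Simd_RefClamp( float x, float lo, float hi ) {
	float r = ( x > lo ) ? x : lo;	// maxps
	return ( r < hi ) ? r : hi;		// minps
}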
/*
============
idSIMD_SSE::ClampMin
============
*/
void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
int i, pre, post;
__asm
{
movss xmm0,min
shufps xmm0,xmm0,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
maxps xmm2,xmm0
prefetchnta [edx+ebx+64]
maxps xmm3,xmm0
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
maxps xmm2,xmm0
prefetchnta [edx+ebx+64]
maxps xmm3,xmm0
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpNA
done:
}
for( i = 0; i < pre; i++ ) {
if ( src[i] < min )
dst[i] = min;
else
dst[i] = src[i];
}
for( i = count - post; i < count; i++ ) {
if ( src[i] < min )
dst[i] = min;
else
dst[i] = src[i];
}
}
/*
============
idSIMD_SSE::ClampMax
============
*/
void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
int i, pre, post;
__asm
{
movss xmm1,max
shufps xmm1,xmm1,0
KFLOATINITDS( dst, src, count, pre, post )
and eax,15
jne lpNA
jmp lpA
align 16
lpA:
movaps xmm2,[edx+ebx]
movaps xmm3,[edx+ebx+16]
minps xmm2,xmm1
prefetchnta [edx+ebx+64]
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpA
jmp done
align 16
lpNA:
movups xmm2,[edx+ebx]
movups xmm3,[edx+ebx+16]
minps xmm2,xmm1
prefetchnta [edx+ebx+64]
minps xmm3,xmm1
movaps [edi+ebx],xmm2
movaps [edi+ebx+16],xmm3
add ebx,16*2
jl lpNA
done:
}
for( i = 0; i < pre; i++ ) {
if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
for( i = count - post; i < count; i++ ) {
if ( src[i] > max )
dst[i] = max;
else
dst[i] = src[i];
}
}
/*
============
idSIMD_SSE::Zero16
============
*/
void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
__asm {
mov edx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneZero16
shl eax, 4
add edx, eax
neg eax
xorps xmm0, xmm0
loopZero16:
movaps [edx+eax], xmm0
add eax, 16
jl loopZero16
doneZero16:
}
}
/*
============
idSIMD_SSE::Negate16
============
*/
void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
__asm {
mov edx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneNegate16
shl eax, 4
add edx, eax
neg eax
movss xmm0, SIMD_SP_signBitMask
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
loopNegate16:
movaps xmm1, [edx+eax]
xorps xmm1, xmm0
movaps [edx+eax], xmm1
add eax, 16
jl loopNegate16
doneNegate16:
}
}
/*
============
idSIMD_SSE::Copy16
============
*/
void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
__asm {
mov ecx, src
mov edx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneCopy16
shl eax, 4
add ecx, eax
add edx, eax
neg eax
loopCopy16:
movaps xmm0, [ecx+eax]
movaps [edx+eax], xmm0
add eax, 16
jl loopCopy16
doneCopy16:
}
}
/*
============
idSIMD_SSE::Add16
============
*/
void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
__asm {
mov ecx, src1
mov edx, src2
mov esi, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneAdd16
shl eax, 4
add esi, eax
add ecx, eax
add edx, eax
neg eax
loopAdd16:
movaps xmm0, [ecx+eax]
addps xmm0, [edx+eax]
movaps [esi+eax], xmm0
add eax, 16
jl loopAdd16
doneAdd16:
}
}
/*
============
idSIMD_SSE::Sub16
============
*/
void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
__asm {
mov ecx, src1
mov edx, src2
mov esi, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneSub16
shl eax, 4
add esi, eax
add ecx, eax
add edx, eax
neg eax
loopSub16:
movaps xmm0, [ecx+eax]
subps xmm0, [edx+eax]
movaps [esi+eax], xmm0
add eax, 16
jl loopSub16
doneSub16:
}
}
/*
============
idSIMD_SSE::Mul16
============
*/
void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
__asm {
mov ecx, dst
mov edx, src1
mov eax, count
add eax, 3
shr eax, 2
jz doneMulScalar16
movss xmm1, constant
shl eax, 4
add ecx, eax
add edx, eax
neg eax
shufps xmm1, xmm1, 0x00
loopMulScalar16:
movaps xmm0, [edx+eax]
mulps xmm0, xmm1
movaps [ecx+eax], xmm0
add eax, 16
jl loopMulScalar16
doneMulScalar16:
}
}
/*
============
idSIMD_SSE::AddAssign16
============
*/
void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
__asm {
mov ecx, dst
mov edx, src
mov eax, count
add eax, 3
shr eax, 2
jz doneAddAssign16
shl eax, 4
add ecx, eax
add edx, eax
neg eax
loopAddAssign16:
movaps xmm0, [ecx+eax]
addps xmm0, [edx+eax]
movaps [ecx+eax], xmm0
add eax, 16
jl loopAddAssign16
doneAddAssign16:
}
}
/*
============
idSIMD_SSE::SubAssign16
============
*/
void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
__asm {
mov ecx, dst
mov edx, src
mov eax, count
add eax, 3
shr eax, 2
jz doneSubAssign16
shl eax, 4
add ecx, eax
add edx, eax
neg eax
loopSubAssign16:
movaps xmm0, [ecx+eax]
subps xmm0, [edx+eax]
movaps [ecx+eax], xmm0
add eax, 16
jl loopSubAssign16
doneSubAssign16:
}
}
/*
============
idSIMD_SSE::MulAssign16
============
*/
void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
__asm {
mov ecx, dst
mov eax, count
add eax, 3
shr eax, 2
jz doneMulAssign16
movss xmm1, constant
shl eax, 4
add ecx, eax
neg eax
shufps xmm1, xmm1, 0x00
loopMulAssign16:
movaps xmm0, [ecx+eax]
mulps xmm0, xmm1
movaps [ecx+eax], xmm0
add eax, 16
jl loopMulAssign16
doneMulAssign16:
}
}
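/*
	All of the *16 routines above assume their arrays are 16 byte aligned and
	padded to a multiple of four floats: "add eax, 3 / shr eax, 2" rounds the
	count up to whole quads, so the last quad may legitimately read and write
	padding past count. The iteration count, written out (illustrative helper
	only):
*/
static int Simd_RefNumQuads16( const int count ) {
	// quads processed by Zero16/Negate16/Copy16/Add16/Sub16/Mul16/...Assign16
	return ( count + 3 ) >> 2;
}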
/*
============
idSIMD_SSE::MatX_MultiplyVecX
optimizes the following matrix multiplications:
NxN * Nx1
Nx6 * 6x1
6xN * Nx1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss [eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps [eax+offset], reg1
#define STORE4( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1 \
__asm movhps [eax+offset+8], reg1
#define STOREC =
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 + xmm1
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 + xmm3
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 + xmm1
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 5
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
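/*
	MatX_MultiplyVecX only special-cases the shapes listed in its header
	comment ( NxN, Nx6 and 6xN with N in [1-6] ); every other shape falls
	through to the generic row loop at the bottom. A typical call, as a sketch
	(assumes the globally selected SIMDProcessor; this example function is not
	part of the engine):
*/
static void Simd_ExampleMatXMultiplyVecX( void ) {
	idMatX m( 6, 6 );
	idVecX v( 6 ), r( 6 );
	m.Identity();
	v.Zero();
	SIMDProcessor->MatX_MultiplyVecX( r, m, v );	// hits the 6x6 * 6x1 fast path
}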
/*
============
idSIMD_SSE::MatX_MultiplyAddVecX
optimizes the following matrix multiplications:
NxN * Nx1
Nx6 * 6x1
6xN * Nx1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm addss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC +=
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 += xmm3
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 += xmm5
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 += xmm3
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 4
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
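/*
	usage sketch (hypothetical matrices/vectors a, b, x, y): because the Add
	variant accumulates into dst, a caller can build dst = a * x + b * y by
	chaining the routines:

		SIMDProcessor->MatX_MultiplyVecX( dst, a, x );		// dst  = a * x
		SIMDProcessor->MatX_MultiplyAddVecX( dst, b, y );	// dst += b * y
*/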
/*
============
idSIMD_SSE::MatX_MultiplySubVecX
optimizes the following matrix multiplications:
NxN * Nx1
Nx6 * 6x1
6xN * Nx1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm subss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC -=
int numRows;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumColumns() );
assert( dst.GetSize() >= mat.GetNumRows() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numRows = mat.GetNumRows();
switch( mat.GetNumColumns() ) {
case 1: {
switch( numRows ) {
case 1: { // 1x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
STORE1( 0, xmm0, xmm1 )
}
return;
}
case 6: { // 6x1 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0];
mPtr++;
}
return;
}
}
break;
}
case 2: {
switch( numRows ) {
case 2: { // 2x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm1, [esi+4]
movss xmm2, [edi]
mulss xmm2, xmm0
movss xmm3, [edi+4]
mulss xmm3, xmm1
addss xmm2, xmm3
STORE1( 0, xmm2, xmm4 )
mulss xmm0, [edi+8]
mulss xmm1, [edi+8+4]
addss xmm0, xmm1
STORE1( 4, xmm0, xmm4 )
}
return;
}
case 6: { // 6x2 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, [esi]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movaps xmm0, [edi]
mulps xmm0, xmm7
movaps xmm1, [edi+16]
mulps xmm1, xmm7
movaps xmm2, xmm0
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
movaps xmm3, [edi+32]
addps xmm0, xmm2
mulps xmm3, xmm7
STORE4( 0, xmm0, xmm4 )
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm1, xmm3
addps xmm3, xmm1
STORE2LO( 16, xmm3, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
mPtr += 2;
}
return;
}
}
break;
}
case 3: {
switch( numRows ) {
case 3: { // 3x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
movss xmm4, [edi]
mulss xmm4, xmm0
movss xmm1, [esi+4]
movss xmm5, [edi+4]
mulss xmm5, xmm1
addss xmm4, xmm5
movss xmm2, [esi+8]
movss xmm6, [edi+8]
mulss xmm6, xmm2
addss xmm4, xmm6
movss xmm3, [edi+12]
mulss xmm3, xmm0
STORE1( 0, xmm4, xmm7 );
movss xmm5, [edi+12+4]
mulss xmm5, xmm1
addss xmm3, xmm5
movss xmm6, [edi+12+8]
mulss xmm6, xmm2
addss xmm3, xmm6
mulss xmm0, [edi+24]
mulss xmm1, [edi+24+4]
STORE1( 4, xmm3, xmm7 );
addss xmm0, xmm1
mulss xmm2, [edi+24+8]
addss xmm0, xmm2
STORE1( 8, xmm0, xmm7 );
}
return;
}
case 6: { // 6x3 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm5, [esi]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
movlps xmm1, [edi+4*4]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
movlps xmm2, [edi+6*4]
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
mulps xmm0, xmm5
movlps xmm3, [edi+10*4]
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
movaps xmm3, xmm1
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
mulps xmm1, xmm6
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm3
STORE4( 0, xmm0, xmm4 )
movss xmm1, [edi+12*4]
mulss xmm1, xmm5
movss xmm2, [edi+13*4]
mulss xmm2, xmm6
movss xmm3, [edi+14*4]
mulss xmm3, xmm7
addss xmm1, xmm2
addss xmm1, xmm3
STORE1( 16, xmm1, xmm4 )
mulss xmm5, [edi+15*4]
mulss xmm6, [edi+16*4]
mulss xmm7, [edi+17*4]
addss xmm5, xmm6
addss xmm5, xmm7
STORE1( 20, xmm5, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
mPtr += 3;
}
return;
}
}
break;
}
case 4: {
switch( numRows ) {
case 4: { // 4x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi ]
movlps xmm0, qword ptr [edi ]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
STORE4( 0, xmm0, xmm2 )
}
return;
}
case 6: { // 6x4 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, qword ptr [esi+ 0]
movlps xmm0, qword ptr [edi+ 0]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm0, qword ptr [edi+16]
mulps xmm0, xmm6
movlps xmm7, qword ptr [esi+ 8]
movlps xmm2, qword ptr [edi+ 8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movhps xmm2, qword ptr [edi+24]
mulps xmm2, xmm7
movlps xmm1, qword ptr [edi+32]
movhps xmm1, qword ptr [edi+48]
mulps xmm1, xmm6
movlps xmm3, qword ptr [edi+40]
addps xmm0, xmm2
movhps xmm3, qword ptr [edi+56]
mulps xmm3, xmm7
movaps xmm4, xmm0
addps xmm1, xmm3
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm4
movlps xmm1, qword ptr [edi+64]
movhps xmm1, qword ptr [edi+80]
STORE4( 0, xmm0, xmm4 )
mulps xmm1, xmm6
movlps xmm2, qword ptr [edi+72]
movhps xmm2, qword ptr [edi+88]
mulps xmm2, xmm7
addps xmm1, xmm2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm3, xmm1
addps xmm1, xmm3
STORE2LO( 16, xmm1, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
mPtr += 4;
}
return;
}
}
break;
}
case 5: {
switch( numRows ) {
case 5: { // 5x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
movss xmm5, [edi+15*4] // xmm5 = 15, X, X, X
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
movhps xmm5, [edi+14*4] // xmm5 = 18, 19, 14, 15
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
movss xmm7, [esi+0*4]
shufps xmm7, xmm7, 0
mulps xmm0, xmm7
movss xmm5, [esi+1*4]
shufps xmm5, xmm5, 0
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, 0
mulps xmm2, xmm6
addps xmm0, xmm2
movss xmm1, [esi+3*4]
shufps xmm1, xmm1, 0
mulps xmm3, xmm1
addps xmm0, xmm3
movss xmm2, [esi+4*4]
shufps xmm2, xmm2, 0
mulps xmm4, xmm2
addps xmm0, xmm4
mulss xmm7, [edi+20*4]
mulss xmm5, [edi+21*4]
addps xmm7, xmm5
mulss xmm6, [edi+22*4]
addps xmm7, xmm6
mulss xmm1, [edi+23*4]
addps xmm7, xmm1
mulss xmm2, [edi+24*4]
addps xmm7, xmm2
STORE4( 0, xmm0, xmm3 )
STORE1( 16, xmm7, xmm4 )
}
return;
}
case 6: { // 6x5 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [esi]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm7, [esi+8]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
movlps xmm0, [edi]
movhps xmm3, [edi+8]
movaps xmm1, [edi+16]
movlps xmm2, [edi+32]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
mulps xmm0, xmm6
mulps xmm3, xmm7
movlps xmm2, [edi+40]
addps xmm0, xmm3 // xmm0 += xmm3
movhps xmm5, [edi+40+8]
movlps xmm3, [edi+40+16]
movhps xmm3, [edi+40+24]
movlps xmm4, [edi+40+32]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
mulps xmm2, xmm6
mulps xmm5, xmm7
addps xmm2, xmm5 // xmm2 += xmm5
movss xmm5, [esi+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
addps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
STORE4( 0, xmm0, xmm2 )
movlps xmm4, [edi+80]
movhps xmm3, [edi+80+8]
movaps xmm1, [edi+80+16]
movlps xmm2, [edi+80+32]
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
mulps xmm4, xmm6
mulps xmm3, xmm7
mulps xmm1, xmm5
addps xmm4, xmm3 // xmm4 += xmm3
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
addps xmm4, xmm1
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm4, xmm1
STORE2LO( 16, xmm4, xmm2 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
mPtr += 5;
}
return;
}
}
break;
}
case 6: {
switch( numRows ) {
case 1: { // 1x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
mulss xmm0, [edi]
movss xmm1, [esi+4]
mulss xmm1, [edi+4]
movss xmm2, [esi+8]
addss xmm0, xmm1
mulss xmm2, [edi+8]
movss xmm3, [esi+12]
addss xmm0, xmm2
mulss xmm3, [edi+12]
movss xmm4, [esi+16]
addss xmm0, xmm3
mulss xmm4, [edi+16]
movss xmm5, [esi+20]
addss xmm0, xmm4
mulss xmm5, [edi+20]
movss xmm6, [esi+24]
addss xmm0, xmm5
mulss xmm6, [edi+24]
addss xmm0, xmm6
STORE1( 0, xmm0, xmm7 )
}
return;
}
case 2: { // 2x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
}
return;
}
case 3: { // 3x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
movhlps xmm0, xmm1
addps xmm0, xmm1
STORE2LO( 0, xmm0, xmm3 )
// row 2
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
STORE1( 8, xmm0, xmm3 )
}
return;
}
case 4: { // 4x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm4 )
}
return;
}
case 5: { // 5x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
// load idVecX
movlps xmm4, [esi]
movhps xmm4, [esi+8]
movlps xmm5, [esi+16]
movlhps xmm5, xmm4
movhlps xmm6, xmm4
movlhps xmm6, xmm5
// row 0 and 1
movaps xmm0, [edi]
movaps xmm1, [edi+16]
movaps xmm2, [edi+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm7, xmm0
movlhps xmm7, xmm2
addps xmm7, xmm1
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm7, xmm0
// row 2 and 3
movaps xmm0, [edi+48]
movaps xmm1, [edi+48+16]
movaps xmm2, [edi+48+32]
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
movhlps xmm3, xmm0
movlhps xmm3, xmm2
addps xmm1, xmm3
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
addps xmm1, xmm0
// last 4 additions for the first 4 rows and store result
movaps xmm0, xmm7
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
addps xmm0, xmm7
STORE4( 0, xmm0, xmm3 )
// row 4
movaps xmm0, [edi+96]
movaps xmm1, [edi+96+16]
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, 0x01
addss xmm0, xmm1
STORE1( 16, xmm0, xmm3 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
STORE4( 0, xmm0, xmm3 )
STORE2LO( 16, xmm2, xmm4 )
}
return;
}
default: {
for ( int i = 0; i < numRows; i++ ) {
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
mPtr += 6;
}
return;
}
}
break;
}
default: {
int numColumns = mat.GetNumColumns();
for ( int i = 0; i < numRows; i++ ) {
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numColumns; j++ ) {
sum += mPtr[j] * vPtr[j];
}
dstPtr[i] STOREC sum;
mPtr += numColumns;
}
break;
}
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
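/*
	usage sketch (hypothetical names A, x, b, r): the Sub variant is the
	natural fit for residual updates of the form r = b - A * x:

		r = b;											// start from the right-hand side
		SIMDProcessor->MatX_MultiplySubVecX( r, A, x );	// r -= A * x
*/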
/*
============
idSIMD_SSE::MatX_TransposeMultiplyVecX
optimizes the following matrix multiplications:
Nx6 * Nx1
6xN * 6x1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss [eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps [eax+offset], reg1
#define STORE4( offset, reg1, reg2 ) \
__asm movlps [eax+offset], reg1 \
__asm movhps [eax+offset+8], reg1
#define STOREC =
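	// note: unlike the Add/Sub variants, these STORE* macros write reg1 straight
	// to the destination without reading it back first, and STOREC expands to a
	// plain '=' so the scalar fallback paths below overwrite dst as well.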
int numColumns;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumRows() );
assert( dst.GetSize() >= mat.GetNumColumns() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numColumns = mat.GetNumColumns();
switch( mat.GetNumRows() ) {
case 1:
switch( numColumns ) {
case 6: { // 1x6 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm3 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0];
mPtr++;
}
return;
}
}
break;
case 2:
switch( numColumns ) {
case 6: { // 2x6 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, [edi]
mulps xmm2, xmm0
movlps xmm3, [edi+24]
movhps xmm3, [edi+32]
mulps xmm3, xmm1
addps xmm2, xmm3
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+16]
movhps xmm4, [edi+40]
mulps xmm4, xmm0
movhlps xmm3, xmm4
addps xmm3, xmm4
STORE4( 0, xmm2, xmm5 )
STORE2LO( 16, xmm3, xmm6 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
mPtr++;
}
return;
}
}
break;
case 3:
switch( numColumns ) {
case 6: { // 3x6 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movss xmm1, [esi+2*4]
movlps xmm3, [edi+(0*6+0)*4]
movhps xmm3, [edi+(0*6+2)*4]
movaps xmm4, xmm0
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm1
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(2*6+4)*4]
mulps xmm5, xmm1
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
mPtr++;
}
return;
}
}
break;
case 4:
switch( numColumns ) {
case 6: { // 4x6 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3];
mPtr++;
}
return;
}
}
break;
case 5:
switch( numColumns ) {
case 6: { // 5x6 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movss xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm2
mulps xmm4, [edi+(4*6+0)*4]
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
mPtr++;
}
return;
}
}
break;
case 6:
switch( numColumns ) {
case 1: { // 6x1 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movhps xmm0, [esi+8]
movlps xmm1, [esi+16]
mulps xmm0, [edi]
mulps xmm1, [edi+16]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
addps xmm0, xmm1
movhlps xmm2, xmm0
addss xmm2, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm2, xmm0
STORE1( 0, xmm2, xmm3 )
}
return;
}
case 2: { // 6x2 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, [edi+0*4]
mulps xmm6, xmm0
movlps xmm1, [esi+2*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+4*4]
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm2, [esi+4*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+8*4]
mulps xmm7, xmm2
addps xmm6, xmm7
movhlps xmm3, xmm6
addps xmm3, xmm6
STORE2LO( 0, xmm3, xmm7 )
}
return;
}
case 3: { // 6x3 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+(0*3+2)*4]
movhps xmm0, [edi+(0*3+0)*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm6, [esi+0*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm1, [edi+(1*3+0)*4]
movhps xmm1, [edi+(1*3+1)*4]
movss xmm7, [esi+1*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm2, [edi+(2*3+2)*4]
movhps xmm2, [edi+(2*3+0)*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+2*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movss xmm3, [edi+(3*3+0)*4]
movhps xmm3, [edi+(3*3+1)*4]
movss xmm7, [esi+3*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movss xmm4, [edi+(4*3+2)*4]
movhps xmm4, [edi+(4*3+0)*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+4*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movss xmm5, [edi+(5*3+0)*4]
movhps xmm5, [edi+(5*3+1)*4]
movss xmm7, [esi+5*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE1( 0, xmm6, xmm7 )
STORE2HI( 4, xmm6, xmm7 )
}
return;
}
case 4: { // 6x4 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm3, [edi+(0*4+0)*4]
movhps xmm3, [edi+(0*4+2)*4]
movss xmm4, [esi+0*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*4+0)*4]
movhps xmm5, [edi+(1*4+2)*4]
movss xmm6, [esi+1*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*4+0)*4]
movhps xmm4, [edi+(2*4+2)*4]
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*4+0)*4]
movhps xmm5, [edi+(3*4+2)*4]
movss xmm6, [esi+3*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(4*4+0)*4]
movhps xmm4, [edi+(4*4+2)*4]
movss xmm6, [esi+4*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(5*4+0)*4]
movhps xmm5, [edi+(5*4+2)*4]
movss xmm6, [esi+5*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
}
return;
}
case 5: { // 6x5 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [edi+(0*5+0)*4]
movhps xmm6, [edi+(0*5+2)*4]
movss xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movlps xmm7, [edi+(1*5+0)*4]
movhps xmm7, [edi+(1*5+2)*4]
movss xmm1, [esi+1*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm7, [edi+(2*5+0)*4]
movhps xmm7, [edi+(2*5+2)*4]
movss xmm2, [esi+2*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movlps xmm7, [edi+(3*5+0)*4]
movhps xmm7, [edi+(3*5+2)*4]
movss xmm3, [esi+3*4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movlps xmm7, [edi+(4*5+0)*4]
movhps xmm7, [edi+(4*5+2)*4]
movss xmm4, [esi+4*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movlps xmm7, [edi+(5*5+0)*4]
movhps xmm7, [edi+(5*5+2)*4]
movss xmm5, [esi+5*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE4( 0, xmm6, xmm7 )
movss xmm6, [edi+(0*5+4)*4]
mulss xmm6, xmm0
movss xmm7, [edi+(1*5+4)*4]
mulss xmm7, xmm1
addss xmm6, xmm7
movss xmm7, [edi+(2*5+4)*4]
mulss xmm7, xmm2
addss xmm6, xmm7
movss xmm7, [edi+(3*5+4)*4]
mulss xmm7, xmm3
addss xmm6, xmm7
movss xmm7, [edi+(4*5+4)*4]
mulss xmm7, xmm4
addss xmm6, xmm7
movss xmm7, [edi+(5*5+4)*4]
mulss xmm7, xmm5
addss xmm6, xmm7
STORE1( 16, xmm6, xmm7 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movlps xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(4*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(5*6+0)*4]
movhps xmm5, [edi+(5*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
movhps xmm5, [edi+(5*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
mPtr++;
}
return;
}
}
break;
default:
int numRows = mat.GetNumRows();
for ( int i = 0; i < numColumns; i++ ) {
mPtr = mat.ToFloatPtr() + i;
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numRows; j++ ) {
mPtr += numColumns;
sum += mPtr[0] * vPtr[j];
}
dstPtr[i] STOREC sum;
}
break;
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
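/*
============
MatX_TransposeMultiplyVecX_Sketch

	illustrative scalar reference for the transpose routines: dst = mat^T * vec,
	walking each column of the row-major matrix with a stride of numColumns.
	this sketch mirrors the generic fallback path above and is never called.
============
*/
static void MatX_TransposeMultiplyVecX_Sketch( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	const float *vPtr = vec.ToFloatPtr();
	float *dstPtr = dst.ToFloatPtr();
	int numColumns = mat.GetNumColumns();
	int numRows = mat.GetNumRows();
	for ( int i = 0; i < numColumns; i++ ) {
		const float *mPtr = mat.ToFloatPtr() + i;	// top of column i
		float sum = mPtr[0] * vPtr[0];
		for ( int j = 1; j < numRows; j++ ) {
			mPtr += numColumns;						// step down column i
			sum += mPtr[0] * vPtr[j];
		}
		dstPtr[i] = sum;
	}
}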
/*
============
idSIMD_SSE::MatX_TransposeMultiplyAddVecX
optimizes the following matrix multiplications:
Nx6 * Nx1
6xN * 6x1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm addss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm addps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm addps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC +=
int numColumns;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumRows() );
assert( dst.GetSize() >= mat.GetNumColumns() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numColumns = mat.GetNumColumns();
switch( mat.GetNumRows() ) {
case 1:
switch( numColumns ) {
case 6: { // 1x6 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm3 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0];
mPtr++;
}
return;
}
}
break;
case 2:
switch( numColumns ) {
case 6: { // 2x6 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, [edi]
mulps xmm2, xmm0
movlps xmm3, [edi+24]
movhps xmm3, [edi+32]
mulps xmm3, xmm1
addps xmm2, xmm3
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+16]
movhps xmm4, [edi+40]
mulps xmm4, xmm0
movhlps xmm3, xmm4
addps xmm3, xmm4
STORE4( 0, xmm2, xmm5 )
STORE2LO( 16, xmm3, xmm6 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
mPtr++;
}
return;
}
}
break;
case 3:
switch( numColumns ) {
case 6: { // 3x6 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movss xmm1, [esi+2*4]
movlps xmm3, [edi+(0*6+0)*4]
movhps xmm3, [edi+(0*6+2)*4]
movaps xmm4, xmm0
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm1
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(2*6+4)*4]
mulps xmm5, xmm1
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
mPtr++;
}
return;
}
}
break;
case 4:
switch( numColumns ) {
case 6: { // 4x6 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3];
mPtr++;
}
return;
}
}
break;
case 5:
switch( numColumns ) {
case 6: { // 5x6 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movss xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm2
mulps xmm4, [edi+(4*6+0)*4]
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
mPtr++;
}
return;
}
}
break;
case 6:
switch( numColumns ) {
case 1: { // 6x1 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movhps xmm0, [esi+8]
movlps xmm1, [esi+16]
mulps xmm0, [edi]
mulps xmm1, [edi+16]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
addps xmm0, xmm1
movhlps xmm2, xmm0
addss xmm2, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm2, xmm0
STORE1( 0, xmm2, xmm3 )
}
return;
}
case 2: { // 6x2 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, [edi+0*4]
mulps xmm6, xmm0
movlps xmm1, [esi+2*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+4*4]
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm2, [esi+4*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+8*4]
mulps xmm7, xmm2
addps xmm6, xmm7
movhlps xmm3, xmm6
addps xmm3, xmm6
STORE2LO( 0, xmm3, xmm7 )
}
return;
}
case 3: { // 6x3 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+(0*3+2)*4]
movhps xmm0, [edi+(0*3+0)*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm6, [esi+0*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm1, [edi+(1*3+0)*4]
movhps xmm1, [edi+(1*3+1)*4]
movss xmm7, [esi+1*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm2, [edi+(2*3+2)*4]
movhps xmm2, [edi+(2*3+0)*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+2*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movss xmm3, [edi+(3*3+0)*4]
movhps xmm3, [edi+(3*3+1)*4]
movss xmm7, [esi+3*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movss xmm4, [edi+(4*3+2)*4]
movhps xmm4, [edi+(4*3+0)*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+4*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movss xmm5, [edi+(5*3+0)*4]
movhps xmm5, [edi+(5*3+1)*4]
movss xmm7, [esi+5*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE1( 0, xmm6, xmm7 )
STORE2HI( 4, xmm6, xmm7 )
}
return;
}
case 4: { // 6x4 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm3, [edi+(0*4+0)*4]
movhps xmm3, [edi+(0*4+2)*4]
movss xmm4, [esi+0*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*4+0)*4]
movhps xmm5, [edi+(1*4+2)*4]
movss xmm6, [esi+1*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*4+0)*4]
movhps xmm4, [edi+(2*4+2)*4]
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*4+0)*4]
movhps xmm5, [edi+(3*4+2)*4]
movss xmm6, [esi+3*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(4*4+0)*4]
movhps xmm4, [edi+(4*4+2)*4]
movss xmm6, [esi+4*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(5*4+0)*4]
movhps xmm5, [edi+(5*4+2)*4]
movss xmm6, [esi+5*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
}
return;
}
case 5: { // 6x5 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [edi+(0*5+0)*4]
movhps xmm6, [edi+(0*5+2)*4]
movss xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movlps xmm7, [edi+(1*5+0)*4]
movhps xmm7, [edi+(1*5+2)*4]
movss xmm1, [esi+1*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm7, [edi+(2*5+0)*4]
movhps xmm7, [edi+(2*5+2)*4]
movss xmm2, [esi+2*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movlps xmm7, [edi+(3*5+0)*4]
movhps xmm7, [edi+(3*5+2)*4]
movss xmm3, [esi+3*4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movlps xmm7, [edi+(4*5+0)*4]
movhps xmm7, [edi+(4*5+2)*4]
movss xmm4, [esi+4*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movlps xmm7, [edi+(5*5+0)*4]
movhps xmm7, [edi+(5*5+2)*4]
movss xmm5, [esi+5*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE4( 0, xmm6, xmm7 )
movss xmm6, [edi+(0*5+4)*4]
mulss xmm6, xmm0
movss xmm7, [edi+(1*5+4)*4]
mulss xmm7, xmm1
addss xmm6, xmm7
movss xmm7, [edi+(2*5+4)*4]
mulss xmm7, xmm2
addss xmm6, xmm7
movss xmm7, [edi+(3*5+4)*4]
mulss xmm7, xmm3
addss xmm6, xmm7
movss xmm7, [edi+(4*5+4)*4]
mulss xmm7, xmm4
addss xmm6, xmm7
movss xmm7, [edi+(5*5+4)*4]
mulss xmm7, xmm5
addss xmm6, xmm7
STORE1( 16, xmm6, xmm7 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movlps xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(4*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(5*6+0)*4]
movhps xmm5, [edi+(5*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
movhps xmm5, [edi+(5*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
mPtr++;
}
return;
}
}
break;
default:
int numRows = mat.GetNumRows();
for ( int i = 0; i < numColumns; i++ ) {
mPtr = mat.ToFloatPtr() + i;
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numRows; j++ ) {
mPtr += numColumns;
sum += mPtr[0] * vPtr[j];
}
dstPtr[i] STOREC sum;
}
break;
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
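/*
	for readers more comfortable with intrinsics: the core pattern of the
	transpose cases above - broadcast one vector element, multiply four matrix
	entries, accumulate - corresponds to the sketch below. it is an assumed
	equivalent for illustration only, kept disabled so it does not alter the build.
*/
#if 0
#include <xmmintrin.h>
static void TransposeMulAdd4( float *dst, const float *row, float s ) {
	__m128 acc = _mm_loadu_ps( dst );				// current destination values
	__m128 b   = _mm_set1_ps( s );					// broadcast, like shufps with R_SHUFFLEPS( 0, 0, 0, 0 )
	__m128 r   = _mm_loadu_ps( row );				// four matrix entries
	acc = _mm_add_ps( acc, _mm_mul_ps( r, b ) );	// acc += row * s
	_mm_storeu_ps( dst, acc );
}
#endif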
/*
============
idSIMD_SSE::MatX_TransposeMultiplySubVecX
optimizes the following matrix multiplications:
Nx6 * Nx1
6xN * 6x1
with N in the range [1-6]
============
*/
void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
#define STORE1( offset, reg1, reg2 ) \
__asm movss reg2, [eax+offset] \
__asm subss reg2, reg1 \
__asm movss [eax+offset], reg2
#define STORE2LO( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2
#define STORE2HI( offset, reg1, reg2 ) \
__asm movhps reg2, [eax+offset] \
__asm subps reg2, reg1 \
__asm movhps [eax+offset], reg2
#define STORE4( offset, reg1, reg2 ) \
__asm movlps reg2, [eax+offset] \
__asm movhps reg2, [eax+offset+8] \
__asm subps reg2, reg1 \
__asm movlps [eax+offset], reg2 \
__asm movhps [eax+offset+8], reg2
#define STOREC -=
int numColumns;
const float *mPtr, *vPtr;
float *dstPtr;
assert( vec.GetSize() >= mat.GetNumRows() );
assert( dst.GetSize() >= mat.GetNumColumns() );
mPtr = mat.ToFloatPtr();
vPtr = vec.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
numColumns = mat.GetNumColumns();
switch( mat.GetNumRows() ) {
case 1:
switch( numColumns ) {
case 6: { // 1x6 * 1x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
STORE4( 0, xmm0, xmm2 )
STORE2LO( 16, xmm1, xmm3 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0];
mPtr++;
}
return;
}
}
break;
case 2:
switch( numColumns ) {
case 6: { // 2x6 * 2x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, [edi]
mulps xmm2, xmm0
movlps xmm3, [edi+24]
movhps xmm3, [edi+32]
mulps xmm3, xmm1
addps xmm2, xmm3
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+16]
movhps xmm4, [edi+40]
mulps xmm4, xmm0
movhlps xmm3, xmm4
addps xmm3, xmm4
STORE4( 0, xmm2, xmm5 )
STORE2LO( 16, xmm3, xmm6 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
mPtr++;
}
return;
}
}
break;
case 3:
switch( numColumns ) {
case 6: { // 3x6 * 3x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movss xmm1, [esi+2*4]
movlps xmm3, [edi+(0*6+0)*4]
movhps xmm3, [edi+(0*6+2)*4]
movaps xmm4, xmm0
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm1
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(2*6+4)*4]
mulps xmm5, xmm1
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
mPtr++;
}
return;
}
}
break;
case 4:
switch( numColumns ) {
case 6: { // 4x6 * 4x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*6+0)*4]
movhps xmm4, [edi+(2*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3];
mPtr++;
}
return;
}
}
break;
case 5:
switch( numColumns ) {
case 6: { // 5x6 * 5x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movss xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm4, xmm2
mulps xmm4, [edi+(4*6+0)*4]
addps xmm3, xmm4
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movhlps xmm4, xmm3
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
mPtr++;
}
return;
}
}
break;
case 6:
switch( numColumns ) {
case 1: { // 6x1 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi]
movhps xmm0, [esi+8]
movlps xmm1, [esi+16]
mulps xmm0, [edi]
mulps xmm1, [edi+16]
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
addps xmm0, xmm1
movhlps xmm2, xmm0
addss xmm2, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm2, xmm0
STORE1( 0, xmm2, xmm3 )
}
return;
}
case 2: { // 6x2 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, [edi+0*4]
mulps xmm6, xmm0
movlps xmm1, [esi+2*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+4*4]
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm2, [esi+4*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm7, [edi+8*4]
mulps xmm7, xmm2
addps xmm6, xmm7
movhlps xmm3, xmm6
addps xmm3, xmm6
STORE2LO( 0, xmm3, xmm7 )
}
return;
}
case 3: { // 6x3 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movss xmm0, [edi+(0*3+2)*4]
movhps xmm0, [edi+(0*3+0)*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm6, [esi+0*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm1, [edi+(1*3+0)*4]
movhps xmm1, [edi+(1*3+1)*4]
movss xmm7, [esi+1*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm2, [edi+(2*3+2)*4]
movhps xmm2, [edi+(2*3+0)*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+2*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movss xmm3, [edi+(3*3+0)*4]
movhps xmm3, [edi+(3*3+1)*4]
movss xmm7, [esi+3*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movss xmm4, [edi+(4*3+2)*4]
movhps xmm4, [edi+(4*3+0)*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
movss xmm7, [esi+4*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movss xmm5, [edi+(5*3+0)*4]
movhps xmm5, [edi+(5*3+1)*4]
movss xmm7, [esi+5*4]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE1( 0, xmm6, xmm7 )
STORE2HI( 4, xmm6, xmm7 )
}
return;
}
case 4: { // 6x4 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm3, [edi+(0*4+0)*4]
movhps xmm3, [edi+(0*4+2)*4]
movss xmm4, [esi+0*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm4
movlps xmm5, [edi+(1*4+0)*4]
movhps xmm5, [edi+(1*4+2)*4]
movss xmm6, [esi+1*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(2*4+0)*4]
movhps xmm4, [edi+(2*4+2)*4]
movss xmm6, [esi+2*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(3*4+0)*4]
movhps xmm5, [edi+(3*4+2)*4]
movss xmm6, [esi+3*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
movlps xmm4, [edi+(4*4+0)*4]
movhps xmm4, [edi+(4*4+2)*4]
movss xmm6, [esi+4*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm6
addps xmm3, xmm4
movlps xmm5, [edi+(5*4+0)*4]
movhps xmm5, [edi+(5*4+2)*4]
movss xmm6, [esi+5*4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
}
return;
}
case 5: { // 6x5 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm6, [edi+(0*5+0)*4]
movhps xmm6, [edi+(0*5+2)*4]
movss xmm0, [esi+0*4]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movlps xmm7, [edi+(1*5+0)*4]
movhps xmm7, [edi+(1*5+2)*4]
movss xmm1, [esi+1*4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movlps xmm7, [edi+(2*5+0)*4]
movhps xmm7, [edi+(2*5+2)*4]
movss xmm2, [esi+2*4]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
movlps xmm7, [edi+(3*5+0)*4]
movhps xmm7, [edi+(3*5+2)*4]
movss xmm3, [esi+3*4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm3
addps xmm6, xmm7
movlps xmm7, [edi+(4*5+0)*4]
movhps xmm7, [edi+(4*5+2)*4]
movss xmm4, [esi+4*4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movlps xmm7, [edi+(5*5+0)*4]
movhps xmm7, [edi+(5*5+2)*4]
movss xmm5, [esi+5*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
STORE4( 0, xmm6, xmm7 )
movss xmm6, [edi+(0*5+4)*4]
mulss xmm6, xmm0
movss xmm7, [edi+(1*5+4)*4]
mulss xmm7, xmm1
addss xmm6, xmm7
movss xmm7, [edi+(2*5+4)*4]
mulss xmm7, xmm2
addss xmm6, xmm7
movss xmm7, [edi+(3*5+4)*4]
mulss xmm7, xmm3
addss xmm6, xmm7
movss xmm7, [edi+(4*5+4)*4]
mulss xmm7, xmm4
addss xmm6, xmm7
movss xmm7, [edi+(5*5+4)*4]
mulss xmm7, xmm5
addss xmm6, xmm7
STORE1( 16, xmm6, xmm7 )
}
return;
}
case 6: { // 6x6 * 6x1
__asm {
mov esi, vPtr
mov edi, mPtr
mov eax, dstPtr
movlps xmm0, [esi+0*4]
movlps xmm1, [esi+2*4]
movlps xmm2, [esi+4*4]
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, [edi+(0*6+0)*4]
movlps xmm5, [edi+(1*6+0)*4]
movhps xmm5, [edi+(1*6+2)*4]
movaps xmm6, xmm0
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(2*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(3*6+0)*4]
movhps xmm5, [edi+(3*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, [edi+(4*6+0)*4]
addps xmm3, xmm6
movaps xmm6, xmm2
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
movlps xmm5, [edi+(5*6+0)*4]
movhps xmm5, [edi+(5*6+2)*4]
mulps xmm5, xmm6
addps xmm3, xmm5
STORE4( 0, xmm3, xmm7 )
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
movlps xmm3, [edi+(0*6+4)*4]
movhps xmm3, [edi+(1*6+4)*4]
mulps xmm3, xmm0
movlps xmm4, [edi+(2*6+4)*4]
movhps xmm4, [edi+(3*6+4)*4]
mulps xmm4, xmm1
addps xmm3, xmm4
movlps xmm5, [edi+(4*6+4)*4]
movhps xmm5, [edi+(5*6+4)*4]
mulps xmm5, xmm2
addps xmm3, xmm5
movhlps xmm4, xmm3
addps xmm3, xmm4
STORE2LO( 16, xmm3, xmm7 )
}
return;
}
default: {
for ( int i = 0; i < numColumns; i++ ) {
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
mPtr++;
}
return;
}
}
break;
default:
int numRows = mat.GetNumRows();
for ( int i = 0; i < numColumns; i++ ) {
mPtr = mat.ToFloatPtr() + i;
float sum = mPtr[0] * vPtr[0];
for ( int j = 1; j < numRows; j++ ) {
mPtr += numColumns;
sum += mPtr[0] * vPtr[j];
}
dstPtr[i] STOREC sum;
}
break;
}
#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
/*
============
idSIMD_SSE::MatX_MultiplyMatX
optimizes the following matrix multiplications:
NxN * Nx6
6xN * Nx6
Nx6 * 6xN
6x6 * 6xN
with N in the range [1-6].
The hot cache clock cycle counts are generally better for the SIMD version
than for the FPU version; at times up to 40% fewer clock cycles on a P3.
In practice, however, the results are poor, probably due to memory access.
============
*/
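// Scalar reference for all the cases below ( cf. the generic fall-back
// paths ):
//
//	dst[i][j] = sum( m1[i][n] * m2[n][j] ) over n = 0 .. m1.GetNumColumns()-1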
void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
int i, j, k, l, n;
float *dstPtr;
const float *m1Ptr, *m2Ptr;
double sum;
assert( m1.GetNumColumns() == m2.GetNumRows() );
dstPtr = dst.ToFloatPtr();
m1Ptr = m1.ToFloatPtr();
m2Ptr = m2.ToFloatPtr();
k = m1.GetNumRows();
l = m2.GetNumColumns();
n = m1.GetNumColumns();
switch( n ) {
case 1: {
if ( !(l^6) ) {
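// !(l^6) tests l == 6: the XOR is zero exactly when the operands are equal.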
switch( k ) {
case 1: { // 1x1 * 1x6, no precision loss compared to FPU version
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movss xmm0, [edi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, [esi]
mulps xmm1, xmm0
movaps [eax], xmm1
movlps xmm2, [esi+16]
mulps xmm2, xmm0
movlps [eax+16], xmm2
}
return;
}
case 6: { // 6x1 * 1x6, no precision loss compared to FPU version
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
xorps xmm1, xmm1
movaps xmm0, [edi]
movlps xmm1, [edi+16]
movlhps xmm1, xmm0
movhlps xmm2, xmm0
movlhps xmm2, xmm1
// row 0 and 1
movaps xmm3, [esi]
movaps xmm4, xmm3
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm5, xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
movaps xmm6, xmm3
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
movaps [eax], xmm4
movaps [eax+16], xmm5
movaps [eax+32], xmm6
// row 2 and 3
movaps xmm4, xmm3
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
movaps xmm5, xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm3, xmm2
movaps [eax+48], xmm4
movaps [eax+64], xmm5
movaps [eax+80], xmm3
// row 4 and 5
movlps xmm3, [esi+16]
movaps xmm4, xmm3
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm5, xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm3, xmm2
movaps [eax+96], xmm4
movaps [eax+112], xmm5
movaps [eax+128], xmm3
}
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
m2Ptr++;
}
m1Ptr++;
}
break;
}
case 2: {
if ( !(l^6) ) {
switch( k ) {
case 2: { // 2x2 * 2x6
#define MUL_Nx2_2x6_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movlps xmm1, [esi+16] \
__asm movhps xmm1, [esi+40] \
__asm movlps xmm2, [esi+24] \
__asm movhps xmm2, [esi+32]
#define MUL_Nx2_2x6_ROW2( row ) \
__asm movaps xmm3, [edi+row*16] \
__asm movaps xmm5, xmm0 \
__asm movaps xmm4, xmm3 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm4 \
__asm movaps xmm4, xmm3 \
__asm movaps xmm6, xmm2 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm5, xmm6 \
__asm movaps [eax+row*48], xmm5 \
__asm movaps xmm4, xmm3 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movaps xmm7, xmm1 \
__asm mulps xmm7, xmm4 \
__asm movaps xmm4, xmm3 \
__asm movaps xmm5, xmm0 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \
__asm mulps xmm5, xmm4 \
__asm movaps xmm4, xmm3 \
__asm movaps xmm6, xmm2 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm5, xmm6 \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm movaps xmm6, xmm1 \
__asm mulps xmm6, xmm3 \
__asm movaps xmm4, xmm7 \
__asm movlhps xmm7, xmm6 \
__asm movhlps xmm6, xmm4 \
__asm addps xmm6, xmm7 \
__asm movlps [eax+row*48+16], xmm6 \
__asm movlps [eax+row*48+24], xmm5 \
__asm movhps [eax+row*48+32], xmm5 \
__asm movhps [eax+row*48+40], xmm6
MUL_Nx2_2x6_INIT
MUL_Nx2_2x6_ROW2( 0 )
return;
}
case 6: { // 6x2 * 2x6
MUL_Nx2_2x6_INIT
MUL_Nx2_2x6_ROW2( 0 )
MUL_Nx2_2x6_ROW2( 1 )
MUL_Nx2_2x6_ROW2( 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
m2Ptr++;
}
m1Ptr += 2;
}
break;
}
case 3: {
if ( !(l^6) ) {
switch( k ) {
case 3: { // 3x3 * 3x6
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movaps xmm5, xmmword ptr [esi]
movlps xmm6, qword ptr [esi+24]
movhps xmm6, qword ptr [esi+32]
movaps xmm7, xmmword ptr [esi+48]
movss xmm0, dword ptr [edi]
shufps xmm0, xmm0, 0
mulps xmm0, xmm5
movss xmm1, dword ptr [edi+4]
shufps xmm1, xmm1, 0
mulps xmm1, xmm6
movss xmm2, dword ptr [edi+8]
shufps xmm2, xmm2, 0
mulps xmm2, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
movaps xmmword ptr [eax], xmm0
movss xmm3, dword ptr [edi+12]
shufps xmm3, xmm3, 0
mulps xmm3, xmm5
movss xmm4, dword ptr [edi+16]
shufps xmm4, xmm4, 0
mulps xmm4, xmm6
movss xmm0, dword ptr [edi+20]
shufps xmm0, xmm0, 0
mulps xmm0, xmm7
addps xmm3, xmm4
addps xmm0, xmm3
movlps qword ptr [eax+24], xmm0
movhps qword ptr [eax+32], xmm0
movss xmm1, dword ptr [edi+24]
shufps xmm1, xmm1, 0
mulps xmm1, xmm5
movss xmm2, dword ptr [edi+28]
shufps xmm2, xmm2, 0
mulps xmm2, xmm6
movss xmm3, dword ptr [edi+32]
shufps xmm3, xmm3, 0
mulps xmm3, xmm7
addps xmm1, xmm2
addps xmm1, xmm3
movaps xmmword ptr [eax+48], xmm1
movlps xmm5, qword ptr [esi+16]
movlps xmm6, qword ptr [esi+40]
movlps xmm7, qword ptr [esi+64]
shufps xmm5, xmm5, 0x44
shufps xmm6, xmm6, 0x44
shufps xmm7, xmm7, 0x44
movaps xmm3, xmmword ptr [edi]
movlps xmm4, qword ptr [edi+16]
movaps xmm0, xmm3
shufps xmm0, xmm0, 0xF0
mulps xmm0, xmm5
movaps xmm1, xmm3
shufps xmm1, xmm4, 0x05
mulps xmm1, xmm6
shufps xmm3, xmm4, 0x5A
mulps xmm3, xmm7
addps xmm1, xmm0
addps xmm1, xmm3
movlps qword ptr [eax+16], xmm1
movhps qword ptr [eax+40], xmm1
movss xmm0, dword ptr [edi+24]
shufps xmm0, xmm0, 0
mulps xmm0, xmm5
movss xmm2, dword ptr [edi+28]
shufps xmm2, xmm2, 0
mulps xmm2, xmm6
movss xmm4, dword ptr [edi+32]
shufps xmm4, xmm4, 0
mulps xmm4, xmm7
addps xmm0, xmm2
addps xmm0, xmm4
movlps qword ptr [eax+64], xmm0
}
return;
}
case 6: { // 6x3 * 3x6
#define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4]
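// xmm0-xmm2 hold columns 0-3 of the three 3x6 m2 rows; columns 4-5 are
// produced by the LAST2COLUMNS macro further down.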
#define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \
__asm movss xmm3, [edi+(row*3+0)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm3, xmm0 \
__asm movss xmm4, [edi+(row*3+1)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm4, xmm1 \
__asm addps xmm3, xmm4 \
__asm movss xmm5, [edi+(row*3+2)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm5 \
__asm movlps [eax+(row*6+0)*4], xmm3 \
__asm movhps [eax+(row*6+2)*4], xmm3
#define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm movlps xmm2, [esi+16*4] \
__asm shufps xmm0, xmm0, 0x44 \
__asm shufps xmm1, xmm1, 0x44 \
__asm shufps xmm2, xmm2, 0x44 \
__asm movlps xmm3, [edi+0*4] \
__asm movhps xmm3, [edi+2*4] \
__asm movaps xmm4, xmm3 \
__asm movaps xmm5, xmm3 \
__asm shufps xmm3, xmm3, 0xF0 \
__asm mulps xmm3, xmm0 \
__asm movlps xmm6, [edi+4*4] \
__asm movhps xmm6, [edi+6*4] \
__asm shufps xmm4, xmm6, 0x05 \
__asm mulps xmm4, xmm1 \
__asm addps xmm3, xmm4 \
__asm shufps xmm5, xmm6, 0x5A \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm5 \
__asm movlps [eax+4*4], xmm3 \
__asm movhps [eax+10*4], xmm3 \
__asm movaps xmm5, xmm6 \
__asm movlps xmm3, [edi+8*4] \
__asm movhps xmm3, [edi+10*4] \
__asm movaps xmm4, xmm3 \
__asm shufps xmm5, xmm3, 0x5A \
__asm mulps xmm5, xmm0 \
__asm shufps xmm6, xmm3, 0xAF \
__asm mulps xmm6, xmm1 \
__asm addps xmm5, xmm6 \
__asm shufps xmm4, xmm4, 0xF0 \
__asm mulps xmm4, xmm2 \
__asm addps xmm4, xmm5 \
__asm movlps [eax+16*4], xmm4 \
__asm movhps [eax+22*4], xmm4 \
__asm movlps xmm6, [edi+12*4] \
__asm movhps xmm6, [edi+14*4] \
__asm movaps xmm5, xmm6 \
__asm movaps xmm4, xmm6 \
__asm shufps xmm6, xmm6, 0xF0 \
__asm mulps xmm6, xmm0 \
__asm movlps xmm3, [edi+16*4] \
__asm shufps xmm5, xmm3, 0x05 \
__asm mulps xmm5, xmm1 \
__asm addps xmm5, xmm6 \
__asm shufps xmm4, xmm3, 0x5A \
__asm mulps xmm4, xmm2 \
__asm addps xmm4, xmm5 \
__asm movlps [eax+28*4], xmm4 \
__asm movhps [eax+34*4], xmm4
MUL_Nx3_3x6_FIRST4COLUMNS_INIT
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
MUL_Nx3_3x6_LAST2COLUMNS_ROW6
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
m2Ptr++;
}
m1Ptr += 3;
}
break;
}
case 4: {
if ( !(l^6) ) {
switch( k ) {
case 4: { // 4x4 * 4x6
#define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4] \
__asm movlps xmm3, [esi+18*4] \
__asm movhps xmm3, [esi+20*4]
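// xmm0-xmm3 hold columns 0-3 of the four 4x6 m2 rows; columns 4-5 are
// handled by the LAST2COLUMNS macros.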
#define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \
__asm movss xmm4, [edi+row*16+0*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm4, xmm0 \
__asm movss xmm5, [edi+row*16+1*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm4, xmm5 \
__asm movss xmm6, [edi+row*16+2*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm4, xmm6 \
__asm movss xmm7, [edi+row*16+3*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm3 \
__asm addps xmm4, xmm7 \
__asm movlps [eax+row*24+0], xmm4 \
__asm movhps [eax+row*24+8], xmm4
#define MUL_Nx4_4x6_LAST2COLUMNS_INIT \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm movlps xmm2, [esi+16*4] \
__asm movlps xmm3, [esi+22*4] \
__asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
#define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \
__asm movlps xmm7, [edi+row*32+ 0*4] \
__asm movhps xmm7, [edi+row*32+ 4*4] \
__asm movaps xmm6, xmm7 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \
__asm mulps xmm6, xmm0 \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \
__asm mulps xmm7, xmm1 \
__asm addps xmm6, xmm7 \
__asm movlps xmm4, [edi+row*32+ 2*4] \
__asm movhps xmm4, [edi+row*32+ 6*4] \
__asm movaps xmm5, xmm4 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \
__asm mulps xmm4, xmm3 \
__asm addps xmm6, xmm4 \
__asm movlps [eax+row*48+ 4*4], xmm6 \
__asm movhps [eax+row*48+10*4], xmm6
MUL_Nx4_4x6_FIRST4COLUMNS_INIT
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx4_4x6_LAST2COLUMNS_INIT
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
return;
}
case 6: { // 6x4 * 4x6
MUL_Nx4_4x6_FIRST4COLUMNS_INIT
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
MUL_Nx4_4x6_LAST2COLUMNS_INIT
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
m1Ptr[3] * m2Ptr[3*l];
m2Ptr++;
}
m1Ptr += 4;
}
break;
}
case 5: {
if ( !(l^6) ) {
switch( k ) {
case 5: { // 5x5 * 5x6
#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4] \
__asm movlps xmm3, [esi+18*4] \
__asm movhps xmm3, [esi+20*4] \
__asm movlps xmm4, [esi+24*4] \
__asm movhps xmm4, [esi+26*4]
#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \
__asm movss xmm6, [edi+row*20+0*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm0 \
__asm movss xmm5, [edi+row*20+1*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+row*20+2*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+row*20+3*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm3 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+row*20+4*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm movlps [eax+row*24+0], xmm6 \
__asm movhps [eax+row*24+8], xmm6
#define MUL_Nx5_5x6_LAST2COLUMNS_INIT \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm movlps xmm2, [esi+16*4] \
__asm movlps xmm3, [esi+22*4] \
__asm movlps xmm4, [esi+28*4] \
__asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \
__asm movlps xmm7, [edi+row*40+ 0*4] \
__asm movhps xmm7, [edi+row*40+ 6*4] \
__asm movaps xmm6, xmm7 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \
__asm mulps xmm6, xmm0 \
__asm movaps xmm5, xmm7 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm6, xmm5 \
__asm movlps xmm7, [edi+row*40+ 2*4] \
__asm movhps xmm7, [edi+row*40+ 8*4] \
__asm movaps xmm5, xmm7 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm movaps xmm5, xmm7 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
__asm mulps xmm5, xmm3 \
__asm addps xmm6, xmm5 \
__asm movlps xmm5, [edi+row*40+ 4*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm movlps [eax+row*48+ 4*4], xmm6 \
__asm movhps [eax+row*48+10*4], xmm6
#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row ) \
__asm movlps xmm6, [edi+20*4+0*4] \
__asm unpcklps xmm6, xmm6 \
__asm mulps xmm6, xmm0 \
__asm movlps xmm5, [edi+20*4+2*4] \
__asm unpcklps xmm5, xmm5 \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm movss xmm5, [edi+20*4+4*4] \
__asm unpcklps xmm5, xmm5 \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm movhlps xmm7, xmm6 \
__asm addps xmm6, xmm7 \
__asm movlps [eax+row*24+4*4], xmm6
MUL_Nx5_5x6_FIRST4COLUMNS_INIT
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx5_5x6_LAST2COLUMNS_INIT
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
return;
}
case 6: { // 6x5 * 5x6
MUL_Nx5_5x6_FIRST4COLUMNS_INIT
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
MUL_Nx5_5x6_LAST2COLUMNS_INIT
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
m2Ptr++;
}
m1Ptr += 5;
}
break;
}
case 6: {
switch( k ) {
case 1: {
if ( !(l^1) ) { // 1x6 * 6x1
dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
return;
}
break;
}
case 2: {
if ( !(l^2) ) { // 2x6 * 6x2
#define MUL_Nx6_6x2_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movaps xmm1, [esi+16] \
__asm movaps xmm2, [esi+32]
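// The three movaps above pull the entire 6x2 m2 ( 12 floats ) into
// xmm0-xmm2; each ROW2 invocation then emits two rows of dst at once.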
#define MUL_Nx6_6x2_ROW2( row ) \
__asm movaps xmm7, [edi+row*48+0*4] \
__asm movaps xmm6, xmm7 \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm7, xmm0 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movaps xmm6, [edi+row*48+4*4] \
__asm movaps xmm5, xmm6 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm mulps xmm5, xmm0 \
__asm movaps xmm6, [edi+row*48+24+2*4] \
__asm movaps xmm4, xmm6 \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm5, xmm6 \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \
__asm mulps xmm4, xmm2 \
__asm addps xmm5, xmm4 \
__asm movaps xmm4, xmm5 \
__asm movhlps xmm5, xmm7 \
__asm movlhps xmm7, xmm4 \
__asm addps xmm7, xmm5 \
__asm movaps [eax+row*16], xmm7
MUL_Nx6_6x2_INIT
MUL_Nx6_6x2_ROW2( 0 )
return;
}
break;
}
case 3: {
if ( !(l^3) ) { // 3x6 * 6x3
#define MUL_Nx6_6x3_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movss xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 1*4] \
__asm movss xmm1, [esi+ 3*4] \
__asm movhps xmm1, [esi+ 4*4] \
__asm movss xmm2, [esi+ 6*4] \
__asm movhps xmm2, [esi+ 7*4] \
__asm movss xmm3, [esi+ 9*4] \
__asm movhps xmm3, [esi+10*4] \
__asm movss xmm4, [esi+12*4] \
__asm movhps xmm4, [esi+13*4] \
__asm movss xmm5, [esi+15*4] \
__asm movhps xmm5, [esi+16*4]
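// Each movss/movhps pair above places one 3-wide m2 row into lanes 0, 2
// and 3 of a register ( lane 1 stays unused ), matching the movss/movhps
// store pattern at the end of the ROW macro.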
#define MUL_Nx6_6x3_ROW( row ) \
__asm movss xmm7, [edi+row*24+0] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+row*24+4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+8] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+12] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+16] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+20] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movss [eax+row*12+0], xmm7 \
__asm movhps [eax+row*12+4], xmm7
MUL_Nx6_6x3_INIT
MUL_Nx6_6x3_ROW( 0 )
MUL_Nx6_6x3_ROW( 1 )
MUL_Nx6_6x3_ROW( 2 )
return;
}
break;
}
case 4: {
if ( !(l^4) ) { // 4x6 * 6x4
#define MUL_Nx6_6x4_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movaps xmm1, [esi+16] \
__asm movaps xmm2, [esi+32] \
__asm movaps xmm3, [esi+48] \
__asm movaps xmm4, [esi+64] \
__asm movaps xmm5, [esi+80]
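// xmm0-xmm5 hold the six rows of the 6x4 m2, one row per register, so each
// dst row is the sum of six broadcast multiplies.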
#define MUL_Nx6_6x4_ROW( row ) \
__asm movss xmm7, [edi+row*24+0] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+row*24+4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+8] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+12] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+16] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+row*24+20] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movaps [eax+row*16], xmm7
MUL_Nx6_6x4_INIT
MUL_Nx6_6x4_ROW( 0 )
MUL_Nx6_6x4_ROW( 1 )
MUL_Nx6_6x4_ROW( 2 )
MUL_Nx6_6x4_ROW( 3 )
return;
}
break;
}
case 5: {
if ( !(l^5) ) { // 5x6 * 6x5
#define MUL_Nx6_6x5_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movlps xmm1, [esi+20] \
__asm movhps xmm1, [esi+28] \
__asm movlps xmm2, [esi+40] \
__asm movhps xmm2, [esi+48] \
__asm movlps xmm3, [esi+60] \
__asm movhps xmm3, [esi+68] \
__asm movaps xmm4, [esi+80] \
__asm movlps xmm5, [esi+100] \
__asm movhps xmm5, [esi+108]
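// xmm0-xmm5 hold the first four columns of each 6x5 m2 row; the fifth
// column does not fill an SSE register and is accumulated on the x87 stack
// ( the fld/fmul/faddp sequences interleaved in the ROW macro below ).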
#define MUL_Nx6_6x5_ROW( row ) \
__asm movss xmm7, [edi+row*24+0] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm fld dword ptr [edi+(row*6+0)*4] \
__asm fmul dword ptr [esi+(4+0*5)*4] \
__asm movss xmm6, [edi+row*24+4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+1)*4] \
__asm fmul dword ptr [esi+(4+1*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+8] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+2)*4] \
__asm fmul dword ptr [esi+(4+2*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+12] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+3)*4] \
__asm fmul dword ptr [esi+(4+3*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+16] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+4)*4] \
__asm fmul dword ptr [esi+(4+4*5)*4] \
__asm faddp st(1),st \
__asm movss xmm6, [edi+row*24+20] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm fld dword ptr [edi+(row*6+5)*4] \
__asm fmul dword ptr [esi+(4+5*5)*4] \
__asm faddp st(1),st \
__asm fstp dword ptr [eax+(row*5+4)*4] \
__asm movlps [eax+row*20], xmm7 \
__asm movhps [eax+row*20+8], xmm7
MUL_Nx6_6x5_INIT
MUL_Nx6_6x5_ROW( 0 )
MUL_Nx6_6x5_ROW( 1 )
MUL_Nx6_6x5_ROW( 2 )
MUL_Nx6_6x5_ROW( 3 )
MUL_Nx6_6x5_ROW( 4 )
return;
}
break;
}
case 6: {
switch( l ) {
case 1: { // 6x6 * 6x1
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movlps xmm7, qword ptr [esi]
movlps xmm6, qword ptr [esi+8]
shufps xmm7, xmm7, 0x44
shufps xmm6, xmm6, 0x44
movlps xmm0, qword ptr [edi ]
movhps xmm0, qword ptr [edi+ 24]
mulps xmm0, xmm7
movlps xmm3, qword ptr [edi+ 8]
movhps xmm3, qword ptr [edi+ 32]
mulps xmm3, xmm6
movlps xmm1, qword ptr [edi+ 48]
movhps xmm1, qword ptr [edi+ 72]
mulps xmm1, xmm7
movlps xmm2, qword ptr [edi+ 96]
movhps xmm2, qword ptr [edi+120]
mulps xmm2, xmm7
movlps xmm4, qword ptr [edi+ 56]
movhps xmm4, qword ptr [edi+ 80]
movlps xmm5, qword ptr [edi+104]
movhps xmm5, qword ptr [edi+128]
mulps xmm4, xmm6
movlps xmm7, qword ptr [esi+16]
addps xmm0, xmm3
shufps xmm7, xmm7, 0x44
mulps xmm5, xmm6
addps xmm1, xmm4
movlps xmm3, qword ptr [edi+ 16]
movhps xmm3, qword ptr [edi+ 40]
addps xmm2, xmm5
movlps xmm4, qword ptr [edi+ 64]
movhps xmm4, qword ptr [edi+ 88]
mulps xmm3, xmm7
movlps xmm5, qword ptr [edi+112]
movhps xmm5, qword ptr [edi+136]
addps xmm0, xmm3
mulps xmm4, xmm7
mulps xmm5, xmm7
addps xmm1, xmm4
addps xmm2, xmm5
movaps xmm6, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm6, xmm1, 0xDD
movaps xmm7, xmm2
shufps xmm7, xmm2, 0x88
shufps xmm2, xmm2, 0xDD
addps xmm0, xmm6
addps xmm2, xmm7
movlps [eax], xmm0
movhps [eax+8], xmm0
movlps [eax+16], xmm2
}
return;
}
case 2: { // 6x6 * 6x2
MUL_Nx6_6x2_INIT
MUL_Nx6_6x2_ROW2( 0 )
MUL_Nx6_6x2_ROW2( 1 )
MUL_Nx6_6x2_ROW2( 2 )
return;
}
case 3: { // 6x6 * 6x3
MUL_Nx6_6x3_INIT
MUL_Nx6_6x3_ROW( 0 )
MUL_Nx6_6x3_ROW( 1 )
MUL_Nx6_6x3_ROW( 2 )
MUL_Nx6_6x3_ROW( 3 )
MUL_Nx6_6x3_ROW( 4 )
MUL_Nx6_6x3_ROW( 5 )
return;
}
case 4: { // 6x6 * 6x4
MUL_Nx6_6x4_INIT
MUL_Nx6_6x4_ROW( 0 )
MUL_Nx6_6x4_ROW( 1 )
MUL_Nx6_6x4_ROW( 2 )
MUL_Nx6_6x4_ROW( 3 )
MUL_Nx6_6x4_ROW( 4 )
MUL_Nx6_6x4_ROW( 5 )
return;
}
case 5: { // 6x6 * 6x5
MUL_Nx6_6x5_INIT
MUL_Nx6_6x5_ROW( 0 )
MUL_Nx6_6x5_ROW( 1 )
MUL_Nx6_6x5_ROW( 2 )
MUL_Nx6_6x5_ROW( 3 )
MUL_Nx6_6x5_ROW( 4 )
MUL_Nx6_6x5_ROW( 5 )
return;
}
case 6: { // 6x6 * 6x6
__asm {
mov ecx, dword ptr m2Ptr
movlps xmm3, qword ptr [ecx+72]
mov edx, dword ptr m1Ptr
// Loading first 4 columns (upper 4 rows) of m2Ptr.
movaps xmm0, xmmword ptr [ecx]
movlps xmm1, qword ptr [ecx+24]
movhps xmm1, qword ptr [ecx+32]
movaps xmm2, xmmword ptr [ecx+48]
movhps xmm3, qword ptr [ecx+80]
// Calculating first 4 elements in the first row of the destination matrix.
movss xmm4, dword ptr [edx]
movss xmm5, dword ptr [edx+4]
mov eax, dword ptr dstPtr
shufps xmm4, xmm4, 0
movss xmm6, dword ptr [edx+8]
shufps xmm5, xmm5, 0
movss xmm7, dword ptr [edx+12]
mulps xmm4, xmm0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm5, xmm1
mulps xmm6, xmm2
addps xmm5, xmm4
mulps xmm7, xmm3
addps xmm6, xmm5
addps xmm7, xmm6
movaps xmmword ptr [eax], xmm7
// Calculating first 4 elements in the second row of the destination matrix.
movss xmm4, dword ptr [edx+24]
shufps xmm4, xmm4, 0
mulps xmm4, xmm0
movss xmm5, dword ptr [edx+28]
shufps xmm5, xmm5, 0
mulps xmm5, xmm1
movss xmm6, dword ptr [edx+32]
shufps xmm6, xmm6, 0
movss xmm7, dword ptr [edx+36]
shufps xmm7, xmm7, 0
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm7, xmm6
addps xmm5, xmm4
addps xmm7, xmm5
// Calculating first 4 elements in the third row of the destination matrix.
movss xmm4, dword ptr [edx+48]
movss xmm5, dword ptr [edx+52]
movlps qword ptr [eax+24], xmm7 ; save 2nd
movhps qword ptr [eax+32], xmm7 ; row
movss xmm6, dword ptr [edx+56]
movss xmm7, dword ptr [edx+60]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm5, xmm4
addps xmm7, xmm6
addps xmm7, xmm5
movaps xmmword ptr [eax+48], xmm7
// Calculating first 4 elements in the fourth row of the destination matrix.
movss xmm4, dword ptr [edx+72]
movss xmm5, dword ptr [edx+76]
movss xmm6, dword ptr [edx+80]
movss xmm7, dword ptr [edx+84]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm6, xmm4
addps xmm7, xmm6
movlps qword ptr [eax+72], xmm7
movhps qword ptr [eax+80], xmm7
// Calculating first 4 elements in the fifth row of the destination matrix.
movss xmm4, dword ptr [edx+96]
movss xmm5, dword ptr [edx+100]
movss xmm6, dword ptr [edx+104]
movss xmm7, dword ptr [edx+108]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm5, xmm4
addps xmm7, xmm6
addps xmm7, xmm5
movaps xmmword ptr [eax+96], xmm7
// Calculating first 4 elements in the sixth row of the destination matrix.
movss xmm4, dword ptr [edx+120]
movss xmm5, dword ptr [edx+124]
movss xmm6, dword ptr [edx+128]
movss xmm7, dword ptr [edx+132]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm6, xmm6, 0
shufps xmm7, xmm7, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm6, xmm2
mulps xmm7, xmm3
addps xmm4, xmm5
addps xmm6, xmm4
addps xmm7, xmm6
movhps qword ptr [eax+128], xmm7
movlps qword ptr [eax+120], xmm7
// Loading first 4 columns (lower 2 rows) of m2Ptr.
movlps xmm0, qword ptr [ecx+96]
movhps xmm0, qword ptr [ecx+104]
movlps xmm1, qword ptr [ecx+120]
movhps xmm1, qword ptr [ecx+128]
// Calculating first 4 elements in the first row of the destination matrix.
movss xmm2, dword ptr [edx+16]
shufps xmm2, xmm2, 0
movss xmm4, dword ptr [edx+40]
movss xmm3, dword ptr [edx+20]
movss xmm5, dword ptr [edx+44]
movaps xmm6, xmmword ptr [eax]
movlps xmm7, qword ptr [eax+24]
shufps xmm3, xmm3, 0
shufps xmm5, xmm5, 0
movhps xmm7, qword ptr [eax+32]
shufps xmm4, xmm4, 0
mulps xmm5, xmm1
mulps xmm2, xmm0
mulps xmm3, xmm1
mulps xmm4, xmm0
addps xmm6, xmm2
addps xmm7, xmm4
addps xmm7, xmm5
addps xmm6, xmm3
movlps qword ptr [eax+24], xmm7
movaps xmmword ptr [eax], xmm6
movhps qword ptr [eax+32], xmm7
// Calculating first 4 elements in the third row of the destination matrix.
movss xmm2, dword ptr [edx+64]
movss xmm4, dword ptr [edx+88]
movss xmm5, dword ptr [edx+92]
movss xmm3, dword ptr [edx+68]
movaps xmm6, xmmword ptr [eax+48]
movlps xmm7, qword ptr [eax+72]
movhps xmm7, qword ptr [eax+80]
shufps xmm2, xmm2, 0
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
shufps xmm3, xmm3, 0
mulps xmm2, xmm0
mulps xmm4, xmm0
mulps xmm5, xmm1
mulps xmm3, xmm1
addps xmm6, xmm2
addps xmm6, xmm3
addps xmm7, xmm4
addps xmm7, xmm5
movlps qword ptr [eax+72], xmm7
movaps xmmword ptr [eax+48], xmm6
movhps qword ptr [eax+80], xmm7
// Calculating first 4 elements in the fifth row of the destination matrix.
movss xmm2, dword ptr [edx+112]
movss xmm3, dword ptr [edx+116]
movaps xmm6, xmmword ptr [eax+96]
shufps xmm2, xmm2, 0
shufps xmm3, xmm3, 0
mulps xmm2, xmm0
mulps xmm3, xmm1
addps xmm6, xmm2
addps xmm6, xmm3
movaps xmmword ptr [eax+96], xmm6
// Calculating first 4 elements in the sixth row of the destination matrix.
movss xmm4, dword ptr [edx+136]
movss xmm5, dword ptr [edx+140]
movhps xmm7, qword ptr [eax+128]
movlps xmm7, qword ptr [eax+120]
shufps xmm4, xmm4, 0
shufps xmm5, xmm5, 0
mulps xmm4, xmm0
mulps xmm5, xmm1
addps xmm7, xmm4
addps xmm7, xmm5
// Calculating last 2 columns of the destination matrix.
movlps xmm0, qword ptr [ecx+16]
movhps xmm0, qword ptr [ecx+40]
movhps qword ptr [eax+128], xmm7
movlps qword ptr [eax+120], xmm7
movlps xmm2, qword ptr [ecx+64]
movhps xmm2, qword ptr [ecx+88]
movaps xmm3, xmm2
shufps xmm3, xmm3, 4Eh
movlps xmm4, qword ptr [ecx+112]
movhps xmm4, qword ptr [ecx+136]
movaps xmm5, xmm4
shufps xmm5, xmm5, 4Eh
movlps xmm6, qword ptr [edx]
movhps xmm6, qword ptr [edx+24]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movlps xmm6, qword ptr [edx+8]
movhps xmm6, qword ptr [edx+32]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movhps xmm6, qword ptr [edx+40]
movlps xmm6, qword ptr [edx+16]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+16], xmm7
movhps qword ptr [eax+40], xmm7
movlps xmm6, qword ptr [edx+48]
movhps xmm6, qword ptr [edx+72]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movhps xmm6, qword ptr [edx+80]
movlps xmm6, qword ptr [edx+56]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movlps xmm6, qword ptr [edx+64]
movhps xmm6, qword ptr [edx+88]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+64], xmm7
movhps qword ptr [eax+88], xmm7
movlps xmm6, qword ptr [edx+96]
movhps xmm6, qword ptr [edx+120]
movaps xmm7, xmm6
shufps xmm7, xmm7, 0F0h
mulps xmm7, xmm0
shufps xmm6, xmm6, 0A5h
movaps xmm1, xmm0
shufps xmm1, xmm1, 4Eh
mulps xmm1, xmm6
addps xmm7, xmm1
movlps xmm6, qword ptr [edx+104]
movhps xmm6, qword ptr [edx+128]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm2
mulps xmm6, xmm3
addps xmm7, xmm1
addps xmm7, xmm6
movlps xmm6, qword ptr [edx+112]
movhps xmm6, qword ptr [edx+136]
movaps xmm1, xmm6
shufps xmm1, xmm1, 0F0h
shufps xmm6, xmm6, 0A5h
mulps xmm1, xmm4
mulps xmm6, xmm5
addps xmm7, xmm1
addps xmm7, xmm6
movlps qword ptr [eax+112], xmm7
movhps qword ptr [eax+136], xmm7
}
return;
}
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
m2Ptr++;
}
m1Ptr += 6;
}
break;
}
default: {
for ( i = 0; i < k; i++ ) {
for ( j = 0; j < l; j++ ) {
m2Ptr = m2.ToFloatPtr() + j;
sum = m1Ptr[0] * m2Ptr[0];
for ( n = 1; n < m1.GetNumColumns(); n++ ) {
m2Ptr += l;
sum += m1Ptr[n] * m2Ptr[0];
}
*dstPtr++ = sum;
}
m1Ptr += m1.GetNumColumns();
}
break;
}
}
}
/*
============
idSIMD_SSE::MatX_TransposeMultiplyMatX
optimizes the following transpose matrix multiplications:
Nx6 * NxN
6xN * 6x6
with N in the range [1-6].
============
*/
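// Scalar reference ( m1 enters transposed, cf. the generic path at the end
// of the function ):
//
//	dst[i][j] = sum( m1[r][i] * m2[r][j] ) over r = 0 .. m1.GetNumRows()-1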
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
int i, j, k, l, n;
float *dstPtr;
const float *m1Ptr, *m2Ptr;
double sum;
assert( m1.GetNumRows() == m2.GetNumRows() );
m1Ptr = m1.ToFloatPtr();
m2Ptr = m2.ToFloatPtr();
dstPtr = dst.ToFloatPtr();
k = m1.GetNumColumns();
l = m2.GetNumColumns();
switch( m1.GetNumRows() ) {
case 1:
if ( !((k^6)|(l^1)) ) { // 1x6 * 1x1
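// !((k^6)|(l^1)) folds both equality tests into one branch: the OR of the
// XORs is zero only when k == 6 and l == 1.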
__asm {
mov esi, m2Ptr
mov edi, m1Ptr
mov eax, dstPtr
movss xmm0, [esi]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm1, xmm0
mulps xmm0, [edi]
mulps xmm1, [edi+16]
movaps [eax], xmm0
movlps [eax+16], xmm1
}
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
m2Ptr++;
}
m1Ptr++;
}
break;
case 2:
if ( !((k^6)|(l^2)) ) { // 2x6 * 2x2
#define MUL_2xN_2x2_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi] \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm movlps xmm1, [esi+8] \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )
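// xmm0 and xmm1 each hold one 2x2 m2 row duplicated into both halves, so
// every ROW2 invocation computes two rows of dst with a single movaps.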
#define MUL_2xN_2x2_ROW2( N, row ) \
__asm movlps xmm6, [edi+(row+0*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movlps xmm7, [edi+(row+1*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm0 \
__asm mulps xmm7, xmm1 \
__asm addps xmm6, xmm7 \
__asm movaps [eax+(row*2)*4], xmm6
MUL_2xN_2x2_INIT
MUL_2xN_2x2_ROW2( 6, 0 )
MUL_2xN_2x2_ROW2( 6, 2 )
MUL_2xN_2x2_ROW2( 6, 4 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 3:
if ( !((k^6)|(l^3)) ) { // 3x6 * 3x3
#define MUL_3xN_3x3_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movss xmm0, [esi+(0*3+0)*4] \
__asm movhps xmm0, [esi+(0*3+1)*4] \
__asm movss xmm1, [esi+(1*3+0)*4] \
__asm movhps xmm1, [esi+(1*3+1)*4] \
__asm movss xmm2, [esi+(2*3+0)*4] \
__asm movhps xmm2, [esi+(2*3+1)*4]
#define MUL_3xN_3x3_INIT_ROW4 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )
#define MUL_3xN_3x3_ROW4( N, row ) \
__asm movlps xmm3, [edi+(row+0*N+0)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 ) \
__asm movlps xmm4, [edi+(row+1*N+0)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 ) \
__asm movlps xmm5, [edi+(row+2*N+0)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movaps [eax+(row*3+0)*4], xmm3 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm movlps xmm3, [edi+(row+0*N+1)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movlps xmm4, [edi+(row+1*N+1)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm movlps xmm5, [edi+(row+2*N+1)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movaps [eax+(row*3+4)*4], xmm3 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 ) \
__asm movlps xmm3, [edi+(row+0*N+2)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 ) \
__asm movlps xmm4, [edi+(row+1*N+2)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 ) \
__asm movlps xmm5, [edi+(row+2*N+2)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movaps [eax+(row*3+8)*4], xmm3
#define MUL_3xN_3x3_INIT_ROW4_ROW4 \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
#define MUL_3xN_3x3_INIT_ROW4_ROW \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 ) \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )
#define MUL_3xN_3x3_ROW( N, row ) \
__asm movss xmm3, [edi+(row+0*N)*4] \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm movss xmm4, [edi+(row+1*N)*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm movss xmm5, [edi+(row+2*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm3, xmm0 \
__asm mulps xmm4, xmm1 \
__asm mulps xmm5, xmm2 \
__asm addps xmm3, xmm4 \
__asm addps xmm3, xmm5 \
__asm movss [eax+(row*3+0)*4], xmm3 \
__asm movhps [eax+(row*3+1)*4], xmm3
MUL_3xN_3x3_INIT
MUL_3xN_3x3_INIT_ROW4
MUL_3xN_3x3_ROW4( 6, 0 )
MUL_3xN_3x3_INIT_ROW4_ROW
MUL_3xN_3x3_ROW( 6, 4 )
MUL_3xN_3x3_ROW( 6, 5 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 4:
if ( !((k^6)|(l^4)) ) { // 4x6 * 4x4
#define MUL_4xN_4x4_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movaps xmm0, [esi] \
__asm movaps xmm1, [esi+16] \
__asm movaps xmm2, [esi+32] \
__asm movaps xmm3, [esi+48]
#define MUL_4xN_4x4_ROW( N, row ) \
__asm movss xmm7, [edi+(row+0*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+(row+1*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+2*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+3*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movaps [eax+row*16], xmm7
MUL_4xN_4x4_INIT
MUL_4xN_4x4_ROW( 6, 0 )
MUL_4xN_4x4_ROW( 6, 1 )
MUL_4xN_4x4_ROW( 6, 2 )
MUL_4xN_4x4_ROW( 6, 3 )
MUL_4xN_4x4_ROW( 6, 4 )
MUL_4xN_4x4_ROW( 6, 5 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
m1Ptr[3*k] * m2Ptr[3*l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 5:
if ( !((k^6)|(l^5)) ) { // 5x6 * 5x5
#define MUL_5xN_5x5_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 5*4] \
__asm movhps xmm1, [esi+ 7*4] \
__asm movlps xmm2, [esi+10*4] \
__asm movhps xmm2, [esi+12*4] \
__asm movlps xmm3, [esi+15*4] \
__asm movhps xmm3, [esi+17*4] \
__asm movlps xmm4, [esi+20*4] \
__asm movhps xmm4, [esi+22*4]
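// As in the 6x5 multiply above, the fifth m2 column is accumulated on the
// x87 stack inside the ROW macro and stored with fstp.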
#define MUL_5xN_5x5_ROW( N, row ) \
__asm movss xmm6, [edi+(row+0*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm0 \
__asm fld dword ptr [edi+(row+0*N)*4] \
__asm fmul dword ptr [esi+ 4*4] \
__asm movss xmm5, [edi+(row+1*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm1 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+1*N)*4] \
__asm fmul dword ptr [esi+ 9*4] \
__asm faddp st(1),st \
__asm movss xmm5, [edi+(row+2*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm2 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+2*N)*4] \
__asm fmul dword ptr [esi+14*4] \
__asm faddp st(1),st \
__asm movss xmm5, [edi+(row+3*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm3 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+3*N)*4] \
__asm fmul dword ptr [esi+19*4] \
__asm faddp st(1),st \
__asm movss xmm5, [edi+(row+4*N)*4] \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm5, xmm4 \
__asm addps xmm6, xmm5 \
__asm fld dword ptr [edi+(row+4*N)*4] \
__asm fmul dword ptr [esi+24*4] \
__asm faddp st(1),st \
__asm fstp dword ptr [eax+(row*5+4)*4] \
__asm movlps [eax+(row*5+0)*4], xmm6 \
__asm movhps [eax+(row*5+2)*4], xmm6
MUL_5xN_5x5_INIT
MUL_5xN_5x5_ROW( 6, 0 )
MUL_5xN_5x5_ROW( 6, 1 )
MUL_5xN_5x5_ROW( 6, 2 )
MUL_5xN_5x5_ROW( 6, 3 )
MUL_5xN_5x5_ROW( 6, 4 )
MUL_5xN_5x5_ROW( 6, 5 )
return;
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
m2Ptr++;
}
m1Ptr++;
}
break;
case 6:
if ( !(l^6) ) {
switch( k ) {
case 1: { // 6x1 * 6x6
#define MUL_6xN_6x6_FIRST4COLUMNS_INIT \
__asm mov esi, m2Ptr \
__asm mov edi, m1Ptr \
__asm mov eax, dstPtr \
__asm movlps xmm0, [esi+ 0*4] \
__asm movhps xmm0, [esi+ 2*4] \
__asm movlps xmm1, [esi+ 6*4] \
__asm movhps xmm1, [esi+ 8*4] \
__asm movlps xmm2, [esi+12*4] \
__asm movhps xmm2, [esi+14*4] \
__asm movlps xmm3, [esi+18*4] \
__asm movhps xmm3, [esi+20*4] \
__asm movlps xmm4, [esi+24*4] \
__asm movhps xmm4, [esi+26*4] \
__asm movlps xmm5, [esi+30*4] \
__asm movhps xmm5, [esi+32*4]
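// xmm0-xmm5 hold columns 0-3 of the six 6x6 m2 rows; LAST2COLUMNS_INIT
// reloads them with column pairs 4-5 duplicated into both register halves.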
#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row ) \
__asm movss xmm7, [edi+(row+0*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+(row+1*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+2*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+3*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+4*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(row+5*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movlps [eax+(row*6+0)*4], xmm7 \
__asm movhps [eax+(row*6+2)*4], xmm7
#define MUL_6xN_6x6_LAST2COLUMNS_INIT \
__asm movlps xmm0, [esi+ 4*4] \
__asm movlps xmm1, [esi+10*4] \
__asm shufps xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm movlps xmm2, [esi+16*4] \
__asm movlps xmm3, [esi+22*4] \
__asm shufps xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm movlps xmm4, [esi+28*4] \
__asm movlps xmm5, [esi+34*4] \
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row ) \
__asm movlps xmm7, [edi+(row*2+0*N)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm7, xmm0 \
__asm movlps xmm6, [edi+(row*2+1*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+2*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+3*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+4*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movlps xmm6, [edi+(row*2+5*N)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movlps [eax+(row*12+ 4)*4], xmm7 \
__asm movhps [eax+(row*12+10)*4], xmm7
#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row ) \
__asm movss xmm7, [edi+(1*N-1)*4] \
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm7, xmm0 \
__asm movss xmm6, [edi+(2*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm1 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(3*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm2 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(4*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm3 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(5*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm4 \
__asm addps xmm7, xmm6 \
__asm movss xmm6, [edi+(6*N-1)*4] \
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
__asm mulps xmm6, xmm5 \
__asm addps xmm7, xmm6 \
__asm movlps [eax+(row*6+4)*4], xmm7
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )
return;
}
case 2: { // 6x2 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )
return;
}
case 3: { // 6x3 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )
return;
}
case 4: { // 6x4 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )
return;
}
case 5: { // 6x5 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )
return;
}
case 6: { // 6x6 * 6x6
MUL_6xN_6x6_FIRST4COLUMNS_INIT
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
MUL_6xN_6x6_LAST2COLUMNS_INIT
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )
return;
}
}
}
for ( i = 0; i < k; i++ ) {
m2Ptr = m2.ToFloatPtr();
for ( j = 0; j < l; j++ ) {
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
m2Ptr++;
}
m1Ptr++;
}
break;
default:
for ( i = 0; i < k; i++ ) {
for ( j = 0; j < l; j++ ) {
m1Ptr = m1.ToFloatPtr() + i;
m2Ptr = m2.ToFloatPtr() + j;
sum = m1Ptr[0] * m2Ptr[0];
for ( n = 1; n < m1.GetNumRows(); n++ ) {
m1Ptr += k;
m2Ptr += l;
sum += m1Ptr[0] * m2Ptr[0];
}
*dstPtr++ = sum;
}
}
break;
}
}
/*
============
idSIMD_SSE::MatX_LowerTriangularSolve
solves x in Lx = b for the n * n sub-matrix of L
if skip > 0 the first skip elements of x are assumed to be valid already
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
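/*
	For reference, a minimal scalar equivalent of this routine ( same semantics, no
	SIMD; nc is the row stride of L in floats ):

		for ( int i = skip; i < n; i++ ) {
			float sum = b[i];
			const float *row = L.ToFloatPtr() + i * nc;
			for ( int j = 0; j < i; j++ ) {
				sum -= row[j] * x[j];
			}
			x[i] = sum;		// the diagonal is an implicit 1.0f, so no divide
		}
*/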
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
int nc;
const float *lptr;
if ( skip >= n ) {
return;
}
lptr = L.ToFloatPtr();
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
#define NSKIP( n, s ) ((n<<3)|(s&7))
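// NSKIP packs ( n, skip ) into a single switch key: n in the upper bits, skip in the
// low three bits; the cases below deliberately fall through so that entering at
// ( n, skip ) computes x[skip..n-1]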
switch( NSKIP( n, skip ) ) {
case NSKIP( 1, 0 ): x[0] = b[0];
return;
case NSKIP( 2, 0 ): x[0] = b[0];
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
return;
case NSKIP( 3, 0 ): x[0] = b[0];
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
return;
case NSKIP( 4, 0 ): x[0] = b[0];
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
return;
case NSKIP( 5, 0 ): x[0] = b[0];
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
return;
case NSKIP( 6, 0 ): x[0] = b[0];
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
return;
case NSKIP( 7, 0 ): x[0] = b[0];
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
return;
}
return;
}
// process first 4 rows
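// ( deliberate fall-through: entering at case 'skip' completes rows skip..3, after
// which skip is set to 4 )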
switch( skip ) {
case 0: x[0] = b[0];
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
skip = 4;
}
lptr = L[skip];
// this code assumes n > 4
__asm {
push ebx
mov eax, skip // eax = i
shl eax, 2 // eax = i*4
mov edx, n // edx = n
shl edx, 2 // edx = n*4
mov esi, x // esi = x
mov edi, lptr // edi = lptr
add esi, eax
add edi, eax
mov ebx, b // ebx = b
// check for aligned memory
mov ecx, nc
shl ecx, 2
or ecx, esi
or ecx, edi
and ecx, 15
jnz loopurow
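// both the aligned and unaligned loops below compute
// dot = dot( L[i][0..i-1], x[0..i-1] ) in four- and eight-float blocks, finish the
// 0-3 float remainder with scalar loads, then store x[i] = b[i] - dot and advance
// to the next row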
// aligned
looprow:
mov ecx, eax
neg ecx
movaps xmm0, [esi+ecx]
mulps xmm0, [edi+ecx]
add ecx, 12*4
jg donedot8
dot8:
movaps xmm1, [esi+ecx-(8*4)]
mulps xmm1, [edi+ecx-(8*4)]
addps xmm0, xmm1
movaps xmm3, [esi+ecx-(4*4)]
mulps xmm3, [edi+ecx-(4*4)]
addps xmm0, xmm3
add ecx, 8*4
jle dot8
donedot8:
sub ecx, 4*4
jg donedot4
//dot4:
movaps xmm1, [esi+ecx-(4*4)]
mulps xmm1, [edi+ecx-(4*4)]
addps xmm0, xmm1
add ecx, 4*4
donedot4:
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
sub ecx, 4*4
jz dot0
add ecx, 4
jz dot1
add ecx, 4
jz dot2
//dot3:
movss xmm1, [esi-(3*4)]
mulss xmm1, [edi-(3*4)]
addss xmm0, xmm1
dot2:
movss xmm3, [esi-(2*4)]
mulss xmm3, [edi-(2*4)]
addss xmm0, xmm3
dot1:
movss xmm5, [esi-(1*4)]
mulss xmm5, [edi-(1*4)]
addss xmm0, xmm5
dot0:
movss xmm1, [ebx+eax]
subss xmm1, xmm0
movss [esi], xmm1
add eax, 4
cmp eax, edx
jge done
add esi, 4
mov ecx, nc
shl ecx, 2
add edi, ecx
add edi, 4
jmp looprow
// unaligned
loopurow:
mov ecx, eax
neg ecx
movups xmm0, [esi+ecx]
movups xmm1, [edi+ecx]
mulps xmm0, xmm1
add ecx, 12*4
jg doneudot8
udot8:
movups xmm1, [esi+ecx-(8*4)]
movups xmm2, [edi+ecx-(8*4)]
mulps xmm1, xmm2
addps xmm0, xmm1
movups xmm3, [esi+ecx-(4*4)]
movups xmm4, [edi+ecx-(4*4)]
mulps xmm3, xmm4
addps xmm0, xmm3
add ecx, 8*4
jle udot8
doneudot8:
sub ecx, 4*4
jg doneudot4
//udot4:
movups xmm1, [esi+ecx-(4*4)]
movups xmm2, [edi+ecx-(4*4)]
mulps xmm1, xmm2
addps xmm0, xmm1
add ecx, 4*4
doneudot4:
movhlps xmm1, xmm0
addps xmm0, xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm1
sub ecx, 4*4
jz udot0
add ecx, 4
jz udot1
add ecx, 4
jz udot2
//udot3:
movss xmm1, [esi-(3*4)]
movss xmm2, [edi-(3*4)]
mulss xmm1, xmm2
addss xmm0, xmm1
udot2:
movss xmm3, [esi-(2*4)]
movss xmm4, [edi-(2*4)]
mulss xmm3, xmm4
addss xmm0, xmm3
udot1:
movss xmm5, [esi-(1*4)]
movss xmm6, [edi-(1*4)]
mulss xmm5, xmm6
addss xmm0, xmm5
udot0:
movss xmm1, [ebx+eax]
subss xmm1, xmm0
movss [esi], xmm1
add eax, 4
cmp eax, edx
jge done
add esi, 4
mov ecx, nc
shl ecx, 2
add edi, ecx
add edi, 4
jmp loopurow
done:
pop ebx
}
}
/*
============
idSIMD_SSE::MatX_LowerTriangularSolveTranspose
solves x in L'x = b for the n * n sub-matrix of L
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
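/*
	For reference, a minimal scalar equivalent ( back substitution against L' with the
	implicit unit diagonal; nc is the row stride of L in floats ):

		for ( int i = n - 1; i >= 0; i-- ) {
			float sum = b[i];
			for ( int j = i + 1; j < n; j++ ) {
				sum -= L.ToFloatPtr()[j*nc+i] * x[j];
			}
			x[i] = sum;
		}
*/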
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
int nc;
const float *lptr;
lptr = L.ToFloatPtr();
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
switch( n ) {
case 0:
return;
case 1:
x[0] = b[0];
return;
case 2:
x[1] = b[1];
x[0] = b[0] - lptr[1*nc+0] * x[1];
return;
case 3:
x[2] = b[2];
x[1] = b[1] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 4:
x[3] = b[3];
x[2] = b[2] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 5:
x[4] = b[4];
x[3] = b[3] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 6:
x[5] = b[5];
x[4] = b[4] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 7:
x[6] = b[6];
x[5] = b[5] - lptr[6*nc+5] * x[6];
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
}
return;
}
#if 1
int i, j, m;
float *xptr;
double s0;
// if the number of columns is not a multiple of 2 we're screwed for alignment.
// however, if the number of columns is a multiple of 2 but the number of rows to be
// processed is not a multiple of 2 we can still run 8 byte aligned
m = n;
if ( m & 1 ) {
m--;
x[m] = b[m];
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows_1:
movlps xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
movhps xmm0, [ebx+eax*4-8] // load b[i-2], b[i-1]
xor ecx, ecx
sub eax, m
neg eax
jz done4x4_1
process4x4_1: // process 4x4 blocks
movlps xmm2, [edi+0]
movhps xmm2, [edi+8]
add edi, edx
movss xmm1, [esi+4*ecx+0]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm3, [edi+0]
movhps xmm3, [edi+8]
add edi, edx
mulps xmm1, xmm2
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+0]
movhps xmm4, [edi+8]
add edi, edx
mulps xmm1, xmm3
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+8]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm5, [edi+0]
movhps xmm5, [edi+8]
add edi, edx
mulps xmm1, xmm4
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+12]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
add ecx, 4
cmp ecx, eax
mulps xmm1, xmm5
subps xmm0, xmm1
jl process4x4_1
done4x4_1: // process the leftover of the 4 rows
movlps xmm2, [edi+0]
movhps xmm2, [edi+8]
movss xmm1, [esi+4*ecx]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm2
subps xmm0, xmm1
imul ecx, edx
sub edi, ecx
neg eax
add eax, m
sub eax, 4
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
sub edi, edx
movss [esi-4], xmm3 // xptr[-1] = s3
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
subss xmm2, xmm3
movss [esi-8], xmm2 // xptr[-2] = s2
movss xmm6, xmm2
sub edi, edx
subss xmm0, xmm5
subss xmm1, xmm4
mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
subss xmm1, xmm2
movss [esi-12], xmm1 // xptr[-3] = s1
subss xmm0, xmm6
sub edi, edx
cmp eax, 4
mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
subss xmm0, xmm1
movss [esi-16], xmm0 // xptr[-4] = s0
jl done4rows_1
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows_1
done4rows_1:
pop ebx
}
} else {
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
__asm {
push ebx
mov eax, m // eax = i
mov esi, xptr // esi = xptr
mov edi, lptr // edi = lptr
mov ebx, b // ebx = b
mov edx, nc // edx = nc*sizeof(float)
shl edx, 2
process4rows:
movlps xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
movhps xmm0, [ebx+eax*4-8] // load b[i-2], b[i-1]
sub eax, m
jz done4x4
neg eax
xor ecx, ecx
process4x4: // process 4x4 blocks
movlps xmm2, [edi+0]
movhps xmm2, [edi+8]
add edi, edx
movss xmm1, [esi+4*ecx+0]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm3, [edi+0]
movhps xmm3, [edi+8]
add edi, edx
mulps xmm1, xmm2
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm4, [edi+0]
movhps xmm4, [edi+8]
add edi, edx
mulps xmm1, xmm3
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+8]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps xmm5, [edi+0]
movhps xmm5, [edi+8]
add edi, edx
mulps xmm1, xmm4
subps xmm0, xmm1
movss xmm1, [esi+4*ecx+12]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
add ecx, 4
cmp ecx, eax
mulps xmm1, xmm5
subps xmm0, xmm1
jl process4x4
imul ecx, edx
sub edi, ecx
neg eax
done4x4: // process the leftover of the 4 rows
add eax, m
sub eax, 4
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
movaps xmm3, xmm0
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
sub edi, edx
movss [esi-4], xmm3 // xptr[-1] = s3
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
subss xmm2, xmm3
movss [esi-8], xmm2 // xptr[-2] = s2
movss xmm6, xmm2
sub edi, edx
subss xmm0, xmm5
subss xmm1, xmm4
mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
subss xmm1, xmm2
movss [esi-12], xmm1 // xptr[-3] = s1
subss xmm0, xmm6
sub edi, edx
cmp eax, 4
mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
subss xmm0, xmm1
movss [esi-16], xmm0 // xptr[-4] = s0
jl done4rows
sub edi, edx
sub edi, 16
sub esi, 16
jmp process4rows
done4rows:
pop ebx
}
}
// process leftover rows
for ( i = (m&3)-1; i >= 0; i-- ) {
s0 = b[i];
lptr = L[0] + i;
for ( j = i + 1; j < n; j++ ) {
s0 -= lptr[j*nc] * x[j];
}
x[i] = s0;
}
#else
int i, j, m;
double s0, s1, s2, s3, t;
const float *lptr2;
float *xptr, *xptr2;
m = n;
if ( m & 1 ) {
m--;
x[m] = b[m];
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
// process 4 rows at a time
for ( i = m; i >= 4; i -= 4 ) {
s0 = b[i-4];
s1 = b[i-3];
s2 = b[i-2];
s3 = b[i-1];
// process 4x4 blocks
xptr2 = xptr; // x + i;
lptr2 = lptr; // ptr = L[i] + i - 4;
for ( j = 0; j < m-i; j += 4 ) {
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
}
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
// process the leftover of the 4 rows
lptr -= nc;
s0 -= lptr[0] * s3;
s1 -= lptr[1] * s3;
s2 -= lptr[2] * s3;
lptr -= nc;
s0 -= lptr[0] * s2;
s1 -= lptr[1] * s2;
lptr -= nc;
s0 -= lptr[0] * s1;
lptr -= nc;
// store result
xptr[-4] = s0;
xptr[-3] = s1;
xptr[-2] = s2;
xptr[-1] = s3;
// update pointers for next four rows
lptr -= 4;
xptr -= 4;
}
} else {
lptr = L.ToFloatPtr() + m * nc + m - 4;
xptr = x + m;
// process 4 rows at a time
for ( i = m; i >= 4; i -= 4 ) {
s0 = b[i-4];
s1 = b[i-3];
s2 = b[i-2];
s3 = b[i-1];
// process 4x4 blocks
xptr2 = xptr; // x + i;
lptr2 = lptr; // ptr = L[i] + i - 4;
for ( j = 0; j < m-i; j += 4 ) {
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
t = xptr2[0];
s0 -= lptr2[0] * t;
s1 -= lptr2[1] * t;
s2 -= lptr2[2] * t;
s3 -= lptr2[3] * t;
lptr2 += nc;
xptr2++;
}
// process the leftover of the 4 rows
lptr -= nc;
s0 -= lptr[0] * s3;
s1 -= lptr[1] * s3;
s2 -= lptr[2] * s3;
lptr -= nc;
s0 -= lptr[0] * s2;
s1 -= lptr[1] * s2;
lptr -= nc;
s0 -= lptr[0] * s1;
lptr -= nc;
// store result
xptr[-4] = s0;
xptr[-3] = s1;
xptr[-2] = s2;
xptr[-1] = s3;
// update pointers for next four rows
lptr -= 4;
xptr -= 4;
}
}
// process leftover rows
for ( i--; i >= 0; i-- ) {
s0 = b[i];
lptr = L[0] + i;
for ( j = i + 1; j < m; j++ ) {
s0 -= lptr[j*nc] * x[j];
}
x[i] = s0;
}
#endif
}
/*
============
idSIMD_SSE::MatX_LDLTFactor
in-place factorization LDL' of the n * n sub-matrix of mat
the reciprocals of the diagonal elements are stored in invDiag
currently assumes the number of columns of mat is a multiple of 4
============
*/
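/*
	For reference, a minimal scalar sketch of the factorization ( the unrolled and SIMD
	code below computes the same thing blockwise; v and diag are scratch arrays ):

		for ( int i = 0; i < n; i++ ) {
			float *row = mat[i];
			double sum = row[i];
			for ( int k = 0; k < i; k++ ) {
				v[k] = diag[k] * row[k];			// v = D * L[i][0..i-1]
				sum -= v[k] * row[k];
			}
			if ( sum == 0.0f ) {
				return false;
			}
			diag[i] = row[i] = sum;
			invDiag[i] = d = 1.0f / sum;
			for ( int j = i + 1; j < n; j++ ) {		// update column i below the diagonal
				double s = mat[j][i];
				for ( int k = 0; k < i; k++ ) {
					s -= mat[j][k] * v[k];
				}
				mat[j][i] = s * d;
			}
		}
*/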
bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
#if 1
int j, nc;
float *v, *diag, *invDiagPtr, *mptr;
double s0, s1, s2, sum, d;
v = (float *) _alloca16( n * sizeof( float ) );
diag = (float *) _alloca16( n * sizeof( float ) );
invDiagPtr = invDiag.ToFloatPtr();
nc = mat.GetNumColumns();
assert( ( nc & 3 ) == 0 );
if ( n <= 0 ) {
return true;
}
mptr = mat[0];
sum = mptr[0];
if ( sum == 0.0f ) {
return false;
}
diag[0] = sum;
invDiagPtr[0] = d = 1.0f / sum;
if ( n <= 1 ) {
return true;
}
mptr = mat[0];
for ( j = 1; j < n; j++ ) {
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
}
mptr = mat[1];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
sum = mptr[1] - s0;
if ( sum == 0.0f ) {
return false;
}
mat[1][1] = sum;
diag[1] = sum;
invDiagPtr[1] = d = 1.0f / sum;
if ( n <= 2 ) {
return true;
}
mptr = mat[0];
for ( j = 2; j < n; j++ ) {
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
}
mptr = mat[2];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
sum = mptr[2] - s0 - s1;
if ( sum == 0.0f ) {
return false;
}
mat[2][2] = sum;
diag[2] = sum;
invDiagPtr[2] = d = 1.0f / sum;
if ( n <= 3 ) {
return true;
}
mptr = mat[0];
for ( j = 3; j < n; j++ ) {
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
}
mptr = mat[3];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
sum = mptr[3] - s0 - s1 - s2;
if ( sum == 0.0f ) {
return false;
}
mat[3][3] = sum;
diag[3] = sum;
invDiagPtr[3] = d = 1.0f / sum;
if ( n <= 4 ) {
return true;
}
mptr = mat[0];
for ( j = 4; j < n; j++ ) {
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
}
int ncf = nc * sizeof( float );
mptr = mat[0];
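// for each row i >= 4 the loop below computes v[k] = diag[k] * mat[i][k] together
// with the dot product sum = dot( v[0..i-1], mat[i][0..i-1] ), stores the new
// diagonal mat[i][i] = diag[i] = mat[i][i] - sum along with its refined reciprocal,
// and then scales the remainder of column i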
__asm {
xorps xmm2, xmm2
xorps xmm3, xmm3
xorps xmm4, xmm4
push ebx
mov ebx, 4
loopRow:
cmp ebx, n
jge done
mov ecx, ebx // ecx = i
shl ecx, 2 // ecx = i * 4
mov edx, diag // edx = diag
add edx, ecx // edx = &diag[i]
mov edi, ebx // edi = i
imul edi, ncf // edi = i * nc * sizeof( float )
add edi, mptr // edi = mat[i]
add edi, ecx // edi = &mat[i][i]
mov esi, v // esi = v
add esi, ecx // esi = &v[i]
mov eax, invDiagPtr // eax = invDiagPtr
add eax, ecx // eax = &invDiagPtr[i]
neg ecx
movaps xmm0, [edx+ecx]
mulps xmm0, [edi+ecx]
movaps [esi+ecx], xmm0
mulps xmm0, [edi+ecx]
add ecx, 12*4
jg doneDot8
dot8:
movaps xmm1, [edx+ecx-(8*4)]
mulps xmm1, [edi+ecx-(8*4)]
movaps [esi+ecx-(8*4)], xmm1
mulps xmm1, [edi+ecx-(8*4)]
addps xmm0, xmm1
movaps xmm2, [edx+ecx-(4*4)]
mulps xmm2, [edi+ecx-(4*4)]
movaps [esi+ecx-(4*4)], xmm2
mulps xmm2, [edi+ecx-(4*4)]
addps xmm0, xmm2
add ecx, 8*4
jle dot8
doneDot8:
sub ecx, 4*4
jg doneDot4
movaps xmm1, [edx+ecx-(4*4)]
mulps xmm1, [edi+ecx-(4*4)]
movaps [esi+ecx-(4*4)], xmm1
mulps xmm1, [edi+ecx-(4*4)]
addps xmm0, xmm1
add ecx, 4*4
doneDot4:
sub ecx, 2*4
jg doneDot2
movlps xmm3, [edx+ecx-(2*4)]
movlps xmm4, [edi+ecx-(2*4)]
mulps xmm3, xmm4
movlps [esi+ecx-(2*4)], xmm3
mulps xmm3, xmm4
addps xmm0, xmm3
add ecx, 2*4
doneDot2:
sub ecx, 1*4
jg doneDot1
movss xmm3, [edx+ecx-(1*4)]
movss xmm4, [edi+ecx-(1*4)]
mulss xmm3, xmm4
movss [esi+ecx-(1*4)], xmm3
mulss xmm3, xmm4
addss xmm0, xmm3
doneDot1:
movhlps xmm2, xmm0
addps xmm0, xmm2
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm2
movss xmm1, [edi]
subss xmm1, xmm0
movss [edi], xmm1 // mptr[i] = sum;
movss [edx], xmm1 // diag[i] = sum;
// guard the division: if sum == 0.0f substitute a tiny number ( the scalar reference returns false here )
movaps xmm2, xmm1
cmpeqss xmm2, SIMD_SP_zero
andps xmm2, SIMD_SP_tiny
orps xmm1, xmm2
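// refine the rcpss estimate with one Newton-Raphson step: x1 = x0 * ( 2 - sum * x0 )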
rcpss xmm7, xmm1
mulss xmm1, xmm7
mulss xmm1, xmm7
addss xmm7, xmm7
subss xmm7, xmm1
movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum;
mov edx, n // edx = n
sub edx, ebx // edx = n - i
dec edx // edx = n - i - 1
jle doneSubRow // if ( i + 1 >= n ) return true;
mov eax, ebx // eax = i
shl eax, 2 // eax = i * 4
neg eax
loopSubRow:
add edi, ncf
mov ecx, eax
movaps xmm0, [esi+ecx]
mulps xmm0, [edi+ecx]
add ecx, 12*4
jg doneSubDot8
subDot8:
movaps xmm1, [esi+ecx-(8*4)]
mulps xmm1, [edi+ecx-(8*4)]
addps xmm0, xmm1
movaps xmm2, [esi+ecx-(4*4)]
mulps xmm2, [edi+ecx-(4*4)]
addps xmm0, xmm2
add ecx, 8*4
jle subDot8
doneSubDot8:
sub ecx, 4*4
jg doneSubDot4
movaps xmm1, [esi+ecx-(4*4)]
mulps xmm1, [edi+ecx-(4*4)]
addps xmm0, xmm1
add ecx, 4*4
doneSubDot4:
sub ecx, 2*4
jg doneSubDot2
movlps xmm3, [esi+ecx-(2*4)]
movlps xmm4, [edi+ecx-(2*4)]
mulps xmm3, xmm4
addps xmm0, xmm3
add ecx, 2*4
doneSubDot2:
sub ecx, 1*4
jg doneSubDot1
movss xmm3, [esi+ecx-(1*4)]
movss xmm4, [edi+ecx-(1*4)]
mulss xmm3, xmm4
addss xmm0, xmm3
doneSubDot1:
movhlps xmm2, xmm0
addps xmm0, xmm2
movaps xmm2, xmm0
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
addss xmm0, xmm2
movss xmm1, [edi]
subss xmm1, xmm0
mulss xmm1, xmm7
movss [edi], xmm1
dec edx
jg loopSubRow
doneSubRow:
inc ebx
jmp loopRow
done:
pop ebx
}
return true;
#else
int i, j, k, nc;
float *v, *diag, *mptr;
double s0, s1, s2, s3, sum, d;
v = (float *) _alloca16( n * sizeof( float ) );
diag = (float *) _alloca16( n * sizeof( float ) );
nc = mat.GetNumColumns();
if ( n <= 0 ) {
return true;
}
mptr = mat[0];
sum = mptr[0];
if ( sum == 0.0f ) {
return false;
}
diag[0] = sum;
invDiag[0] = d = 1.0f / sum;
if ( n <= 1 ) {
return true;
}
mptr = mat[0];
for ( j = 1; j < n; j++ ) {
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
}
mptr = mat[1];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
sum = mptr[1] - s0;
if ( sum == 0.0f ) {
return false;
}
mat[1][1] = sum;
diag[1] = sum;
invDiag[1] = d = 1.0f / sum;
if ( n <= 2 ) {
return true;
}
mptr = mat[0];
for ( j = 2; j < n; j++ ) {
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
}
mptr = mat[2];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
sum = mptr[2] - s0 - s1;
if ( sum == 0.0f ) {
return false;
}
mat[2][2] = sum;
diag[2] = sum;
invDiag[2] = d = 1.0f / sum;
if ( n <= 3 ) {
return true;
}
mptr = mat[0];
for ( j = 3; j < n; j++ ) {
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
}
mptr = mat[3];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
sum = mptr[3] - s0 - s1 - s2;
if ( sum == 0.0f ) {
return false;
}
mat[3][3] = sum;
diag[3] = sum;
invDiag[3] = d = 1.0f / sum;
if ( n <= 4 ) {
return true;
}
mptr = mat[0];
for ( j = 4; j < n; j++ ) {
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
}
for ( i = 4; i < n; i++ ) {
mptr = mat[i];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
for ( k = 4; k < i-3; k += 4 ) {
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
}
switch( i - k ) {
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
}
sum = s3;
sum += s2;
sum += s1;
sum += s0;
sum = mptr[i] - sum;
if ( sum == 0.0f ) {
return false;
}
mat[i][i] = sum;
diag[i] = sum;
invDiag[i] = d = 1.0f / sum;
if ( i + 1 >= n ) {
return true;
}
mptr = mat[i+1];
for ( j = i+1; j < n; j++ ) {
s0 = mptr[0] * v[0];
s1 = mptr[1] * v[1];
s2 = mptr[2] * v[2];
s3 = mptr[3] * v[3];
for ( k = 4; k < i-7; k += 8 ) {
s0 += mptr[k+0] * v[k+0];
s1 += mptr[k+1] * v[k+1];
s2 += mptr[k+2] * v[k+2];
s3 += mptr[k+3] * v[k+3];
s0 += mptr[k+4] * v[k+4];
s1 += mptr[k+5] * v[k+5];
s2 += mptr[k+6] * v[k+6];
s3 += mptr[k+7] * v[k+7];
}
switch( i - k ) {
case 7: s0 += mptr[k+6] * v[k+6];
case 6: s1 += mptr[k+5] * v[k+5];
case 5: s2 += mptr[k+4] * v[k+4];
case 4: s3 += mptr[k+3] * v[k+3];
case 3: s0 += mptr[k+2] * v[k+2];
case 2: s1 += mptr[k+1] * v[k+1];
case 1: s2 += mptr[k+0] * v[k+0];
}
sum = s3;
sum += s2;
sum += s1;
sum += s0;
mptr[i] = ( mptr[i] - sum ) * d;
mptr += nc;
}
}
return true;
#endif
}
/*
============
idSIMD_SSE::BlendJoints
============
*/
#define REFINE_BLENDJOINTS_RECIPROCAL
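// The quaternion blend below is a standard shortest-arc slerp:
//   cosom  = dot( q0, q1 )  ( q1 negated via the sign bit if cosom < 0 )
//   omega  = atan2( sqrt( 1 - cosom * cosom ), cosom )
//   result = ( q0 * sin( ( 1 - lerp ) * omega ) + q1 * sin( lerp * omega ) ) / sin( omega )
// The SIMD path processes four joints at a time using polynomial approximations for
// atan and sin; the scalar tail loop at the end of this function shows the same math
// with idMath calls.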
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
int i;
if ( lerp <= 0.0f ) {
return;
} else if ( lerp >= 1.0f ) {
for ( i = 0; i < numJoints; i++ ) {
int j = index[i];
joints[j] = blendJoints[j];
}
return;
}
for ( i = 0; i <= numJoints - 4; i += 4 ) {
ALIGN16( float jointVert0[4] );
ALIGN16( float jointVert1[4] );
ALIGN16( float jointVert2[4] );
ALIGN16( float blendVert0[4] );
ALIGN16( float blendVert1[4] );
ALIGN16( float blendVert2[4] );
ALIGN16( float jointQuat0[4] );
ALIGN16( float jointQuat1[4] );
ALIGN16( float jointQuat2[4] );
ALIGN16( float jointQuat3[4] );
ALIGN16( float blendQuat0[4] );
ALIGN16( float blendQuat1[4] );
ALIGN16( float blendQuat2[4] );
ALIGN16( float blendQuat3[4] );
for ( int j = 0; j < 4; j++ ) {
int n = index[i+j];
jointVert0[j] = joints[n].t[0];
jointVert1[j] = joints[n].t[1];
jointVert2[j] = joints[n].t[2];
blendVert0[j] = blendJoints[n].t[0];
blendVert1[j] = blendJoints[n].t[1];
blendVert2[j] = blendJoints[n].t[2];
jointQuat0[j] = joints[n].q[0];
jointQuat1[j] = joints[n].q[1];
jointQuat2[j] = joints[n].q[2];
jointQuat3[j] = joints[n].q[3];
blendQuat0[j] = blendJoints[n].q[0];
blendQuat1[j] = blendJoints[n].q[1];
blendQuat2[j] = blendJoints[n].q[2];
blendQuat3[j] = blendJoints[n].q[3];
}
#if 1
__asm {
// lerp translation
movss xmm7, lerp
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
movaps xmm0, blendVert0
subps xmm0, jointVert0
mulps xmm0, xmm7
addps xmm0, jointVert0
movaps jointVert0, xmm0
movaps xmm1, blendVert1
subps xmm1, jointVert1
mulps xmm1, xmm7
addps xmm1, jointVert1
movaps jointVert1, xmm1
movaps xmm2, blendVert2
subps xmm2, jointVert2
mulps xmm2, xmm7
addps xmm2, jointVert2
movaps jointVert2, xmm2
// lerp quaternions
movaps xmm0, jointQuat0
mulps xmm0, blendQuat0
movaps xmm1, jointQuat1
mulps xmm1, blendQuat1
addps xmm0, xmm1
movaps xmm2, jointQuat2
mulps xmm2, blendQuat2
addps xmm0, xmm2
movaps xmm3, jointQuat3
mulps xmm3, blendQuat3
addps xmm0, xmm3 // xmm0 = cosom
movaps xmm1, xmm0
movaps xmm2, xmm0
andps xmm1, SIMD_SP_signBitMask // xmm1 = signBit
xorps xmm0, xmm1
mulps xmm2, xmm2
xorps xmm4, xmm4
movaps xmm3, SIMD_SP_one
subps xmm3, xmm2 // xmm3 = scale0
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
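// when REFINE_BLENDJOINTS_RECIPROCAL is defined, the rsqrtps estimate is refined
// with one Newton-Raphson step: x1 = x0 * ( 3 - scale0 * x0 * x0 ) * 0.5
// ( assuming SIMD_SP_rsqrt_c0 and SIMD_SP_rsqrt_c1 hold 3.0f and -0.5f )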
#ifdef REFINE_BLENDJOINTS_RECIPROCAL
movaps xmm2, xmm3
rsqrtps xmm4, xmm2
mulps xmm2, xmm4
mulps xmm2, xmm4
subps xmm2, SIMD_SP_rsqrt_c0
mulps xmm4, SIMD_SP_rsqrt_c1
mulps xmm2, xmm4
#else
rsqrtps xmm2, xmm3 // xmm2 = sinom
#endif
mulps xmm3, xmm2 // xmm3 = sqrt( scale0 )
// omega0 = atan2( xmm3, xmm0 )
movaps xmm4, xmm0
minps xmm0, xmm3
maxps xmm3, xmm4
cmpeqps xmm4, xmm0
#ifdef REFINE_BLENDJOINTS_RECIPROCAL
rcpps xmm5, xmm3
mulps xmm3, xmm5
mulps xmm3, xmm5
addps xmm5, xmm5
subps xmm5, xmm3 // xmm5 = 1 / y or 1 / x
mulps xmm0, xmm5 // xmm0 = x / y or y / x
#else
rcpps xmm3, xmm3 // xmm3 = 1 / y or 1 / x
mulps xmm0, xmm3 // xmm0 = x / y or y / x
#endif
movaps xmm3, xmm4
andps xmm3, SIMD_SP_signBitMask
xorps xmm0, xmm3 // xmm0 = -x / y or y / x
andps xmm4, SIMD_SP_halfPI // xmm4 = HALF_PI or 0.0f
movaps xmm3, xmm0
mulps xmm3, xmm3 // xmm3 = s
movaps xmm5, SIMD_SP_atan_c0
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c1
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c2
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c3
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c4
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c5
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c6
mulps xmm5, xmm3
addps xmm5, SIMD_SP_atan_c7
mulps xmm5, xmm3
addps xmm5, SIMD_SP_one
mulps xmm5, xmm0
addps xmm5, xmm4 // xmm5 = omega0
movaps xmm6, xmm7 // xmm6 = lerp
mulps xmm6, xmm5 // xmm6 = omega1
subps xmm5, xmm6 // xmm5 = omega0
// scale0 = sin( xmm5 ) * xmm2
// scale1 = sin( xmm6 ) * xmm2
movaps xmm3, xmm5
movaps xmm7, xmm6
mulps xmm3, xmm3
mulps xmm7, xmm7
movaps xmm4, SIMD_SP_sin_c0
movaps xmm0, SIMD_SP_sin_c0
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c1
addps xmm0, SIMD_SP_sin_c1
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c2
addps xmm0, SIMD_SP_sin_c2
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c3
addps xmm0, SIMD_SP_sin_c3
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_sin_c4
addps xmm0, SIMD_SP_sin_c4
mulps xmm4, xmm3
mulps xmm0, xmm7
addps xmm4, SIMD_SP_one
addps xmm0, SIMD_SP_one
mulps xmm5, xmm4
mulps xmm6, xmm0
mulps xmm5, xmm2 // xmm5 = scale0
mulps xmm6, xmm2 // xmm6 = scale1
xorps xmm6, xmm1
movaps xmm0, jointQuat0
mulps xmm0, xmm5
movaps xmm1, blendQuat0
mulps xmm1, xmm6
addps xmm0, xmm1
movaps jointQuat0, xmm0
movaps xmm1, jointQuat1
mulps xmm1, xmm5
movaps xmm2, blendQuat1
mulps xmm2, xmm6
addps xmm1, xmm2
movaps jointQuat1, xmm1
movaps xmm2, jointQuat2
mulps xmm2, xmm5
movaps xmm3, blendQuat2
mulps xmm3, xmm6
addps xmm2, xmm3
movaps jointQuat2, xmm2
movaps xmm3, jointQuat3
mulps xmm3, xmm5
movaps xmm4, blendQuat3
mulps xmm4, xmm6
addps xmm3, xmm4
movaps jointQuat3, xmm3
}
#else
jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );
jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );
jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );
ALIGN16( float cosom[4] );
ALIGN16( float sinom[4] );
ALIGN16( float omega0[4] );
ALIGN16( float omega1[4] );
ALIGN16( float scale0[4] );
ALIGN16( float scale1[4] );
ALIGN16( unsigned long signBit[4] );
cosom[0] = jointQuat0[0] * blendQuat0[0];
cosom[1] = jointQuat0[1] * blendQuat0[1];
cosom[2] = jointQuat0[2] * blendQuat0[2];
cosom[3] = jointQuat0[3] * blendQuat0[3];
cosom[0] += jointQuat1[0] * blendQuat1[0];
cosom[1] += jointQuat1[1] * blendQuat1[1];
cosom[2] += jointQuat1[2] * blendQuat1[2];
cosom[3] += jointQuat1[3] * blendQuat1[3];
cosom[0] += jointQuat2[0] * blendQuat2[0];
cosom[1] += jointQuat2[1] * blendQuat2[1];
cosom[2] += jointQuat2[2] * blendQuat2[2];
cosom[3] += jointQuat2[3] * blendQuat2[3];
cosom[0] += jointQuat3[0] * blendQuat3[0];
cosom[1] += jointQuat3[1] * blendQuat3[1];
cosom[2] += jointQuat3[2] * blendQuat3[2];
cosom[3] += jointQuat3[3] * blendQuat3[3];
signBit[0] = (*(unsigned long *)&cosom[0]) & ( 1 << 31 );
signBit[1] = (*(unsigned long *)&cosom[1]) & ( 1 << 31 );
signBit[2] = (*(unsigned long *)&cosom[2]) & ( 1 << 31 );
signBit[3] = (*(unsigned long *)&cosom[3]) & ( 1 << 31 );
(*(unsigned long *)&cosom[0]) ^= signBit[0];
(*(unsigned long *)&cosom[1]) ^= signBit[1];
(*(unsigned long *)&cosom[2]) ^= signBit[2];
(*(unsigned long *)&cosom[3]) ^= signBit[3];
scale0[0] = 1.0f - cosom[0] * cosom[0];
scale0[1] = 1.0f - cosom[1] * cosom[1];
scale0[2] = 1.0f - cosom[2] * cosom[2];
scale0[3] = 1.0f - cosom[3] * cosom[3];
scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];
sinom[0] = idMath::RSqrt( scale0[0] );
sinom[1] = idMath::RSqrt( scale0[1] );
sinom[2] = idMath::RSqrt( scale0[2] );
sinom[3] = idMath::RSqrt( scale0[3] );
scale0[0] *= sinom[0];
scale0[1] *= sinom[1];
scale0[2] *= sinom[2];
scale0[3] *= sinom[3];
omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );
omega1[0] = lerp * omega0[0];
omega1[1] = lerp * omega0[1];
omega1[2] = lerp * omega0[2];
omega1[3] = lerp * omega0[3];
omega0[0] -= omega1[0];
omega0[1] -= omega1[1];
omega0[2] -= omega1[2];
omega0[3] -= omega1[3];
scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];
scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];
(*(unsigned long *)&scale1[0]) ^= signBit[0];
(*(unsigned long *)&scale1[1]) ^= signBit[1];
(*(unsigned long *)&scale1[2]) ^= signBit[2];
(*(unsigned long *)&scale1[3]) ^= signBit[3];
jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];
jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];
jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];
jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];
#endif
for ( int j = 0; j < 4; j++ ) {
int n = index[i+j];
joints[n].t[0] = jointVert0[j];
joints[n].t[1] = jointVert1[j];
joints[n].t[2] = jointVert2[j];
joints[n].q[0] = jointQuat0[j];
joints[n].q[1] = jointQuat1[j];
joints[n].q[2] = jointQuat2[j];
joints[n].q[3] = jointQuat3[j];
}
}
for ( ; i < numJoints; i++ ) {
int n = index[i];
idVec3 &jointVert = joints[n].t;
const idVec3 &blendVert = blendJoints[n].t;
jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
idQuat &jointQuat = joints[n].q;
const idQuat &blendQuat = blendJoints[n].q;
float cosom;
float sinom;
float omega;
float scale0;
float scale1;
unsigned long signBit;
cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
signBit = (*(unsigned long *)&cosom) & ( 1 << 31 );
(*(unsigned long *)&cosom) ^= signBit;
scale0 = 1.0f - cosom * cosom;
scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
sinom = idMath::InvSqrt( scale0 );
omega = idMath::ATan16( scale0 * sinom, cosom );
scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
scale1 = idMath::Sin16( lerp * omega ) * sinom;
(*(unsigned long *)&scale1) ^= signBit;
jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
}
}
/*
============
idSIMD_SSE::ConvertJointQuatsToJointMats
============
*/
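// A unit quaternion q = ( x, y, z, w ) maps to the rotation matrix written out by the
// loop below ( the standard identity, shown here for reference ):
//
//   | 1-2(yy+zz)   2(xy+wz)    2(xz-wy)   |
//   | 2(xy-wz)     1-2(xx+zz)  2(yz+wx)   |
//   | 2(xz+wy)     2(yz-wx)    1-2(xx+yy) |
//
// with the translation t stored in the fourth column.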
void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
for ( int i = 0; i < numJoints; i++ ) {
const float *q = jointQuats[i].q.ToFloatPtr();
float *m = jointMats[i].ToFloatPtr();
m[0*4+3] = q[4];
m[1*4+3] = q[5];
m[2*4+3] = q[6];
float x2 = q[0] + q[0];
float y2 = q[1] + q[1];
float z2 = q[2] + q[2];
{
float xx = q[0] * x2;
float yy = q[1] * y2;
float zz = q[2] * z2;
m[0*4+0] = 1.0f - yy - zz;
m[1*4+1] = 1.0f - xx - zz;
m[2*4+2] = 1.0f - xx - yy;
}
{
float yz = q[1] * z2;
float wx = q[3] * x2;
m[2*4+1] = yz - wx;
m[1*4+2] = yz + wx;
}
{
float xy = q[0] * y2;
float wz = q[3] * z2;
m[1*4+0] = xy - wz;
m[0*4+1] = xy + wz;
}
{
float xz = q[0] * z2;
float wy = q[3] * y2;
m[0*4+2] = xz - wy;
m[2*4+0] = xz + wy;
}
}
}
/*
============
idSIMD_SSE::ConvertJointMatsToJointQuats
============
*/
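// The conversion below uses the standard trace-based method: pick the largest of the
// four quaternion components ( from the signs of the diagonal sums ) to avoid
// catastrophic cancellation, take its square root, and derive the other three
// components from off-diagonal sums and differences. The scalar reference
// implementations in the #elif branches below spell out the four cases.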
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );
#if 1
ALIGN16( byte shuffle[16] );
__asm {
mov eax, numJoints
mov esi, jointMats
mov edi, jointQuats
and eax, ~3
jz done4
imul eax, JOINTMAT_SIZE
add esi, eax
neg eax
loopMat4:
movss xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
movss xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
movss xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
movss xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
movss xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]
movss xmm5, xmm0
movss xmm6, xmm1
movss xmm7, xmm2
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
movss xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
movss xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]
movss xmm5, xmm0
movss xmm6, xmm1
movss xmm7, xmm2
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
movss xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
movss xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
movss xmm5, xmm0
movss xmm6, xmm1
movss xmm7, xmm2
// -------------------
movaps xmm0, xmm5
addps xmm0, xmm6
addps xmm0, xmm7
cmpnltps xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f
movaps xmm1, xmm5
movaps xmm2, xmm5
cmpnltps xmm1, xmm6
cmpnltps xmm2, xmm7
andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]
movaps xmm4, xmm6
cmpnltps xmm4, xmm7 // xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]
movaps xmm1, xmm0
andnps xmm1, xmm2
orps xmm2, xmm0
movaps xmm3, xmm2
andnps xmm2, xmm4
orps xmm3, xmm2
xorps xmm3, SIMD_SP_not
andps xmm0, SIMD_DW_mat2quatShuffle0
movaps xmm4, xmm1
andps xmm4, SIMD_DW_mat2quatShuffle1
orps xmm0, xmm4
movaps xmm4, xmm2
andps xmm4, SIMD_DW_mat2quatShuffle2
orps xmm0, xmm4
movaps xmm4, xmm3
andps xmm4, SIMD_DW_mat2quatShuffle3
orps xmm4, xmm0
movaps shuffle, xmm4
movaps xmm0, xmm2
orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0
orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2
orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1
andps xmm0, SIMD_SP_signBitMask
andps xmm1, SIMD_SP_signBitMask
andps xmm2, SIMD_SP_signBitMask
xorps xmm5, xmm0
xorps xmm6, xmm1
xorps xmm7, xmm2
addps xmm5, xmm6
addps xmm7, SIMD_SP_one
addps xmm5, xmm7 // xmm5 = t
movaps xmm7, xmm5 // xmm7 = t
rsqrtps xmm6, xmm5
mulps xmm5, xmm6
mulps xmm5, xmm6
subps xmm5, SIMD_SP_rsqrt_c0
mulps xmm6, SIMD_SP_mat2quat_rsqrt_c1
mulps xmm6, xmm5 // xmm6 = s
mulps xmm7, xmm6 // xmm7 = s * t
xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s
// -------------------
add edi, 4*JOINTQUAT_SIZE
movzx ecx, byte ptr shuffle[0*4+0] // ecx = k0
movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[0*4+1] // edx = k1
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[0*4+2] // ecx = k2
movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[0*4+3] // edx = k3
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-4*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
mov [edi-4*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
mov [edi-4*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
mov [edi-4*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movzx ecx, byte ptr shuffle[1*4+0] // ecx = k0
movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[1*4+1] // edx = k1
movss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[1*4+2] // ecx = k2
movss xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[1*4+3] // edx = k3
movss xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-3*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
mov [edi-3*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
mov [edi-3*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
mov [edi-3*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movzx ecx, byte ptr shuffle[2*4+0] // ecx = k0
movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[2*4+1] // edx = k1
movss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[2*4+2] // ecx = k2
movss xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[2*4+3] // edx = k3
movss xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-2*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
mov [edi-2*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
mov [edi-2*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
mov [edi-2*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movzx ecx, byte ptr shuffle[3*4+0] // ecx = k0
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[3*4+1] // edx = k1
movss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[3*4+2] // ecx = k2
movss xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[3*4+3] // edx = k3
movss xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
add eax, 4*JOINTMAT_SIZE
jl loopMat4
done4:
mov eax, numJoints
and eax, 3
jz done1
imul eax, JOINTMAT_SIZE
add esi, eax
neg eax
loopMat1:
movss xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
movss xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
movss xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]
// -------------------
movaps xmm0, xmm5
addss xmm0, xmm6
addss xmm0, xmm7
cmpnltss xmm0, SIMD_SP_zero // xmm0 = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f
movaps xmm1, xmm5
movaps xmm2, xmm5
cmpnltss xmm1, xmm6
cmpnltss xmm2, xmm7
andps xmm2, xmm1 // xmm2 = m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2]
movaps xmm4, xmm6
cmpnltss xmm4, xmm7 // xmm4 = m[1 * 4 + 1] > m[2 * 4 + 2]
movaps xmm1, xmm0
andnps xmm1, xmm2
orps xmm2, xmm0
movaps xmm3, xmm2
andnps xmm2, xmm4
orps xmm3, xmm2
xorps xmm3, SIMD_SP_not
andps xmm0, SIMD_DW_mat2quatShuffle0
movaps xmm4, xmm1
andps xmm4, SIMD_DW_mat2quatShuffle1
orps xmm0, xmm4
movaps xmm4, xmm2
andps xmm4, SIMD_DW_mat2quatShuffle2
orps xmm0, xmm4
movaps xmm4, xmm3
andps xmm4, SIMD_DW_mat2quatShuffle3
orps xmm4, xmm0
movss shuffle, xmm4
movaps xmm0, xmm2
orps xmm0, xmm3 // xmm0 = xmm2 | xmm3 = s0
orps xmm2, xmm1 // xmm2 = xmm1 | xmm2 = s2
orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 = s1
andps xmm0, SIMD_SP_signBitMask
andps xmm1, SIMD_SP_signBitMask
andps xmm2, SIMD_SP_signBitMask
xorps xmm5, xmm0
xorps xmm6, xmm1
xorps xmm7, xmm2
addss xmm5, xmm6
addss xmm7, SIMD_SP_one
addss xmm5, xmm7 // xmm5 = t
movss xmm7, xmm5 // xmm7 = t
rsqrtss xmm6, xmm5
mulss xmm5, xmm6
mulss xmm5, xmm6
subss xmm5, SIMD_SP_rsqrt_c0
mulss xmm6, SIMD_SP_mat2quat_rsqrt_c1
mulss xmm6, xmm5 // xmm6 = s
mulss xmm7, xmm6 // xmm7 = s * t
xorps xmm6, SIMD_SP_signBitMask // xmm6 = -s
// -------------------
movzx ecx, byte ptr shuffle[0] // ecx = k0
add edi, JOINTQUAT_SIZE
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7 // q[k0] = s * t;
movzx edx, byte ptr shuffle[1] // edx = k1
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
xorps xmm4, xmm2
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
movzx ecx, byte ptr shuffle[2] // ecx = k2
movss xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
xorps xmm3, xmm1
subss xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
mulss xmm3, xmm6
movss [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3 // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
movzx edx, byte ptr shuffle[3] // edx = k3
movss xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
xorps xmm4, xmm0
subss xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
mulss xmm4, xmm6
movss [edi+edx*4-1*JOINTQUAT_SIZE], xmm4 // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
mov ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+16], ecx // q[4] = m[0 * 4 + 3];
mov edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+20], edx // q[5] = m[1 * 4 + 3];
mov ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
mov [edi-1*JOINTQUAT_SIZE+24], ecx // q[6] = m[2 * 4 + 3];
add eax, JOINTMAT_SIZE
jl loopMat1
done1:
}
#elif 0
for ( int i = 0; i < numJoints; i++ ) {
float s0, s1, s2;
int k0, k1, k2, k3;
float *q = jointQuats[i].q.ToFloatPtr();
const float *m = jointMats[i].ToFloatPtr();
if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
k0 = 3;
k1 = 2;
k2 = 1;
k3 = 0;
s0 = 1.0f;
s1 = 1.0f;
s2 = 1.0f;
} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
k0 = 0;
k1 = 1;
k2 = 2;
k3 = 3;
s0 = 1.0f;
s1 = -1.0f;
s2 = -1.0f;
} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
k0 = 1;
k1 = 0;
k2 = 3;
k3 = 2;
s0 = -1.0f;
s1 = 1.0f;
s2 = -1.0f;
} else {
k0 = 2;
k1 = 3;
k2 = 0;
k3 = 1;
s0 = -1.0f;
s1 = -1.0f;
s2 = 1.0f;
}
float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[k0] = s * t;
q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;
q[4] = m[0 * 4 + 3];
q[5] = m[1 * 4 + 3];
q[6] = m[2 * 4 + 3];
}
#elif 1
for ( int i = 0; i < numJoints; i++ ) {
float *q = jointQuats[i].q.ToFloatPtr();
const float *m = jointMats[i].ToFloatPtr();
if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {
float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[3] = s * t;
q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
} else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {
float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[0] = s * t;
q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
} else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {
float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[1] = s * t;
q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
} else {
float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
float s = idMath::InvSqrt( t ) * 0.5f;
q[2] = s * t;
q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;
}
q[4] = m[0 * 4 + 3];
q[5] = m[1 * 4 + 3];
q[6] = m[2 * 4 + 3];
}
#endif
}
/*
============
idSIMD_SSE::TransformJoints
============
*/
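// idJointMat is laid out as three rows of four floats ( 48 bytes, see the
// JOINTMAT_SIZE assert ): [ m0 m1 m2 t0 | m3 m4 m5 t1 | m6 m7 m8 t2 ], i.e. a 3x4
// row-major rotation with the translation in the last column. The loop below
// concatenates each joint with its parent: jointMats[i] *= jointMats[parents[i]].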
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
__asm {
mov ecx, firstJoint
mov eax, lastJoint
sub eax, ecx
jl done
imul ecx, 4
mov edi, parents
add edi, ecx
imul ecx, 12
mov esi, jointMats
imul eax, 4
add edi, eax
neg eax
loopJoint:
movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
mov edx, [edi+eax]
movaps xmm1, [esi+ecx+16] // xmm1 = m3, m4, m5, t1
imul edx, JOINTMAT_SIZE
movaps xmm2, [esi+ecx+32] // xmm2 = m6, m7, m8, t2
movss xmm4, [esi+edx+ 0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm0
movss xmm5, [esi+edx+ 4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm1
addps xmm4, xmm5
movss xmm6, [esi+edx+ 8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm2
addps xmm4, xmm6
movss xmm5, [esi+edx+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm0
movss xmm7, [esi+edx+12]
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
addps xmm4, xmm7
movaps [esi+ecx+ 0], xmm4
movss xmm6, [esi+edx+20]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm1
addps xmm5, xmm6
movss xmm7, [esi+edx+24]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm5, xmm7
movss xmm6, [esi+edx+32]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm3, [esi+edx+28]
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
addps xmm5, xmm3
movaps [esi+ecx+16], xmm5
movss xmm7, [esi+edx+36]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm3, [esi+edx+40]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm2
addps xmm6, xmm3
movss xmm7, [esi+edx+44]
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
addps xmm6, xmm7
movaps [esi+ecx+32], xmm6
add ecx, JOINTMAT_SIZE
add eax, 4
jle loopJoint
done:
}
#else
int i;
for( i = firstJoint; i <= lastJoint; i++ ) {
assert( parents[i] < i );
jointMats[i] *= jointMats[parents[i]];
}
#endif
}
/*
============
idSIMD_SSE::UntransformJoints
============
*/
void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
__asm {
mov edx, firstJoint
mov eax, lastJoint
mov ecx, eax
sub eax, edx
jl done
mov esi, jointMats
imul ecx, JOINTMAT_SIZE
imul edx, 4
mov edi, parents
add edi, edx
imul eax, 4
loopJoint:
movaps xmm0, [esi+ecx+ 0] // xmm0 = m0, m1, m2, t0
mov edx, [edi+eax]
movaps xmm1, [esi+ecx+16] // xmm1 = m3, m4, m5, t1
imul edx, JOINTMAT_SIZE
movaps xmm2, [esi+ecx+32] // xmm2 = m6, m7, m8, t2
movss xmm6, [esi+edx+12]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
subps xmm0, xmm6
movss xmm7, [esi+edx+28]
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
subps xmm1, xmm7
movss xmm3, [esi+edx+44]
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
subps xmm2, xmm3
movss xmm4, [esi+edx+ 0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm4, xmm0
movss xmm5, [esi+edx+16]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm1
addps xmm4, xmm5
movss xmm6, [esi+edx+32]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm2
addps xmm4, xmm6
movaps [esi+ecx+ 0], xmm4
movss xmm5, [esi+edx+ 4]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm5, xmm0
movss xmm6, [esi+edx+20]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm1
addps xmm5, xmm6
movss xmm7, [esi+edx+36]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm5, xmm7
movaps [esi+ecx+16], xmm5
movss xmm6, [esi+edx+ 8]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movss xmm7, [esi+edx+24]
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movss xmm3, [esi+edx+40]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm3, xmm2
addps xmm6, xmm3
movaps [esi+ecx+32], xmm6
sub ecx, JOINTMAT_SIZE
sub eax, 4
jge loopJoint
done:
}
#else
int i;
for( i = lastJoint; i >= firstJoint; i-- ) {
assert( parents[i] < i );
jointMats[i] /= jointMats[parents[i]];
}
#endif
}
/*
============
idSIMD_SSE::TransformVerts
============
*/
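// The index array stores two ints per weight: index[j*2+0] is a byte offset into the
// joints array ( premultiplied by JOINTMAT_SIZE ) and index[j*2+1] is non-zero for
// the last weight of a vertex; the scalar reference in the #else branch below shows
// the same traversal.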
void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
__asm
{
mov eax, numVerts
test eax, eax
jz done
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, index
mov esi, weights
mov edi, joints
add ecx, eax
neg eax
loopVert:
mov ebx, [edx]
movaps xmm2, [esi]
add edx, 8
movaps xmm0, xmm2
add esi, JOINTWEIGHT_SIZE
movaps xmm1, xmm2
mulps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
mulps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
mulps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
cmp dword ptr [edx-4], 0
jne doneWeight
loopWeight:
mov ebx, [edx]
movaps xmm5, [esi]
add edx, 8
movaps xmm3, xmm5
add esi, JOINTWEIGHT_SIZE
movaps xmm4, xmm5
mulps xmm3, [edi+ebx+ 0] // xmm3 = m0, m1, m2, t0
mulps xmm4, [edi+ebx+16] // xmm4 = m3, m4, m5, t1
mulps xmm5, [edi+ebx+32] // xmm5 = m6, m7, m8, t2
cmp dword ptr [edx-4], 0
addps xmm0, xmm3
addps xmm1, xmm4
addps xmm2, xmm5
je loopWeight
doneWeight:
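// horizontally add the three row dot products into the x, y, z result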
add eax, DRAWVERT_SIZE
movaps xmm6, xmm0 // xmm6 = m0, m1, m2, t0
unpcklps xmm6, xmm1 // xmm6 = m0, m3, m1, m4
unpckhps xmm0, xmm1 // xmm0 = m2, m5, t0, t1
addps xmm6, xmm0 // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1
movaps xmm7, xmm2 // xmm7 = m6, m7, m8, t2
movlhps xmm2, xmm6 // xmm2 = m6, m7, m0+m2, m3+m5
movhlps xmm6, xmm7 // xmm6 = m8, t2, m1+t0, m4+t1
addps xmm6, xmm2 // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1
movhps [ecx+eax-DRAWVERT_SIZE+0], xmm6
movaps xmm5, xmm6 // xmm5 = m6+m8, m7+t2
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 ) // xmm5 = m7+t2, m6+m8
addss xmm5, xmm6 // xmm5 = m6+m8+m7+t2
movss [ecx+eax-DRAWVERT_SIZE+8], xmm5
jl loopVert
done:
}
#else
int i, j;
const byte *jointsPtr = (byte *)joints;
for( j = i = 0; i < numVerts; i++ ) {
idVec3 v;
v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
while( index[j*2+1] == 0 ) {
j++;
v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
}
j++;
verts[i].xyz = v;
}
#endif
}
/*
============
idSIMD_SSE::TracePointCull
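
computes the distance of each vertex to the four planes; bit j of
cullBits[i] is set if the distance to planes[j] is greater than -radius,
bit j+4 if it is less than radius; totalOr receives the OR of all bytes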
============
*/
void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
push ebx
mov eax, numVerts
test eax, eax
jz done
mov edi, planes
movlps xmm1, [edi] // xmm1 = 0, 1, X, X
movhps xmm1, [edi+16] // xmm1 = 0, 1, 4, 5
movlps xmm3, [edi+8] // xmm3 = 2, 3, X, X
movhps xmm3, [edi+24] // xmm3 = 2, 3, 6, 7
movlps xmm4, [edi+32] // xmm4 = 8, 9, X, X
movhps xmm4, [edi+48] // xmm4 = 8, 9, 12, 13
movlps xmm5, [edi+40] // xmm5 = 10, 11, X, X
movhps xmm5, [edi+56] // xmm5 = 10, 11, 14, 15
movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
movss xmm7, radius
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
xor edx, edx
mov esi, verts
mov edi, cullBits
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert:
movss xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
mulps xmm4, xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
movss xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
mulps xmm5, xmm1
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
addps xmm4, xmm5
mulps xmm6, xmm2
addps xmm4, xmm3
addps xmm4, xmm6
movaps xmm5, xmm4
xorps xmm5, SIMD_SP_signBitMask
cmpltps xmm4, xmm7
movmskps ecx, xmm4
cmpltps xmm5, xmm7
movmskps ebx, xmm5
shl cx, 4
or cl, bl
inc edi
or dl, cl
add eax, DRAWVERT_SIZE
mov byte ptr [edi-1], cl
jl loopVert
done:
mov esi, totalOr
mov byte ptr [esi], dl
pop ebx
}
#else
int i;
byte tOr;
tOr = 0;
for ( i = 0; i < numVerts; i++ ) {
byte bits;
float d0, d1, d2, d3, t;
const idVec3 &v = verts[i].xyz;
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
t = d0 + radius;
bits = FLOATSIGNBITSET( t ) << 0;
t = d1 + radius;
bits |= FLOATSIGNBITSET( t ) << 1;
t = d2 + radius;
bits |= FLOATSIGNBITSET( t ) << 2;
t = d3 + radius;
bits |= FLOATSIGNBITSET( t ) << 3;
t = d0 - radius;
bits |= FLOATSIGNBITSET( t ) << 4;
t = d1 - radius;
bits |= FLOATSIGNBITSET( t ) << 5;
t = d2 - radius;
bits |= FLOATSIGNBITSET( t ) << 6;
t = d3 - radius;
bits |= FLOATSIGNBITSET( t ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
totalOr = tOr;
#endif
}
/*
============
idSIMD_SSE::DecalPointCull
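
bit j of cullBits[i] is set if verts[i].xyz is at the front side of
planes[j] (distance >= 0), for the six decal boundary planes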
============
*/
void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
ALIGN16( float p0[4] );
ALIGN16( float p1[4] );
ALIGN16( float p2[4] );
ALIGN16( float p3[4] );
ALIGN16( float p4[4] );
ALIGN16( float p5[4] );
ALIGN16( float p6[4] );
ALIGN16( float p7[4] );
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
mov ecx, planes
movlps xmm1, [ecx] // xmm1 = 0, 1, X, X
movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5
movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X
movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7
movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X
movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13
movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X
movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15
movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
movaps p0, xmm0
movaps p1, xmm1
movaps p2, xmm2
movaps p3, xmm3
movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X
movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51
movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51
movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X
movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53
movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53
movaps p4, xmm4
movaps p5, xmm5
movaps p6, xmm6
movaps p7, xmm7
mov esi, verts
mov edi, cullBits
mov eax, numVerts
and eax, ~1
jz done2
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
loopVert2:
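// two vertices per iteration: the distances to planes 0-3 give the low
// four bits of each cull byte, the interleaved planes 4-5 (p4-p7) bits 4-5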
movaps xmm6, p0
movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movaps xmm7, p1
movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movaps xmm7, p2
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
addps xmm6, p3
cmpnltps xmm6, SIMD_SP_zero
movmskps ecx, xmm6
movaps xmm6, p0
movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm3
movaps xmm7, p1
movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm4
addps xmm6, xmm7
movaps xmm7, p2
movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm5
addps xmm6, xmm7
addps xmm6, p3
cmpnltps xmm6, SIMD_SP_zero
movmskps edx, xmm6
mov ch, dl
shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, p4
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, p5
addps xmm0, xmm1
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm2, p6
addps xmm0, xmm2
addps xmm0, p7
cmpnltps xmm0, SIMD_SP_zero
movmskps edx, xmm0
add edi, 2
mov dh, dl
shl dl, 4
shl dh, 2
and edx, (3<<4)|(3<<12)
or ecx, edx
add eax, 2*DRAWVERT_SIZE
mov word ptr [edi-2], cx
jl loopVert2
done2:
mov eax, numVerts
and eax, 1
jz done
movaps xmm6, p0
movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm6, xmm0
movaps xmm7, p1
movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm1
addps xmm6, xmm7
movaps xmm7, p2
movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm7, xmm2
addps xmm6, xmm7
addps xmm6, p3
cmpnltps xmm6, SIMD_SP_zero
movmskps ecx, xmm6
mulps xmm0, p4
mulps xmm1, p5
addps xmm0, xmm1
mulps xmm2, p6
addps xmm0, xmm2
addps xmm0, p7
cmpnltps xmm0, SIMD_SP_zero
movmskps edx, xmm0
and edx, 3
shl edx, 4
or ecx, edx
mov byte ptr [edi], cl
done:
}
#else
int i;
for ( i = 0; i < numVerts; i += 2 ) {
unsigned short bits0, bits1;
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
const idVec3 &v0 = verts[i+0].xyz;
const idVec3 &v1 = verts[i+1].xyz;
d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
bits0 = FLOATSIGNBITSET( d0 ) << (0+0);
bits0 |= FLOATSIGNBITSET( d1 ) << (0+1);
bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
bits0 |= FLOATSIGNBITSET( d3 ) << (0+3);
bits0 |= FLOATSIGNBITSET( d4 ) << (0+4);
bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
bits1 = FLOATSIGNBITSET( d6 ) << (8+0);
bits1 |= FLOATSIGNBITSET( d7 ) << (8+1);
bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
bits1 |= FLOATSIGNBITSET( d9 ) << (8+3);
bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
*(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
}
if ( numVerts & 1 ) {
byte bits;
float d0, d1, d2, d3, d4, d5;
const idVec3 &v = verts[numVerts - 1].xyz;
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 2;
bits |= FLOATSIGNBITSET( d3 ) << 3;
bits |= FLOATSIGNBITSET( d4 ) << 4;
bits |= FLOATSIGNBITSET( d5 ) << 5;
cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits
}
#endif
}
/*
============
idSIMD_SSE::OverlayPointCull
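
texCoords[i] = distances of verts[i].xyz to planes[0] and planes[1];
bits 0-1 of cullBits[i] are the signs of those distances, bits 2-3 the
signs of one minus the distances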
============
*/
void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
mov eax, numVerts
mov edx, verts
mov esi, texCoords
mov edi, cullBits
mov ecx, planes
movss xmm4, [ecx+ 0]
movss xmm5, [ecx+16]
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
movss xmm5, [ecx+ 4]
movss xmm6, [ecx+20]
shufps xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
movss xmm6, [ecx+ 8]
movss xmm7, [ecx+24]
shufps xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
movss xmm7, [ecx+12]
movss xmm0, [ecx+28]
shufps xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )
and eax, ~1
jz done2
add edi, eax
neg eax
loopVert2:
movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, xmm4
movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm5
movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movss xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm7
movaps [esi], xmm0
movaps xmm1, xmm0
movaps xmm2, SIMD_SP_one
subps xmm2, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
add edx, 2*DRAWVERT_SIZE
movmskps ecx, xmm0
mov byte ptr [edi+eax+0], cl
add esi, 4*4
movmskps ecx, xmm1
mov byte ptr [edi+eax+1], cl
add eax, 2
jl loopVert2
done2:
mov eax, numVerts
and eax, 1
jz done
movss xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, xmm4
movss xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm5
movss xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm2, xmm6
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm7
movlps [esi], xmm0
movaps xmm1, xmm0
movaps xmm2, SIMD_SP_one
subps xmm2, xmm0
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
movmskps ecx, xmm0
mov byte ptr [edi], cl
done:
}
#else
const idPlane &p0 = planes[0];
const idPlane &p1 = planes[1];
for ( int i = 0; i < numVerts - 1; i += 2 ) {
unsigned short bits;
float d0, d1, d2, d3;
const idVec3 &v0 = verts[i+0].xyz;
const idVec3 &v1 = verts[i+1].xyz;
d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];
texCoords[i+0][0] = d0;
texCoords[i+0][1] = d1;
texCoords[i+1][0] = d2;
texCoords[i+1][1] = d3;
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 8;
bits |= FLOATSIGNBITSET( d3 ) << 9;
d0 = 1.0f - d0;
d1 = 1.0f - d1;
d2 = 1.0f - d2;
d3 = 1.0f - d3;
bits |= FLOATSIGNBITSET( d0 ) << 2;
bits |= FLOATSIGNBITSET( d1 ) << 3;
bits |= FLOATSIGNBITSET( d2 ) << 10;
bits |= FLOATSIGNBITSET( d3 ) << 11;
*(unsigned short *)(cullBits + i) = bits;
}
if ( numVerts & 1 ) {
byte bits;
float d0, d1;
const idPlane &p0 = planes[0];
const idPlane &p1 = planes[1];
const idVec3 &v0 = verts[numVerts - 1].xyz;
d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
texCoords[numVerts - 1][0] = d0;
texCoords[numVerts - 1][1] = d1;
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
d0 = 1.0f - d0;
d1 = 1.0f - d1;
bits |= FLOATSIGNBITSET( d0 ) << 2;
bits |= FLOATSIGNBITSET( d1 ) << 3;
cullBits[numVerts - 1] = bits;
}
#endif
}
/*
============
idSIMD_SSE::DeriveTriPlanes
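
planes[i] = plane with unit-length normal through the three vertices of
the i-th triangle in the index list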
============
*/
void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
#if 1
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
__asm {
mov eax, numIndexes
shl eax, 2
mov esi, verts
mov edi, indexes
mov edx, planes
add edi, eax
neg eax
add eax, 4*12
jge done4
loopPlane4:
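// four triangles per iteration: gather the two edge vectors one lane at a
// time, take their cross product, normalize with rsqrtps and store the
// plane distance as -(normal . first vertex)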
mov ebx, [edi+eax-4*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-4*12+0]
imul ecx, DRAWVERT_SIZE
movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
mov ebx, [edi+eax-4*12+8]
imul ebx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
mov ebx, [edi+eax-3*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-3*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm0, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm1, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm2, xmm6
mov ebx, [edi+eax-3*12+8]
imul ebx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm3, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm4, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm5, xmm7
mov ebx, [edi+eax-2*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-2*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm0, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm1, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm2, xmm6
mov ebx, [edi+eax-2*12+8]
imul ebx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm3, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm4, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm5, xmm7
mov ebx, [edi+eax-1*12+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax-1*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
shufps xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm0, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm1, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm2, xmm6
mov ebx, [edi+eax-1*12+8]
imul ebx, DRAWVERT_SIZE
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm3, xmm7
movss xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm4, xmm6
movss xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm5, xmm7
movaps xmm6, xmm4
mulps xmm6, xmm2
movaps xmm7, xmm5
mulps xmm7, xmm1
subps xmm6, xmm7
mulps xmm5, xmm0
mulps xmm2, xmm3
subps xmm5, xmm2
mulps xmm3, xmm1
mulps xmm4, xmm0
subps xmm3, xmm4
movaps xmm0, xmm6
mulps xmm6, xmm6
movaps xmm1, xmm5
mulps xmm5, xmm5
movaps xmm2, xmm3
mulps xmm3, xmm3
addps xmm3, xmm5
addps xmm3, xmm6
rsqrtps xmm3, xmm3
add edx, 4*16
mov ecx, [edi+eax-1*12+0]
imul ecx, DRAWVERT_SIZE
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movss [edx-1*16+0], xmm0
movss [edx-1*16+4], xmm1
movss [edx-1*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-1*16+12], xmm0
mov ecx, [edi+eax-2*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [edx-2*16+0], xmm0
movss [edx-2*16+4], xmm1
movss [edx-2*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-2*16+12], xmm0
mov ecx, [edi+eax-3*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [edx-3*16+0], xmm0
movss [edx-3*16+4], xmm1
movss [edx-3*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-3*16+12], xmm0
mov ecx, [edi+eax-4*12+0]
imul ecx, DRAWVERT_SIZE
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [edx-4*16+0], xmm0
movss [edx-4*16+4], xmm1
movss [edx-4*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-4*16+12], xmm0
add eax, 4*12
jle loopPlane4
done4:
sub eax, 4*12
jge done
loopPlane1:
mov ebx, [edi+eax+4]
imul ebx, DRAWVERT_SIZE
mov ecx, [edi+eax+0]
imul ecx, DRAWVERT_SIZE
movss xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
mov ebx, [edi+eax+8]
imul ebx, DRAWVERT_SIZE
movss xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
subss xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
movss xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
subss xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
movss xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
subss xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
movss xmm6, xmm4
mulss xmm6, xmm2
movss xmm7, xmm5
mulss xmm7, xmm1
subss xmm6, xmm7
mulss xmm5, xmm0
mulss xmm2, xmm3
subss xmm5, xmm2
mulss xmm3, xmm1
mulss xmm4, xmm0
subss xmm3, xmm4
movss xmm0, xmm6
mulss xmm6, xmm6
movss xmm1, xmm5
mulss xmm5, xmm5
movss xmm2, xmm3
mulss xmm3, xmm3
addss xmm3, xmm5
addss xmm3, xmm6
rsqrtss xmm3, xmm3
add edx, 1*16
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [edx-1*16+0], xmm0
movss [edx-1*16+4], xmm1
movss [edx-1*16+8], xmm2
mulss xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
mulss xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
mulss xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
xorps xmm0, SIMD_SP_singleSignBitMask
subss xmm0, xmm1
subss xmm0, xmm2
movss [edx-1*16+12], xmm0
add eax, 1*12
jl loopPlane1
done:
}
#else
int i, j;
for ( i = 0; i <= numIndexes - 12; i += 12 ) {
ALIGN16( float d0[4] );
ALIGN16( float d1[4] );
ALIGN16( float d2[4] );
ALIGN16( float d3[4] );
ALIGN16( float d4[4] );
ALIGN16( float d5[4] );
ALIGN16( float n0[4] );
ALIGN16( float n1[4] );
ALIGN16( float n2[4] );
for ( j = 0; j < 4; j++ ) {
const idDrawVert *a, *b, *c;
a = verts + indexes[i + j * 3 + 0];
b = verts + indexes[i + j * 3 + 1];
c = verts + indexes[i + j * 3 + 2];
d0[j] = b->xyz[0] - a->xyz[0];
d1[j] = b->xyz[1] - a->xyz[1];
d2[j] = b->xyz[2] - a->xyz[2];
d3[j] = c->xyz[0] - a->xyz[0];
d4[j] = c->xyz[1] - a->xyz[1];
d5[j] = c->xyz[2] - a->xyz[2];
}
ALIGN16( float tmp[4] );
n0[0] = d4[0] * d2[0];
n0[1] = d4[1] * d2[1];
n0[2] = d4[2] * d2[2];
n0[3] = d4[3] * d2[3];
n0[0] -= d5[0] * d1[0];
n0[1] -= d5[1] * d1[1];
n0[2] -= d5[2] * d1[2];
n0[3] -= d5[3] * d1[3];
n1[0] = d5[0] * d0[0];
n1[1] = d5[1] * d0[1];
n1[2] = d5[2] * d0[2];
n1[3] = d5[3] * d0[3];
n1[0] -= d3[0] * d2[0];
n1[1] -= d3[1] * d2[1];
n1[2] -= d3[2] * d2[2];
n1[3] -= d3[3] * d2[3];
n2[0] = d3[0] * d1[0];
n2[1] = d3[1] * d1[1];
n2[2] = d3[2] * d1[2];
n2[3] = d3[3] * d1[3];
n2[0] -= d4[0] * d0[0];
n2[1] -= d4[1] * d0[1];
n2[2] -= d4[2] * d0[2];
n2[3] -= d4[3] * d0[3];
tmp[0] = n0[0] * n0[0];
tmp[1] = n0[1] * n0[1];
tmp[2] = n0[2] * n0[2];
tmp[3] = n0[3] * n0[3];
tmp[0] += n1[0] * n1[0];
tmp[1] += n1[1] * n1[1];
tmp[2] += n1[2] * n1[2];
tmp[3] += n1[3] * n1[3];
tmp[0] += n2[0] * n2[0];
tmp[1] += n2[1] * n2[1];
tmp[2] += n2[2] * n2[2];
tmp[3] += n2[3] * n2[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
n0[0] *= tmp[0];
n0[1] *= tmp[1];
n0[2] *= tmp[2];
n0[3] *= tmp[3];
n1[0] *= tmp[0];
n1[1] *= tmp[1];
n1[2] *= tmp[2];
n1[3] *= tmp[3];
n2[0] *= tmp[0];
n2[1] *= tmp[1];
n2[2] *= tmp[2];
n2[3] *= tmp[3];
for ( j = 0; j < 4; j++ ) {
const idDrawVert *a;
a = verts + indexes[i + j * 3];
planes->Normal()[0] = n0[j];
planes->Normal()[1] = n1[j];
planes->Normal()[2] = n2[j];
planes->FitThroughPoint( a->xyz );
planes++;
}
}
for ( ; i < numIndexes; i += 3 ) {
const idDrawVert *a, *b, *c;
float d0, d1, d2, d3, d4, d5;
float n0, n1, n2;
a = verts + indexes[i + 0];
b = verts + indexes[i + 1];
c = verts + indexes[i + 2];
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
d3 = c->xyz[0] - a->xyz[0];
d4 = c->xyz[1] - a->xyz[1];
d5 = c->xyz[2] - a->xyz[2];
float tmp;
n0 = d4 * d2 - d5 * d1;
n1 = d5 * d0 - d3 * d2;
n2 = d3 * d1 - d4 * d0;
tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
n0 *= tmp;
n1 *= tmp;
n2 *= tmp;
planes->Normal()[0] = n0;
planes->Normal()[1] = n1;
planes->Normal()[2] = n2;
planes->FitThroughPoint( a->xyz );
planes++;
}
#endif
}
/*
============
idSIMD_SSE::DeriveTangents
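
computes one plane per triangle and accumulates the triangle normal and
the two texture-space tangents at every vertex the triangle references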
============
*/
//#define REFINE_TANGENT_SQUAREROOT
#define FIX_DEGENERATE_TANGENT
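// REFINE_TANGENT_SQUAREROOT refines each rsqrt estimate with one extra
// Newton-Raphson style step using the SIMD_SP_rsqrt_c0/c1 constants;
// FIX_DEGENERATE_TANGENT replaces a squared length of zero with a tiny
// positive value so the reciprocal square root stays finite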
void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
int i;
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
assert( planes != NULL );
assert( verts != NULL );
assert( numVerts >= 0 );
#ifdef REFINE_TANGENT_SQUAREROOT
__asm {
movaps xmm6, SIMD_SP_rsqrt_c0
movaps xmm7, SIMD_SP_rsqrt_c1
}
#endif
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
for ( i = 0; i <= numIndexes - 12; i += 12 ) {
idDrawVert *a, *b, *c;
ALIGN16( unsigned long signBit[4] );
ALIGN16( float d0[4] );
ALIGN16( float d1[4] );
ALIGN16( float d2[4] );
ALIGN16( float d3[4] );
ALIGN16( float d4[4] );
ALIGN16( float d5[4] );
ALIGN16( float d6[4] );
ALIGN16( float d7[4] );
ALIGN16( float d8[4] );
ALIGN16( float d9[4] );
ALIGN16( float n0[4] );
ALIGN16( float n1[4] );
ALIGN16( float n2[4] );
ALIGN16( float t0[4] );
ALIGN16( float t1[4] );
ALIGN16( float t2[4] );
ALIGN16( float t3[4] );
ALIGN16( float t4[4] );
ALIGN16( float t5[4] );
for ( int j = 0; j < 4; j++ ) {
a = verts + indexes[i + j * 3 + 0];
b = verts + indexes[i + j * 3 + 1];
c = verts + indexes[i + j * 3 + 2];
d0[j] = b->xyz[0] - a->xyz[0];
d1[j] = b->xyz[1] - a->xyz[1];
d2[j] = b->xyz[2] - a->xyz[2];
d3[j] = b->st[0] - a->st[0];
d4[j] = b->st[1] - a->st[1];
d5[j] = c->xyz[0] - a->xyz[0];
d6[j] = c->xyz[1] - a->xyz[1];
d7[j] = c->xyz[2] - a->xyz[2];
d8[j] = c->st[0] - a->st[0];
d9[j] = c->st[1] - a->st[1];
}
#if 1
__asm {
// normal
movaps xmm0, d6
mulps xmm0, d2
movaps xmm1, d7
mulps xmm1, d1
subps xmm0, xmm1
movaps xmm1, d7
mulps xmm1, d0
movaps xmm2, d5
mulps xmm2, d2
subps xmm1, xmm2
movaps xmm2, d5
mulps xmm2, d1
movaps xmm3, d6
mulps xmm3, d0
subps xmm2, xmm3
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
movaps n0, xmm0
mulps xmm1, xmm3
movaps n1, xmm1
mulps xmm2, xmm3
movaps n2, xmm2
// area sign bit
movaps xmm0, d3
mulps xmm0, d9
movaps xmm1, d4
mulps xmm1, d8
subps xmm0, xmm1
andps xmm0, SIMD_SP_signBitMask
movaps signBit, xmm0
// first tangent
movaps xmm0, d0
mulps xmm0, d9
movaps xmm1, d4
mulps xmm1, d5
subps xmm0, xmm1
movaps xmm1, d1
mulps xmm1, d9
movaps xmm2, d4
mulps xmm2, d6
subps xmm1, xmm2
movaps xmm2, d2
mulps xmm2, d9
movaps xmm3, d4
mulps xmm3, d7
subps xmm2, xmm3
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
xorps xmm3, signBit
mulps xmm0, xmm3
movaps t0, xmm0
mulps xmm1, xmm3
movaps t1, xmm1
mulps xmm2, xmm3
movaps t2, xmm2
// second tangent
movaps xmm0, d3
mulps xmm0, d5
movaps xmm1, d0
mulps xmm1, d8
subps xmm0, xmm1
movaps xmm1, d3
mulps xmm1, d6
movaps xmm2, d1
mulps xmm2, d8
subps xmm1, xmm2
movaps xmm2, d3
mulps xmm2, d7
movaps xmm3, d2
mulps xmm3, d8
subps xmm2, xmm3
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
xorps xmm3, signBit
mulps xmm0, xmm3
movaps t3, xmm0
mulps xmm1, xmm3
movaps t4, xmm1
mulps xmm2, xmm3
movaps t5, xmm2
}
#else
ALIGN16( float tmp[4] );
// normal
n0[0] = d6[0] * d2[0];
n0[1] = d6[1] * d2[1];
n0[2] = d6[2] * d2[2];
n0[3] = d6[3] * d2[3];
n0[0] -= d7[0] * d1[0];
n0[1] -= d7[1] * d1[1];
n0[2] -= d7[2] * d1[2];
n0[3] -= d7[3] * d1[3];
n1[0] = d7[0] * d0[0];
n1[1] = d7[1] * d0[1];
n1[2] = d7[2] * d0[2];
n1[3] = d7[3] * d0[3];
n1[0] -= d5[0] * d2[0];
n1[1] -= d5[1] * d2[1];
n1[2] -= d5[2] * d2[2];
n1[3] -= d5[3] * d2[3];
n2[0] = d5[0] * d1[0];
n2[1] = d5[1] * d1[1];
n2[2] = d5[2] * d1[2];
n2[3] = d5[3] * d1[3];
n2[0] -= d6[0] * d0[0];
n2[1] -= d6[1] * d0[1];
n2[2] -= d6[2] * d0[2];
n2[3] -= d6[3] * d0[3];
tmp[0] = n0[0] * n0[0];
tmp[1] = n0[1] * n0[1];
tmp[2] = n0[2] * n0[2];
tmp[3] = n0[3] * n0[3];
tmp[0] += n1[0] * n1[0];
tmp[1] += n1[1] * n1[1];
tmp[2] += n1[2] * n1[2];
tmp[3] += n1[3] * n1[3];
tmp[0] += n2[0] * n2[0];
tmp[1] += n2[1] * n2[1];
tmp[2] += n2[2] * n2[2];
tmp[3] += n2[3] * n2[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
n0[0] *= tmp[0];
n0[1] *= tmp[1];
n0[2] *= tmp[2];
n0[3] *= tmp[3];
n1[0] *= tmp[0];
n1[1] *= tmp[1];
n1[2] *= tmp[2];
n1[3] *= tmp[3];
n2[0] *= tmp[0];
n2[1] *= tmp[1];
n2[2] *= tmp[2];
n2[3] *= tmp[3];
// area sign bit
tmp[0] = d3[0] * d9[0];
tmp[1] = d3[1] * d9[1];
tmp[2] = d3[2] * d9[2];
tmp[3] = d3[3] * d9[3];
tmp[0] -= d4[0] * d8[0];
tmp[1] -= d4[1] * d8[1];
tmp[2] -= d4[2] * d8[2];
tmp[3] -= d4[3] * d8[3];
signBit[0] = ( *(unsigned long *)&tmp[0] ) & ( 1 << 31 );
signBit[1] = ( *(unsigned long *)&tmp[1] ) & ( 1 << 31 );
signBit[2] = ( *(unsigned long *)&tmp[2] ) & ( 1 << 31 );
signBit[3] = ( *(unsigned long *)&tmp[3] ) & ( 1 << 31 );
// first tangent
t0[0] = d0[0] * d9[0];
t0[1] = d0[1] * d9[1];
t0[2] = d0[2] * d9[2];
t0[3] = d0[3] * d9[3];
t0[0] -= d4[0] * d5[0];
t0[1] -= d4[1] * d5[1];
t0[2] -= d4[2] * d5[2];
t0[3] -= d4[3] * d5[3];
t1[0] = d1[0] * d9[0];
t1[1] = d1[1] * d9[1];
t1[2] = d1[2] * d9[2];
t1[3] = d1[3] * d9[3];
t1[0] -= d4[0] * d6[0];
t1[1] -= d4[1] * d6[1];
t1[2] -= d4[2] * d6[2];
t1[3] -= d4[3] * d6[3];
t2[0] = d2[0] * d9[0];
t2[1] = d2[1] * d9[1];
t2[2] = d2[2] * d9[2];
t2[3] = d2[3] * d9[3];
t2[0] -= d4[0] * d7[0];
t2[1] -= d4[1] * d7[1];
t2[2] -= d4[2] * d7[2];
t2[3] -= d4[3] * d7[3];
tmp[0] = t0[0] * t0[0];
tmp[1] = t0[1] * t0[1];
tmp[2] = t0[2] * t0[2];
tmp[3] = t0[3] * t0[3];
tmp[0] += t1[0] * t1[0];
tmp[1] += t1[1] * t1[1];
tmp[2] += t1[2] * t1[2];
tmp[3] += t1[3] * t1[3];
tmp[0] += t2[0] * t2[0];
tmp[1] += t2[1] * t2[1];
tmp[2] += t2[2] * t2[2];
tmp[3] += t2[3] * t2[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
*(unsigned long *)&tmp[0] ^= signBit[0];
*(unsigned long *)&tmp[1] ^= signBit[1];
*(unsigned long *)&tmp[2] ^= signBit[2];
*(unsigned long *)&tmp[3] ^= signBit[3];
t0[0] *= tmp[0];
t0[1] *= tmp[1];
t0[2] *= tmp[2];
t0[3] *= tmp[3];
t1[0] *= tmp[0];
t1[1] *= tmp[1];
t1[2] *= tmp[2];
t1[3] *= tmp[3];
t2[0] *= tmp[0];
t2[1] *= tmp[1];
t2[2] *= tmp[2];
t2[3] *= tmp[3];
// second tangent
t3[0] = d3[0] * d5[0];
t3[1] = d3[1] * d5[1];
t3[2] = d3[2] * d5[2];
t3[3] = d3[3] * d5[3];
t3[0] -= d0[0] * d8[0];
t3[1] -= d0[1] * d8[1];
t3[2] -= d0[2] * d8[2];
t3[3] -= d0[3] * d8[3];
t4[0] = d3[0] * d6[0];
t4[1] = d3[1] * d6[1];
t4[2] = d3[2] * d6[2];
t4[3] = d3[3] * d6[3];
t4[0] -= d1[0] * d8[0];
t4[1] -= d1[1] * d8[1];
t4[2] -= d1[2] * d8[2];
t4[3] -= d1[3] * d8[3];
t5[0] = d3[0] * d7[0];
t5[1] = d3[1] * d7[1];
t5[2] = d3[2] * d7[2];
t5[3] = d3[3] * d7[3];
t5[0] -= d2[0] * d8[0];
t5[1] -= d2[1] * d8[1];
t5[2] -= d2[2] * d8[2];
t5[3] -= d2[3] * d8[3];
tmp[0] = t3[0] * t3[0];
tmp[1] = t3[1] * t3[1];
tmp[2] = t3[2] * t3[2];
tmp[3] = t3[3] * t3[3];
tmp[0] += t4[0] * t4[0];
tmp[1] += t4[1] * t4[1];
tmp[2] += t4[2] * t4[2];
tmp[3] += t4[3] * t4[3];
tmp[0] += t5[0] * t5[0];
tmp[1] += t5[1] * t5[1];
tmp[2] += t5[2] * t5[2];
tmp[3] += t5[3] * t5[3];
tmp[0] = idMath::RSqrt( tmp[0] );
tmp[1] = idMath::RSqrt( tmp[1] );
tmp[2] = idMath::RSqrt( tmp[2] );
tmp[3] = idMath::RSqrt( tmp[3] );
*(unsigned long *)&tmp[0] ^= signBit[0];
*(unsigned long *)&tmp[1] ^= signBit[1];
*(unsigned long *)&tmp[2] ^= signBit[2];
*(unsigned long *)&tmp[3] ^= signBit[3];
t3[0] *= tmp[0];
t3[1] *= tmp[1];
t3[2] *= tmp[2];
t3[3] *= tmp[3];
t4[0] *= tmp[0];
t4[1] *= tmp[1];
t4[2] *= tmp[2];
t4[3] *= tmp[3];
t5[0] *= tmp[0];
t5[1] *= tmp[1];
t5[2] *= tmp[2];
t5[3] *= tmp[3];
#endif
for ( int j = 0; j < 4; j++ ) {
const int v0 = indexes[i + j * 3 + 0];
const int v1 = indexes[i + j * 3 + 1];
const int v2 = indexes[i + j * 3 + 2];
a = verts + v0;
b = verts + v1;
c = verts + v2;
planes->Normal()[0] = n0[j];
planes->Normal()[1] = n1[j];
planes->Normal()[2] = n2[j];
planes->FitThroughPoint( a->xyz );
planes++;
if ( used[v0] ) {
a->normal[0] += n0[j];
a->normal[1] += n1[j];
a->normal[2] += n2[j];
a->tangents[0][0] += t0[j];
a->tangents[0][1] += t1[j];
a->tangents[0][2] += t2[j];
a->tangents[1][0] += t3[j];
a->tangents[1][1] += t4[j];
a->tangents[1][2] += t5[j];
} else {
a->normal[0] = n0[j];
a->normal[1] = n1[j];
a->normal[2] = n2[j];
a->tangents[0][0] = t0[j];
a->tangents[0][1] = t1[j];
a->tangents[0][2] = t2[j];
a->tangents[1][0] = t3[j];
a->tangents[1][1] = t4[j];
a->tangents[1][2] = t5[j];
used[v0] = true;
}
if ( used[v1] ) {
b->normal[0] += n0[j];
b->normal[1] += n1[j];
b->normal[2] += n2[j];
b->tangents[0][0] += t0[j];
b->tangents[0][1] += t1[j];
b->tangents[0][2] += t2[j];
b->tangents[1][0] += t3[j];
b->tangents[1][1] += t4[j];
b->tangents[1][2] += t5[j];
} else {
b->normal[0] = n0[j];
b->normal[1] = n1[j];
b->normal[2] = n2[j];
b->tangents[0][0] = t0[j];
b->tangents[0][1] = t1[j];
b->tangents[0][2] = t2[j];
b->tangents[1][0] = t3[j];
b->tangents[1][1] = t4[j];
b->tangents[1][2] = t5[j];
used[v1] = true;
}
if ( used[v2] ) {
c->normal[0] += n0[j];
c->normal[1] += n1[j];
c->normal[2] += n2[j];
c->tangents[0][0] += t0[j];
c->tangents[0][1] += t1[j];
c->tangents[0][2] += t2[j];
c->tangents[1][0] += t3[j];
c->tangents[1][1] += t4[j];
c->tangents[1][2] += t5[j];
} else {
c->normal[0] = n0[j];
c->normal[1] = n1[j];
c->normal[2] = n2[j];
c->tangents[0][0] = t0[j];
c->tangents[0][1] = t1[j];
c->tangents[0][2] = t2[j];
c->tangents[1][0] = t3[j];
c->tangents[1][1] = t4[j];
c->tangents[1][2] = t5[j];
used[v2] = true;
}
}
}
for ( ; i < numIndexes; i += 3 ) {
idDrawVert *a, *b, *c;
ALIGN16( unsigned long signBit[4] );
float d0, d1, d2, d3, d4;
float d5, d6, d7, d8, d9;
float n0, n1, n2;
float t0, t1, t2;
float t3, t4, t5;
const int v0 = indexes[i + 0];
const int v1 = indexes[i + 1];
const int v2 = indexes[i + 2];
a = verts + v0;
b = verts + v1;
c = verts + v2;
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
d3 = b->st[0] - a->st[0];
d4 = b->st[1] - a->st[1];
d5 = c->xyz[0] - a->xyz[0];
d6 = c->xyz[1] - a->xyz[1];
d7 = c->xyz[2] - a->xyz[2];
d8 = c->st[0] - a->st[0];
d9 = c->st[1] - a->st[1];
#if 1
__asm {
// normal
movss xmm0, d6
mulss xmm0, d2
movss xmm1, d7
mulss xmm1, d1
subss xmm0, xmm1
movss xmm1, d7
mulss xmm1, d0
movss xmm2, d5
mulss xmm2, d2
subss xmm1, xmm2
movss xmm2, d5
mulss xmm2, d1
movss xmm3, d6
mulss xmm3, d0
subss xmm2, xmm3
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
movss n0, xmm0
mulss xmm1, xmm3
movss n1, xmm1
mulss xmm2, xmm3
movss n2, xmm2
// area sign bit
movss xmm0, d3
mulss xmm0, d9
movss xmm1, d4
mulss xmm1, d8
subss xmm0, xmm1
andps xmm0, SIMD_SP_signBitMask
movaps signBit, xmm0
// first tangent
movss xmm0, d0
mulss xmm0, d9
movss xmm1, d4
mulss xmm1, d5
subss xmm0, xmm1
movss xmm1, d1
mulss xmm1, d9
movss xmm2, d4
mulss xmm2, d6
subss xmm1, xmm2
movss xmm2, d2
mulss xmm2, d9
movss xmm3, d4
mulss xmm3, d7
subss xmm2, xmm3
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
xorps xmm3, signBit
mulss xmm0, xmm3
movss t0, xmm0
mulss xmm1, xmm3
movss t1, xmm1
mulss xmm2, xmm3
movss t2, xmm2
// second tangent
movss xmm0, d3
mulss xmm0, d5
movss xmm1, d0
mulss xmm1, d8
subss xmm0, xmm1
movss xmm1, d3
mulss xmm1, d6
movss xmm2, d1
mulss xmm2, d8
subss xmm1, xmm2
movss xmm2, d3
mulss xmm2, d7
movss xmm3, d2
mulss xmm3, d8
subss xmm2, xmm3
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef FIX_DEGENERATE_TANGENT
xorps xmm4, xmm4
cmpeqps xmm4, xmm3
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
andps xmm3, SIMD_SP_absMask // make sure the values are positive
orps xmm3, xmm4
#endif
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
xorps xmm3, signBit
mulss xmm0, xmm3
movss t3, xmm0
mulss xmm1, xmm3
movss t4, xmm1
mulss xmm2, xmm3
movss t5, xmm2
}
#else
float tmp;
// normal
n0 = d6 * d2 - d7 * d1;
n1 = d7 * d0 - d5 * d2;
n2 = d5 * d1 - d6 * d0;
tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
n0 *= tmp;
n1 *= tmp;
n2 *= tmp;
// area sign bit
tmp = d3 * d9 - d4 * d8;
signBit[0] = ( *(unsigned long *)&tmp ) & ( 1 << 31 );
// first tangent
t0 = d0 * d9 - d4 * d5;
t1 = d1 * d9 - d4 * d6;
t2 = d2 * d9 - d4 * d7;
tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
*(unsigned long *)&tmp ^= signBit[0];
t0 *= tmp;
t1 *= tmp;
t2 *= tmp;
// second tangent
t3 = d3 * d5 - d0 * d8;
t4 = d3 * d6 - d1 * d8;
t5 = d3 * d7 - d2 * d8;
tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
*(unsigned long *)&tmp ^= signBit[0];
t3 *= tmp;
t4 *= tmp;
t5 *= tmp;
#endif
planes->Normal()[0] = n0;
planes->Normal()[1] = n1;
planes->Normal()[2] = n2;
planes->FitThroughPoint( a->xyz );
planes++;
if ( used[v0] ) {
a->normal[0] += n0;
a->normal[1] += n1;
a->normal[2] += n2;
a->tangents[0][0] += t0;
a->tangents[0][1] += t1;
a->tangents[0][2] += t2;
a->tangents[1][0] += t3;
a->tangents[1][1] += t4;
a->tangents[1][2] += t5;
} else {
a->normal[0] = n0;
a->normal[1] = n1;
a->normal[2] = n2;
a->tangents[0][0] = t0;
a->tangents[0][1] = t1;
a->tangents[0][2] = t2;
a->tangents[1][0] = t3;
a->tangents[1][1] = t4;
a->tangents[1][2] = t5;
used[v0] = true;
}
if ( used[v1] ) {
b->normal[0] += n0;
b->normal[1] += n1;
b->normal[2] += n2;
b->tangents[0][0] += t0;
b->tangents[0][1] += t1;
b->tangents[0][2] += t2;
b->tangents[1][0] += t3;
b->tangents[1][1] += t4;
b->tangents[1][2] += t5;
} else {
b->normal[0] = n0;
b->normal[1] = n1;
b->normal[2] = n2;
b->tangents[0][0] = t0;
b->tangents[0][1] = t1;
b->tangents[0][2] = t2;
b->tangents[1][0] = t3;
b->tangents[1][1] = t4;
b->tangents[1][2] = t5;
used[v1] = true;
}
if ( used[v2] ) {
c->normal[0] += n0;
c->normal[1] += n1;
c->normal[2] += n2;
c->tangents[0][0] += t0;
c->tangents[0][1] += t1;
c->tangents[0][2] += t2;
c->tangents[1][0] += t3;
c->tangents[1][1] += t4;
c->tangents[1][2] += t5;
} else {
c->normal[0] = n0;
c->normal[1] = n1;
c->normal[2] = n2;
c->tangents[0][0] = t0;
c->tangents[0][1] = t1;
c->tangents[0][2] = t2;
c->tangents[1][0] = t3;
c->tangents[1][1] = t4;
c->tangents[1][2] = t5;
used[v2] = true;
}
}
}
/*
============
idSIMD_SSE::DeriveUnsmoothedTangents
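
verts[i] receives the normal and tangents of its dominant triangle, scaled
by the precomputed dominantTris[i].normalizationScale factors instead of
being normalized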
============
*/
#define DERIVE_UNSMOOTHED_BITANGENT
void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
int i, j;
for ( i = 0; i <= numVerts - 4; i += 4 ) {
ALIGN16( float s0[4] );
ALIGN16( float s1[4] );
ALIGN16( float s2[4] );
ALIGN16( float d0[4] );
ALIGN16( float d1[4] );
ALIGN16( float d2[4] );
ALIGN16( float d3[4] );
ALIGN16( float d4[4] );
ALIGN16( float d5[4] );
ALIGN16( float d6[4] );
ALIGN16( float d7[4] );
ALIGN16( float d8[4] );
ALIGN16( float d9[4] );
ALIGN16( float n0[4] );
ALIGN16( float n1[4] );
ALIGN16( float n2[4] );
ALIGN16( float t0[4] );
ALIGN16( float t1[4] );
ALIGN16( float t2[4] );
ALIGN16( float t3[4] );
ALIGN16( float t4[4] );
ALIGN16( float t5[4] );
for ( j = 0; j < 4; j++ ) {
const idDrawVert *a, *b, *c;
const dominantTri_s &dt = dominantTris[i+j];
s0[j] = dt.normalizationScale[0];
s1[j] = dt.normalizationScale[1];
s2[j] = dt.normalizationScale[2];
a = verts + i + j;
b = verts + dt.v2;
c = verts + dt.v3;
d0[j] = b->xyz[0] - a->xyz[0];
d1[j] = b->xyz[1] - a->xyz[1];
d2[j] = b->xyz[2] - a->xyz[2];
d3[j] = b->st[0] - a->st[0];
d4[j] = b->st[1] - a->st[1];
d5[j] = c->xyz[0] - a->xyz[0];
d6[j] = c->xyz[1] - a->xyz[1];
d7[j] = c->xyz[2] - a->xyz[2];
d8[j] = c->st[0] - a->st[0];
d9[j] = c->st[1] - a->st[1];
}
#if 1
__asm {
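// normal = s2 * ( c - a ).Cross( b - a ); the first tangent is scaled by
// s0 and the second tangent (bitangent) by s1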
movaps xmm0, d6
mulps xmm0, d2
movaps xmm1, d7
mulps xmm1, d1
movaps xmm2, d7
mulps xmm2, d0
movaps xmm3, d5
mulps xmm3, d2
movaps xmm4, d5
mulps xmm4, d1
movaps xmm5, d6
mulps xmm5, d0
subps xmm0, xmm1
subps xmm2, xmm3
movaps xmm7, s2
subps xmm4, xmm5
mulps xmm0, xmm7
movaps n0, xmm0
mulps xmm2, xmm7
movaps n1, xmm2
mulps xmm4, xmm7
movaps n2, xmm4
movaps xmm0, d0
mulps xmm0, d9
movaps xmm1, d4
mulps xmm1, d5
movaps xmm2, d1
mulps xmm2, d9
movaps xmm3, d4
mulps xmm3, d6
movaps xmm4, d2
mulps xmm4, d9
movaps xmm5, d4
mulps xmm5, d7
subps xmm0, xmm1
subps xmm2, xmm3
movaps xmm7, s0
subps xmm4, xmm5
mulps xmm0, xmm7
movaps t0, xmm0
mulps xmm2, xmm7
movaps t1, xmm2
mulps xmm4, xmm7
movaps t2, xmm4
#ifndef DERIVE_UNSMOOTHED_BITANGENT
movaps xmm0, d3
mulps xmm0, d5
movaps xmm1, d0
mulps xmm1, d8
movaps xmm2, d3
mulps xmm2, d6
movaps xmm3, d1
mulps xmm3, d8
movaps xmm4, d3
mulps xmm4, d7
movaps xmm5, d2
mulps xmm5, d8
#else
movaps xmm0, n2
mulps xmm0, t1
movaps xmm1, n1
mulps xmm1, t2
movaps xmm2, n0
mulps xmm2, t2
movaps xmm3, n2
mulps xmm3, t0
movaps xmm4, n1
mulps xmm4, t0
movaps xmm5, n0
mulps xmm5, t1
#endif
subps xmm0, xmm1
subps xmm2, xmm3
movaps xmm7, s1
subps xmm4, xmm5
mulps xmm0, xmm7
movaps t3, xmm0
mulps xmm2, xmm7
movaps t4, xmm2
mulps xmm4, xmm7
movaps t5, xmm4
}
#else
n0[0] = d6[0] * d2[0];
n0[1] = d6[1] * d2[1];
n0[2] = d6[2] * d2[2];
n0[3] = d6[3] * d2[3];
n1[0] = d7[0] * d0[0];
n1[1] = d7[1] * d0[1];
n1[2] = d7[2] * d0[2];
n1[3] = d7[3] * d0[3];
n2[0] = d5[0] * d1[0];
n2[1] = d5[1] * d1[1];
n2[2] = d5[2] * d1[2];
n2[3] = d5[3] * d1[3];
n0[0] -= d7[0] * d1[0];
n0[1] -= d7[1] * d1[1];
n0[2] -= d7[2] * d1[2];
n0[3] -= d7[3] * d1[3];
n1[0] -= d5[0] * d2[0];
n1[1] -= d5[1] * d2[1];
n1[2] -= d5[2] * d2[2];
n1[3] -= d5[3] * d2[3];
n2[0] -= d6[0] * d0[0];
n2[1] -= d6[1] * d0[1];
n2[2] -= d6[2] * d0[2];
n2[3] -= d6[3] * d0[3];
n0[0] *= s2[0];
n0[1] *= s2[1];
n0[2] *= s2[2];
n0[3] *= s2[3];
n1[0] *= s2[0];
n1[1] *= s2[1];
n1[2] *= s2[2];
n1[3] *= s2[3];
n2[0] *= s2[0];
n2[1] *= s2[1];
n2[2] *= s2[2];
n2[3] *= s2[3];
t0[0] = d0[0] * d9[0];
t0[1] = d0[1] * d9[1];
t0[2] = d0[2] * d9[2];
t0[3] = d0[3] * d9[3];
t1[0] = d1[0] * d9[0];
t1[1] = d1[1] * d9[1];
t1[2] = d1[2] * d9[2];
t1[3] = d1[3] * d9[3];
t2[0] = d2[0] * d9[0];
t2[1] = d2[1] * d9[1];
t2[2] = d2[2] * d9[2];
t2[3] = d2[3] * d9[3];
t0[0] -= d4[0] * d5[0];
t0[1] -= d4[1] * d5[1];
t0[2] -= d4[2] * d5[2];
t0[3] -= d4[3] * d5[3];
t1[0] -= d4[0] * d6[0];
t1[1] -= d4[1] * d6[1];
t1[2] -= d4[2] * d6[2];
t1[3] -= d4[3] * d6[3];
t2[0] -= d4[0] * d7[0];
t2[1] -= d4[1] * d7[1];
t2[2] -= d4[2] * d7[2];
t2[3] -= d4[3] * d7[3];
t0[0] *= s0[0];
t0[1] *= s0[1];
t0[2] *= s0[2];
t0[3] *= s0[3];
t1[0] *= s0[0];
t1[1] *= s0[1];
t1[2] *= s0[2];
t1[3] *= s0[3];
t2[0] *= s0[0];
t2[1] *= s0[1];
t2[2] *= s0[2];
t2[3] *= s0[3];
#ifndef DERIVE_UNSMOOTHED_BITANGENT
t3[0] = d3[0] * d5[0];
t3[1] = d3[1] * d5[1];
t3[2] = d3[2] * d5[2];
t3[3] = d3[3] * d5[3];
t4[0] = d3[0] * d6[0];
t4[1] = d3[1] * d6[1];
t4[2] = d3[2] * d6[2];
t4[3] = d3[3] * d6[3];
t5[0] = d3[0] * d7[0];
t5[1] = d3[1] * d7[1];
t5[2] = d3[2] * d7[2];
t5[3] = d3[3] * d7[3];
t3[0] -= d0[0] * d8[0];
t3[1] -= d0[1] * d8[1];
t3[2] -= d0[2] * d8[2];
t3[3] -= d0[3] * d8[3];
t4[0] -= d1[0] * d8[0];
t4[1] -= d1[1] * d8[1];
t4[2] -= d1[2] * d8[2];
t4[3] -= d1[3] * d8[3];
t5[0] -= d2[0] * d8[0];
t5[1] -= d2[1] * d8[1];
t5[2] -= d2[2] * d8[2];
t5[3] -= d2[3] * d8[3];
#else
t3[0] = n2[0] * t1[0];
t3[1] = n2[1] * t1[1];
t3[2] = n2[2] * t1[2];
t3[3] = n2[3] * t1[3];
t4[0] = n0[0] * t2[0];
t4[1] = n0[1] * t2[1];
t4[2] = n0[2] * t2[2];
t4[3] = n0[3] * t2[3];
t5[0] = n1[0] * t0[0];
t5[1] = n1[1] * t0[1];
t5[2] = n1[2] * t0[2];
t5[3] = n1[3] * t0[3];
t3[0] -= n1[0] * t2[0];
t3[1] -= n1[1] * t2[1];
t3[2] -= n1[2] * t2[2];
t3[3] -= n1[3] * t2[3];
t4[0] -= n2[0] * t0[0];
t4[1] -= n2[1] * t0[1];
t4[2] -= n2[2] * t0[2];
t4[3] -= n2[3] * t0[3];
t5[0] -= n0[0] * t1[0];
t5[1] -= n0[1] * t1[1];
t5[2] -= n0[2] * t1[2];
t5[3] -= n0[3] * t1[3];
#endif
t3[0] *= s1[0];
t3[1] *= s1[1];
t3[2] *= s1[2];
t3[3] *= s1[3];
t4[0] *= s1[0];
t4[1] *= s1[1];
t4[2] *= s1[2];
t4[3] *= s1[3];
t5[0] *= s1[0];
t5[1] *= s1[1];
t5[2] *= s1[2];
t5[3] *= s1[3];
#endif
for ( j = 0; j < 4; j++ ) {
idDrawVert *a;
a = verts + i + j;
a->normal[0] = n0[j];
a->normal[1] = n1[j];
a->normal[2] = n2[j];
a->tangents[0][0] = t0[j];
a->tangents[0][1] = t1[j];
a->tangents[0][2] = t2[j];
a->tangents[1][0] = t3[j];
a->tangents[1][1] = t4[j];
a->tangents[1][2] = t5[j];
}
}
for ( ; i < numVerts; i++ ) {
idDrawVert *a, *b, *c;
float d0, d1, d2, d3, d4;
float d5, d6, d7, d8, d9;
float s0, s1, s2;
float n0, n1, n2;
float t0, t1, t2;
float t3, t4, t5;
const dominantTri_s &dt = dominantTris[i];
s0 = dt.normalizationScale[0];
s1 = dt.normalizationScale[1];
s2 = dt.normalizationScale[2];
a = verts + i;
b = verts + dt.v2;
c = verts + dt.v3;
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
d3 = b->st[0] - a->st[0];
d4 = b->st[1] - a->st[1];
d5 = c->xyz[0] - a->xyz[0];
d6 = c->xyz[1] - a->xyz[1];
d7 = c->xyz[2] - a->xyz[2];
d8 = c->st[0] - a->st[0];
d9 = c->st[1] - a->st[1];
#if 1
__asm {
movss xmm0, d6
mulss xmm0, d2
movss xmm1, d7
mulss xmm1, d1
movss xmm2, d7
mulss xmm2, d0
movss xmm3, d5
mulss xmm3, d2
movss xmm4, d5
mulss xmm4, d1
movss xmm5, d6
mulss xmm5, d0
subss xmm0, xmm1
subss xmm2, xmm3
movss xmm7, s2
subss xmm4, xmm5
mulss xmm0, xmm7
movss n0, xmm0
mulss xmm2, xmm7
movss n1, xmm2
mulss xmm4, xmm7
movss n2, xmm4
movss xmm0, d0
mulss xmm0, d9
movss xmm1, d4
mulss xmm1, d5
movss xmm2, d1
mulss xmm2, d9
movss xmm3, d4
mulss xmm3, d6
movss xmm4, d2
mulss xmm4, d9
movss xmm5, d4
mulss xmm5, d7
subss xmm0, xmm1
subss xmm2, xmm3
movss xmm7, s0
subss xmm4, xmm5
mulss xmm0, xmm7
movss t0, xmm0
mulss xmm2, xmm7
movss t1, xmm2
mulss xmm4, xmm7
movss t2, xmm4
#ifndef DERIVE_UNSMOOTHED_BITANGENT
movss xmm0, d3
mulss xmm0, d5
movss xmm1, d0
mulss xmm1, d8
movss xmm2, d3
mulss xmm2, d6
movss xmm3, d1
mulss xmm3, d8
movss xmm4, d3
mulss xmm4, d7
movss xmm5, d2
mulss xmm5, d8
#else
movss xmm0, n2
mulss xmm0, t1
movss xmm1, n1
mulss xmm1, t2
movss xmm2, n0
mulss xmm2, t2
movss xmm3, n2
mulss xmm3, t0
movss xmm4, n1
mulss xmm4, t0
movss xmm5, n0
mulss xmm5, t1
#endif
subss xmm0, xmm1
subss xmm2, xmm3
movss xmm7, s1
subss xmm4, xmm5
mulss xmm0, xmm7
movss t3, xmm0
mulss xmm2, xmm7
movss t4, xmm2
mulss xmm4, xmm7
movss t5, xmm4
}
#else
n0 = s2 * ( d6 * d2 - d7 * d1 );
n1 = s2 * ( d7 * d0 - d5 * d2 );
n2 = s2 * ( d5 * d1 - d6 * d0 );
t0 = s0 * ( d0 * d9 - d4 * d5 );
t1 = s0 * ( d1 * d9 - d4 * d6 );
t2 = s0 * ( d2 * d9 - d4 * d7 );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
t3 = s1 * ( d3 * d5 - d0 * d8 );
t4 = s1 * ( d3 * d6 - d1 * d8 );
t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
t3 = s1 * ( n2 * t1 - n1 * t2 );
t4 = s1 * ( n0 * t2 - n2 * t0 );
t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif
#endif
a->normal[0] = n0;
a->normal[1] = n1;
a->normal[2] = n2;
a->tangents[0][0] = t0;
a->tangents[0][1] = t1;
a->tangents[0][2] = t2;
a->tangents[1][0] = t3;
a->tangents[1][1] = t4;
a->tangents[1][2] = t5;
}
}
/*
============
idSIMD_SSE::NormalizeTangents
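
normalizes the vertex normals and projects the tangents onto the plane
orthogonal to the vertex normal before normalizing them as well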
============
*/
void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
ALIGN16( float normal[12] );
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
assert( verts != NULL );
assert( numVerts >= 0 );
__asm {
mov eax, numVerts
test eax, eax
jz done
#ifdef REFINE_TANGENT_SQUAREROOT
movaps xmm6, SIMD_SP_rsqrt_c0
movaps xmm7, SIMD_SP_rsqrt_c1
#endif
mov esi, verts
imul eax, DRAWVERT_SIZE
add esi, eax
neg eax
add eax, DRAWVERT_SIZE*4
jle loopVert4
sub eax, DRAWVERT_SIZE*4
jl loopVert1
loopVert4:
sub eax, DRAWVERT_SIZE*4
// normalize 4 idDrawVert::normal
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0] // 0, X, X, X
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0] // 0, X, 3, 4
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8] // 5, X, X, X
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4] // 5, X, 1, 2
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0] // 6, X, X, X
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0] // 6, X, 9, 10
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8] // 11, X, X, X
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4] // 11, X, 7, 8
movaps xmm1, xmm0
movaps xmm5, xmm2
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
// save the 4 idDrawVert::normal to project the tangents
movaps [normal+ 0], xmm0
movaps [normal+16], xmm1
movaps [normal+32], xmm2
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2
// project and normalize 4 idDrawVert::tangent[0]
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, X, X
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, 3, 4
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8] // 5, X, X, X
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4] // 5, X, 1, 2
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, X, X
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, 9, 10
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8] // 11, X, X, X
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4] // 11, X, 7, 8
movaps xmm1, xmm0
movaps xmm5, xmm2
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
addps xmm3, xmm4
addps xmm3, xmm5
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
subps xmm0, xmm3
subps xmm1, xmm4
subps xmm2, xmm5
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2
// project and normalize 4 idDrawVert::tangent[1]
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, X, X
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, 3, 4
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8] // 5, X, X, X
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4] // 5, X, 1, 2
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, X, X
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, 9, 10
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8] // 11, X, X, X
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4] // 11, X, 7, 8
movaps xmm1, xmm0
movaps xmm5, xmm2
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
addps xmm3, xmm4
addps xmm3, xmm5
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [normal+ 0]
mulps xmm4, [normal+16]
mulps xmm5, [normal+32]
subps xmm0, xmm3
subps xmm1, xmm4
subps xmm2, xmm5
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
mulps xmm3, xmm3
mulps xmm4, xmm4
mulps xmm5, xmm5
addps xmm3, xmm4
addps xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtps xmm4, xmm3
mulps xmm3, xmm4
mulps xmm3, xmm4
subps xmm3, xmm6
mulps xmm4, xmm7
mulps xmm3, xmm4
#else
rsqrtps xmm3, xmm3
#endif
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2
add eax, DRAWVERT_SIZE*8
jle loopVert4
sub eax, DRAWVERT_SIZE*4
jge done
loopVert1:
// normalize one idDrawVert::normal
movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2
// project and normalize one idDrawVert::tangent[0]
movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
addss xmm3, xmm4
addss xmm3, xmm5
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
subss xmm0, xmm3
subss xmm1, xmm4
subss xmm2, xmm5
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2
// project and normalize one idDrawVert::tangent[1]
movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
addss xmm3, xmm4
addss xmm3, xmm5
movss xmm4, xmm3
movss xmm5, xmm3
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
subss xmm0, xmm3
subss xmm1, xmm4
subss xmm2, xmm5
movss xmm3, xmm0
movss xmm4, xmm1
movss xmm5, xmm2
mulss xmm3, xmm3
mulss xmm4, xmm4
mulss xmm5, xmm5
addss xmm3, xmm4
addss xmm3, xmm5
#ifdef REFINE_TANGENT_SQUAREROOT
rsqrtss xmm4, xmm3
mulss xmm3, xmm4
mulss xmm3, xmm4
subss xmm3, xmm6
mulss xmm4, xmm7
mulss xmm3, xmm4
#else
rsqrtss xmm3, xmm3
#endif
mulss xmm0, xmm3
mulss xmm1, xmm3
mulss xmm2, xmm3
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2
add eax, DRAWVERT_SIZE
jl loopVert1
done:
}
}
/*
============
idSIMD_SSE::CreateTextureSpaceLightVectors
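
  For every vertex referenced by the indexes, computes the vector from the
  vertex to the light origin and transforms it into the vertex's texture
  (tangent) space: ( tangent[0], tangent[1], normal ).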
============
*/
void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
for ( int i = numIndexes - 1; i >= 0; i-- ) {
used[indexes[i]] = true;
}
#if 0
__asm {
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
mov ecx, lightOrigin
movss xmm7, [ecx+0]
movhps xmm7, [ecx+4]
mov ecx, lightVectors
sub ecx, 3*4
loopVert:
inc eax
jge done
add edi, DRAWVERT_SIZE
add ecx, 3*4
cmp byte ptr [esi+eax], 0
je loopVert
movaps xmm0, xmm7
movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
subps xmm0, xmm1
// 0, X, 1, 2
// 3, X, 4, 5
// 6, X, 7, 8
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
mulps xmm2, xmm0
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
mulps xmm3, xmm0
movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
mulps xmm4, xmm0
movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
addps xmm5, xmm4
addps xmm5, xmm2
movlps [ecx+0], xmm5
shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
movss [ecx+8], xmm5
jmp loopVert
done:
}
#elif 1
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
idVec3 lightDir;
lightDir[0] = lightOrigin[0] - v->xyz[0];
lightDir[1] = lightOrigin[1] - v->xyz[1];
lightDir[2] = lightOrigin[2] - v->xyz[2];
lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
}
#elif 1
ALIGN16( int usedVertNums[4] );
ALIGN16( float lightDir0[4] );
ALIGN16( float lightDir1[4] );
ALIGN16( float lightDir2[4] );
ALIGN16( float normal0[4] );
ALIGN16( float normal1[4] );
ALIGN16( float normal2[4] );
ALIGN16( float tangent0[4] );
ALIGN16( float tangent1[4] );
ALIGN16( float tangent2[4] );
ALIGN16( float tangent3[4] );
ALIGN16( float tangent4[4] );
ALIGN16( float tangent5[4] );
idVec3 localLightOrigin = lightOrigin;
__asm {
xor ecx, ecx
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
loopVert4:
inc eax
jge done4
add edi, DRAWVERT_SIZE
cmp byte ptr [esi+eax], 0
je loopVert4
mov usedVertNums[ecx*4], eax
inc ecx
cmp ecx, 4
movss xmm0, localLightOrigin[0]
movss xmm1, localLightOrigin[4]
movss xmm2, localLightOrigin[8]
subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
movss lightDir0[ecx*4-4], xmm0
movss lightDir1[ecx*4-4], xmm1
movss lightDir2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
movss normal0[ecx*4-4], xmm3
movss normal1[ecx*4-4], xmm4
movss normal2[ecx*4-4], xmm5
movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
movss tangent0[ecx*4-4], xmm0
movss tangent1[ecx*4-4], xmm1
movss tangent2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
movss tangent3[ecx*4-4], xmm3
movss tangent4[ecx*4-4], xmm4
movss tangent5[ecx*4-4], xmm5
jl loopVert4
movaps xmm0, lightDir0
movaps xmm1, lightDir1
movaps xmm2, lightDir2
movaps xmm3, tangent0
mulps xmm3, xmm0
movaps xmm4, tangent1
mulps xmm4, xmm1
movaps xmm5, tangent2
mulps xmm5, xmm2
addps xmm3, xmm4
addps xmm5, xmm3
movaps xmm3, tangent3
mulps xmm3, xmm0
movaps xmm4, tangent4
mulps xmm4, xmm1
movaps xmm6, tangent5
mulps xmm6, xmm2
addps xmm3, xmm4
addps xmm6, xmm3
mulps xmm0, normal0
mulps xmm1, normal1
mulps xmm2, normal2
addps xmm0, xmm1
addps xmm0, xmm2
mov ecx, numVerts
imul ecx, 12
mov edx, usedVertNums[0]
add ecx, lightVectors
imul edx, 12
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[4]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
imul edx, 12
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[8]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
imul edx, 12
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[12]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
imul edx, 12
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
xor ecx, ecx
jmp loopVert4
done4:
test ecx, ecx
jz done
xor eax, eax
mov edi, numVerts
imul edi, 12
add edi, lightVectors
loopVert1:
movss xmm0, lightDir0[eax*4]
movss xmm1, lightDir1[eax*4]
movss xmm2, lightDir2[eax*4]
mov edx, usedVertNums[eax*4]
imul edx, 12
movss xmm3, tangent0[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent1[eax*4]
mulss xmm4, xmm1
movss xmm5, tangent2[eax*4]
mulss xmm5, xmm2
addss xmm3, xmm4
addss xmm5, xmm3
movss [edi+edx+0], xmm5
movss xmm3, tangent3[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent4[eax*4]
mulss xmm4, xmm1
movss xmm6, tangent5[eax*4]
mulss xmm6, xmm2
addss xmm3, xmm4
addss xmm6, xmm3
movss [edi+edx+4], xmm6
mulss xmm0, normal0[eax*4]
mulss xmm1, normal1[eax*4]
mulss xmm2, normal2[eax*4]
addss xmm0, xmm1
addss xmm0, xmm2
movss [edi+edx+8], xmm0
inc eax
dec ecx
jg loopVert1
done:
}
#else
	ALIGN16( int usedVertNums[4] );
	ALIGN16( float lightDir0[4] );
	ALIGN16( float lightDir1[4] );
	ALIGN16( float lightDir2[4] );
	ALIGN16( float normal0[4] );
	ALIGN16( float normal1[4] );
	ALIGN16( float normal2[4] );
	ALIGN16( float tangent0[4] );
	ALIGN16( float tangent1[4] );
	ALIGN16( float tangent2[4] );
	ALIGN16( float tangent3[4] );
	ALIGN16( float tangent4[4] );
	ALIGN16( float tangent5[4] );
	ALIGN16( float lightVectors0[4] );
	ALIGN16( float lightVectors1[4] );
	ALIGN16( float lightVectors2[4] );
int numUsedVerts = 0;
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
normal0[numUsedVerts] = v->normal[0];
normal1[numUsedVerts] = v->normal[1];
normal2[numUsedVerts] = v->normal[2];
tangent0[numUsedVerts] = v->tangents[0][0];
tangent1[numUsedVerts] = v->tangents[0][1];
tangent2[numUsedVerts] = v->tangents[0][2];
tangent3[numUsedVerts] = v->tangents[1][0];
tangent4[numUsedVerts] = v->tangents[1][1];
tangent5[numUsedVerts] = v->tangents[1][2];
usedVertNums[numUsedVerts++] = i;
if ( numUsedVerts < 4 ) {
continue;
}
lightVectors0[0] = lightDir0[0] * tangent0[0];
lightVectors0[1] = lightDir0[1] * tangent0[1];
lightVectors0[2] = lightDir0[2] * tangent0[2];
lightVectors0[3] = lightDir0[3] * tangent0[3];
lightVectors0[0] += lightDir1[0] * tangent1[0];
lightVectors0[1] += lightDir1[1] * tangent1[1];
lightVectors0[2] += lightDir1[2] * tangent1[2];
lightVectors0[3] += lightDir1[3] * tangent1[3];
lightVectors0[0] += lightDir2[0] * tangent2[0];
lightVectors0[1] += lightDir2[1] * tangent2[1];
lightVectors0[2] += lightDir2[2] * tangent2[2];
lightVectors0[3] += lightDir2[3] * tangent2[3];
lightVectors1[0] = lightDir0[0] * tangent3[0];
lightVectors1[1] = lightDir0[1] * tangent3[1];
lightVectors1[2] = lightDir0[2] * tangent3[2];
lightVectors1[3] = lightDir0[3] * tangent3[3];
lightVectors1[0] += lightDir1[0] * tangent4[0];
lightVectors1[1] += lightDir1[1] * tangent4[1];
lightVectors1[2] += lightDir1[2] * tangent4[2];
lightVectors1[3] += lightDir1[3] * tangent4[3];
lightVectors1[0] += lightDir2[0] * tangent5[0];
lightVectors1[1] += lightDir2[1] * tangent5[1];
lightVectors1[2] += lightDir2[2] * tangent5[2];
lightVectors1[3] += lightDir2[3] * tangent5[3];
lightVectors2[0] = lightDir0[0] * normal0[0];
lightVectors2[1] = lightDir0[1] * normal0[1];
lightVectors2[2] = lightDir0[2] * normal0[2];
lightVectors2[3] = lightDir0[3] * normal0[3];
lightVectors2[0] += lightDir1[0] * normal1[0];
lightVectors2[1] += lightDir1[1] * normal1[1];
lightVectors2[2] += lightDir1[2] * normal1[2];
lightVectors2[3] += lightDir1[3] * normal1[3];
lightVectors2[0] += lightDir2[0] * normal2[0];
lightVectors2[1] += lightDir2[1] * normal2[1];
lightVectors2[2] += lightDir2[2] * normal2[2];
lightVectors2[3] += lightDir2[3] * normal2[3];
for ( int j = 0; j < 4; j++ ) {
int n = usedVertNums[j];
lightVectors[n][0] = lightVectors0[j];
lightVectors[n][1] = lightVectors1[j];
lightVectors[n][2] = lightVectors2[j];
}
numUsedVerts = 0;
}
for ( int i = 0; i < numUsedVerts; i++ ) {
lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
int n = usedVertNums[i];
lightVectors[n][0] = lightVectors0[i];
lightVectors[n][1] = lightVectors1[i];
lightVectors[n][2] = lightVectors2[i];
}
#endif
}
/*
============
idSIMD_SSE::CreateSpecularTextureCoords
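
  For every vertex referenced by the indexes, adds the normalized
  vertex-to-light and vertex-to-view directions to get a half angle vector
  and transforms it into the vertex's texture (tangent) space.
  The w coordinate is set to 1.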
============
*/
void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
for ( int i = numIndexes - 1; i >= 0; i-- ) {
used[indexes[i]] = true;
}
#if 0
__asm {
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
mov ecx, viewOrigin
movss xmm6, [ecx+0]
movhps xmm6, [ecx+4]
mov ecx, lightOrigin
movss xmm7, [ecx+0]
movhps xmm7, [ecx+4]
mov ecx, texCoords
sub ecx, 4*4
loopVert:
inc eax
jge done
add edi, DRAWVERT_SIZE
add ecx, 4*4
cmp byte ptr [esi+eax], 0
je loopVert
movaps xmm0, xmm7
movaps xmm1, xmm6
movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
subps xmm0, xmm2
subps xmm1, xmm2
movaps xmm3, xmm0
movaps xmm4, xmm1
mulps xmm3, xmm3
mulps xmm4, xmm4
// 0, X, 1, 2
// 3, X, 4, 5
movaps xmm5, xmm3 // xmm5 = 0, X, 1, 2
unpcklps xmm5, xmm4 // xmm5 = 0, 3, X, X
unpckhps xmm3, xmm4 // xmm3 = 1, 4, 2, 5
movhlps xmm4, xmm3 // xmm4 = 2, 5, 4, 5
addps xmm5, xmm3
addps xmm5, xmm4
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
rsqrtps xmm5, xmm5
movaps xmm4, xmm5
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm0, xmm4
mulps xmm1, xmm5
addps xmm0, xmm1
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
mulps xmm2, xmm0
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
mulps xmm3, xmm0
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
mulps xmm4, xmm0
movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
movaps xmm3, SIMD_SP_one
addps xmm5, xmm4
addps xmm5, xmm2
movaps [ecx+0], xmm5
movss [ecx+12], xmm3
jmp loopVert
done:
}
#elif 0
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
idVec3 lightDir = lightOrigin - v->xyz;
idVec3 viewDir = viewOrigin - v->xyz;
float ilength;
ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
lightDir[0] *= ilength;
lightDir[1] *= ilength;
lightDir[2] *= ilength;
ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
viewDir[0] *= ilength;
viewDir[1] *= ilength;
viewDir[2] *= ilength;
lightDir += viewDir;
texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
texCoords[i][3] = 1.0f;
}
#elif 1
ALIGN16( int usedVertNums[4] );
ALIGN16( float lightDir0[4] );
ALIGN16( float lightDir1[4] );
ALIGN16( float lightDir2[4] );
ALIGN16( float viewDir0[4] );
ALIGN16( float viewDir1[4] );
ALIGN16( float viewDir2[4] );
ALIGN16( float normal0[4] );
ALIGN16( float normal1[4] );
ALIGN16( float normal2[4] );
ALIGN16( float tangent0[4] );
ALIGN16( float tangent1[4] );
ALIGN16( float tangent2[4] );
ALIGN16( float tangent3[4] );
ALIGN16( float tangent4[4] );
ALIGN16( float tangent5[4] );
idVec3 localLightOrigin = lightOrigin;
idVec3 localViewOrigin = viewOrigin;
__asm {
xor ecx, ecx
mov eax, numVerts
mov esi, used
add esi, eax
mov edi, verts
sub edi, DRAWVERT_SIZE
neg eax
dec eax
loopVert4:
inc eax
jge done4
add edi, DRAWVERT_SIZE
cmp byte ptr [esi+eax], 0
je loopVert4
mov usedVertNums[ecx*4], eax
inc ecx
cmp ecx, 4
movss xmm3, localLightOrigin[0]
movss xmm4, localLightOrigin[4]
movss xmm5, localLightOrigin[8]
subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
movss lightDir0[ecx*4-4], xmm3
movss lightDir1[ecx*4-4], xmm4
movss lightDir2[ecx*4-4], xmm5
movss xmm0, localViewOrigin[0]
movss xmm1, localViewOrigin[4]
movss xmm2, localViewOrigin[8]
subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
movss viewDir0[ecx*4-4], xmm0
movss viewDir1[ecx*4-4], xmm1
movss viewDir2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
movss normal0[ecx*4-4], xmm3
movss normal1[ecx*4-4], xmm4
movss normal2[ecx*4-4], xmm5
movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
movss tangent0[ecx*4-4], xmm0
movss tangent1[ecx*4-4], xmm1
movss tangent2[ecx*4-4], xmm2
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
movss tangent3[ecx*4-4], xmm3
movss tangent4[ecx*4-4], xmm4
movss tangent5[ecx*4-4], xmm5
jl loopVert4
movaps xmm6, lightDir0
movaps xmm0, xmm6
mulps xmm6, xmm6
movaps xmm7, lightDir1
movaps xmm1, xmm7
mulps xmm7, xmm7
addps xmm6, xmm7
movaps xmm5, lightDir2
movaps xmm2, xmm5
mulps xmm5, xmm5
addps xmm6, xmm5
rsqrtps xmm6, xmm6
mulps xmm0, xmm6
mulps xmm1, xmm6
mulps xmm2, xmm6
movaps xmm3, viewDir0
movaps xmm7, xmm3
mulps xmm7, xmm7
movaps xmm4, viewDir1
movaps xmm6, xmm4
mulps xmm6, xmm6
addps xmm7, xmm6
movaps xmm5, viewDir2
movaps xmm6, xmm5
mulps xmm6, xmm6
addps xmm7, xmm6
rsqrtps xmm7, xmm7
mulps xmm3, xmm7
addps xmm0, xmm3
mulps xmm4, xmm7
addps xmm1, xmm4
mulps xmm5, xmm7
addps xmm2, xmm5
movaps xmm3, tangent0
mulps xmm3, xmm0
movaps xmm4, tangent1
mulps xmm4, xmm1
addps xmm3, xmm4
movaps xmm5, tangent2
mulps xmm5, xmm2
addps xmm5, xmm3
movaps xmm3, tangent3
mulps xmm3, xmm0
movaps xmm4, tangent4
mulps xmm4, xmm1
addps xmm3, xmm4
movaps xmm6, tangent5
mulps xmm6, xmm2
addps xmm6, xmm3
mulps xmm0, normal0
mulps xmm1, normal1
addps xmm0, xmm1
mulps xmm2, normal2
addps xmm0, xmm2
mov ecx, numVerts
shl ecx, 4
mov edx, usedVertNums[0]
add ecx, texCoords
shl edx, 4
movss xmm3, SIMD_SP_one
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[4]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shl edx, 4
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[8]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shl edx, 4
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
mov edx, usedVertNums[12]
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
shl edx, 4
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
movss [ecx+edx+0], xmm5
movss [ecx+edx+4], xmm6
movss [ecx+edx+8], xmm0
movss [ecx+edx+12], xmm3
xor ecx, ecx
jmp loopVert4
done4:
test ecx, ecx
jz done
xor eax, eax
mov edi, numVerts
shl edi, 4
add edi, texCoords
loopVert1:
movss xmm6, lightDir0[eax*4]
movss xmm0, xmm6
mulss xmm6, xmm6
movss xmm7, lightDir1[eax*4]
movss xmm1, xmm7
mulss xmm7, xmm7
addss xmm6, xmm7
movss xmm5, lightDir2[eax*4]
movss xmm2, xmm5
mulss xmm5, xmm5
addss xmm6, xmm5
rsqrtss xmm6, xmm6
mulss xmm0, xmm6
mulss xmm1, xmm6
mulss xmm2, xmm6
movss xmm3, viewDir0[eax*4]
movss xmm7, xmm3
mulss xmm7, xmm7
movss xmm4, viewDir1[eax*4]
movss xmm6, xmm4
mulss xmm6, xmm6
addss xmm7, xmm6
movss xmm5, viewDir2[eax*4]
movss xmm6, xmm5
mulss xmm6, xmm6
addss xmm7, xmm6
rsqrtss xmm7, xmm7
mulss xmm3, xmm7
addss xmm0, xmm3
mulss xmm4, xmm7
addss xmm1, xmm4
mulss xmm5, xmm7
addss xmm2, xmm5
mov edx, usedVertNums[eax*4]
shl edx, 4
movss xmm3, tangent0[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent1[eax*4]
mulss xmm4, xmm1
addss xmm3, xmm4
movss xmm5, tangent2[eax*4]
mulss xmm5, xmm2
addss xmm5, xmm3
movss [edi+edx+0], xmm5
movss xmm3, tangent3[eax*4]
mulss xmm3, xmm0
movss xmm4, tangent4[eax*4]
mulss xmm4, xmm1
addss xmm3, xmm4
movss xmm6, tangent5[eax*4]
mulss xmm6, xmm2
addss xmm6, xmm3
movss [edi+edx+4], xmm6
mulss xmm0, normal0[eax*4]
mulss xmm1, normal1[eax*4]
addss xmm0, xmm1
mulss xmm2, normal2[eax*4]
addss xmm0, xmm2
movss [edi+edx+8], xmm0
movss xmm3, SIMD_SP_one
movss [edi+edx+12], xmm3
inc eax
dec ecx
jg loopVert1
done:
}
#else
ALIGN16( int usedVertNums[4] );
ALIGN16( float lightDir0[4] );
ALIGN16( float lightDir1[4] );
ALIGN16( float lightDir2[4] );
ALIGN16( float viewDir0[4] );
ALIGN16( float viewDir1[4] );
ALIGN16( float viewDir2[4] );
ALIGN16( float normal0[4] );
ALIGN16( float normal1[4] );
ALIGN16( float normal2[4] );
ALIGN16( float tangent0[4] );
ALIGN16( float tangent1[4] );
ALIGN16( float tangent2[4] );
ALIGN16( float tangent3[4] );
ALIGN16( float tangent4[4] );
ALIGN16( float tangent5[4] );
ALIGN16( float texCoords0[4] );
ALIGN16( float texCoords1[4] );
ALIGN16( float texCoords2[4] );
idVec3 localLightOrigin = lightOrigin;
idVec3 localViewOrigin = viewOrigin;
int numUsedVerts = 0;
for ( int i = 0; i < numVerts; i++ ) {
if ( !used[i] ) {
continue;
}
const idDrawVert *v = &verts[i];
lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];
viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];
normal0[numUsedVerts] = v->normal[0];
normal1[numUsedVerts] = v->normal[1];
normal2[numUsedVerts] = v->normal[2];
tangent0[numUsedVerts] = v->tangents[0][0];
tangent1[numUsedVerts] = v->tangents[0][1];
tangent2[numUsedVerts] = v->tangents[0][2];
tangent3[numUsedVerts] = v->tangents[1][0];
tangent4[numUsedVerts] = v->tangents[1][1];
tangent5[numUsedVerts] = v->tangents[1][2];
usedVertNums[numUsedVerts++] = i;
if ( numUsedVerts < 4 ) {
continue;
}
ALIGN16( float temp[4] );
temp[0] = lightDir0[0] * lightDir0[0];
temp[1] = lightDir0[1] * lightDir0[1];
temp[2] = lightDir0[2] * lightDir0[2];
temp[3] = lightDir0[3] * lightDir0[3];
temp[0] += lightDir1[0] * lightDir1[0];
temp[1] += lightDir1[1] * lightDir1[1];
temp[2] += lightDir1[2] * lightDir1[2];
temp[3] += lightDir1[3] * lightDir1[3];
temp[0] += lightDir2[0] * lightDir2[0];
temp[1] += lightDir2[1] * lightDir2[1];
temp[2] += lightDir2[2] * lightDir2[2];
temp[3] += lightDir2[3] * lightDir2[3];
temp[0] = idMath::RSqrt( temp[0] );
temp[1] = idMath::RSqrt( temp[1] );
temp[2] = idMath::RSqrt( temp[2] );
temp[3] = idMath::RSqrt( temp[3] );
lightDir0[0] *= temp[0];
lightDir0[1] *= temp[1];
lightDir0[2] *= temp[2];
lightDir0[3] *= temp[3];
lightDir1[0] *= temp[0];
lightDir1[1] *= temp[1];
lightDir1[2] *= temp[2];
lightDir1[3] *= temp[3];
lightDir2[0] *= temp[0];
lightDir2[1] *= temp[1];
lightDir2[2] *= temp[2];
lightDir2[3] *= temp[3];
temp[0] = viewDir0[0] * viewDir0[0];
temp[1] = viewDir0[1] * viewDir0[1];
temp[2] = viewDir0[2] * viewDir0[2];
temp[3] = viewDir0[3] * viewDir0[3];
temp[0] += viewDir1[0] * viewDir1[0];
temp[1] += viewDir1[1] * viewDir1[1];
temp[2] += viewDir1[2] * viewDir1[2];
temp[3] += viewDir1[3] * viewDir1[3];
temp[0] += viewDir2[0] * viewDir2[0];
temp[1] += viewDir2[1] * viewDir2[1];
temp[2] += viewDir2[2] * viewDir2[2];
temp[3] += viewDir2[3] * viewDir2[3];
temp[0] = idMath::RSqrt( temp[0] );
temp[1] = idMath::RSqrt( temp[1] );
temp[2] = idMath::RSqrt( temp[2] );
temp[3] = idMath::RSqrt( temp[3] );
viewDir0[0] *= temp[0];
viewDir0[1] *= temp[1];
viewDir0[2] *= temp[2];
viewDir0[3] *= temp[3];
viewDir1[0] *= temp[0];
viewDir1[1] *= temp[1];
viewDir1[2] *= temp[2];
viewDir1[3] *= temp[3];
viewDir2[0] *= temp[0];
viewDir2[1] *= temp[1];
viewDir2[2] *= temp[2];
viewDir2[3] *= temp[3];
lightDir0[0] += viewDir0[0];
lightDir0[1] += viewDir0[1];
lightDir0[2] += viewDir0[2];
lightDir0[3] += viewDir0[3];
lightDir1[0] += viewDir1[0];
lightDir1[1] += viewDir1[1];
lightDir1[2] += viewDir1[2];
lightDir1[3] += viewDir1[3];
lightDir2[0] += viewDir2[0];
lightDir2[1] += viewDir2[1];
lightDir2[2] += viewDir2[2];
lightDir2[3] += viewDir2[3];
texCoords0[0] = lightDir0[0] * tangent0[0];
texCoords0[1] = lightDir0[1] * tangent0[1];
texCoords0[2] = lightDir0[2] * tangent0[2];
texCoords0[3] = lightDir0[3] * tangent0[3];
texCoords0[0] += lightDir1[0] * tangent1[0];
texCoords0[1] += lightDir1[1] * tangent1[1];
texCoords0[2] += lightDir1[2] * tangent1[2];
texCoords0[3] += lightDir1[3] * tangent1[3];
texCoords0[0] += lightDir2[0] * tangent2[0];
texCoords0[1] += lightDir2[1] * tangent2[1];
texCoords0[2] += lightDir2[2] * tangent2[2];
texCoords0[3] += lightDir2[3] * tangent2[3];
texCoords1[0] = lightDir0[0] * tangent3[0];
texCoords1[1] = lightDir0[1] * tangent3[1];
texCoords1[2] = lightDir0[2] * tangent3[2];
texCoords1[3] = lightDir0[3] * tangent3[3];
texCoords1[0] += lightDir1[0] * tangent4[0];
texCoords1[1] += lightDir1[1] * tangent4[1];
texCoords1[2] += lightDir1[2] * tangent4[2];
texCoords1[3] += lightDir1[3] * tangent4[3];
texCoords1[0] += lightDir2[0] * tangent5[0];
texCoords1[1] += lightDir2[1] * tangent5[1];
texCoords1[2] += lightDir2[2] * tangent5[2];
texCoords1[3] += lightDir2[3] * tangent5[3];
texCoords2[0] = lightDir0[0] * normal0[0];
texCoords2[1] = lightDir0[1] * normal0[1];
texCoords2[2] = lightDir0[2] * normal0[2];
texCoords2[3] = lightDir0[3] * normal0[3];
texCoords2[0] += lightDir1[0] * normal1[0];
texCoords2[1] += lightDir1[1] * normal1[1];
texCoords2[2] += lightDir1[2] * normal1[2];
texCoords2[3] += lightDir1[3] * normal1[3];
texCoords2[0] += lightDir2[0] * normal2[0];
texCoords2[1] += lightDir2[1] * normal2[1];
texCoords2[2] += lightDir2[2] * normal2[2];
texCoords2[3] += lightDir2[3] * normal2[3];
for ( int j = 0; j < 4; j++ ) {
int n = usedVertNums[j];
texCoords[n][0] = texCoords0[j];
texCoords[n][1] = texCoords1[j];
texCoords[n][2] = texCoords2[j];
texCoords[n][3] = 1.0f;
}
numUsedVerts = 0;
}
for ( int i = 0; i < numUsedVerts; i++ ) {
float temp;
temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
temp = idMath::RSqrt( temp );
lightDir0[i] *= temp;
lightDir1[i] *= temp;
lightDir2[i] *= temp;
temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
temp = idMath::RSqrt( temp );
viewDir0[i] *= temp;
viewDir1[i] *= temp;
viewDir2[i] *= temp;
lightDir0[i] += viewDir0[i];
lightDir1[i] += viewDir1[i];
lightDir2[i] += viewDir2[i];
texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
int n = usedVertNums[i];
		texCoords[n][0] = texCoords0[i];
		texCoords[n][1] = texCoords1[i];
		texCoords[n][2] = texCoords2[i];
texCoords[n][3] = 1.0f;
}
#endif
}
/*
============
idSIMD_SSE::CreateShadowCache
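
  For every vertex with a zero vertRemap entry, writes two entries to the
  vertex cache: the vertex itself with w = 1 and the vertex minus the light
  origin with w = 0, and stores the cache index in vertRemap.
  Returns the number of cache entries written.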
============
*/
int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
#if 1
int outVerts;
__asm {
push ebx
mov esi, lightOrigin
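		// SIMD_SP_lastOne = ( 0, 0, 0, 1.0f ); OR'ing it in sets the w component to 1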
movaps xmm5, SIMD_SP_lastOne
movss xmm6, [esi+0]
movhps xmm6, [esi+4]
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
orps xmm6, SIMD_SP_lastOne
movaps xmm7, xmm6
xor ebx, ebx
xor ecx, ecx
mov edx, vertRemap
mov esi, verts
mov edi, vertexCache
mov eax, numVerts
and eax, ~3
jz done4
shl eax, 2
add edx, eax
neg eax
loop4:
prefetchnta [edx+128]
prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
cmp dword ptr [edx+eax+0], ebx
jne skip1
mov dword ptr [edx+eax+0], ecx
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
add ecx, 2
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
orps xmm0, xmm5
movaps [edi+0*16], xmm0
subps xmm0, xmm6
movaps [edi+1*16], xmm0
add edi, 2*16
skip1:
cmp dword ptr [edx+eax+4], ebx
jne skip2
mov dword ptr [edx+eax+4], ecx
movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
add ecx, 2
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
orps xmm1, xmm5
movaps [edi+0*16], xmm1
subps xmm1, xmm7
movaps [edi+1*16], xmm1
add edi, 2*16
skip2:
cmp dword ptr [edx+eax+8], ebx
jne skip3
mov dword ptr [edx+eax+8], ecx
movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
add ecx, 2
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
orps xmm2, xmm5
movaps [edi+0*16], xmm2
subps xmm2, xmm6
movaps [edi+1*16], xmm2
add edi, 2*16
skip3:
cmp dword ptr [edx+eax+12], ebx
jne skip4
mov dword ptr [edx+eax+12], ecx
movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
add ecx, 2
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
orps xmm3, xmm5
movaps [edi+0*16], xmm3
subps xmm3, xmm7
movaps [edi+1*16], xmm3
add edi, 2*16
skip4:
add esi, 4*DRAWVERT_SIZE
add eax, 4*4
jl loop4
done4:
mov eax, numVerts
and eax, 3
jz done1
shl eax, 2
add edx, eax
neg eax
loop1:
cmp dword ptr [edx+eax+0], ebx
jne skip0
mov dword ptr [edx+eax+0], ecx
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
add ecx, 2
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
orps xmm0, xmm5
movaps [edi+0*16], xmm0
subps xmm0, xmm6
movaps [edi+1*16], xmm0
add edi, 2*16
skip0:
add esi, DRAWVERT_SIZE
add eax, 4
jl loop1
done1:
pop ebx
mov outVerts, ecx
}
return outVerts;
#else
int outVerts = 0;
for ( int i = 0; i < numVerts; i++ ) {
if ( vertRemap[i] ) {
continue;
}
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[outVerts+0][0] = v[0];
vertexCache[outVerts+0][1] = v[1];
vertexCache[outVerts+0][2] = v[2];
vertexCache[outVerts+0][3] = 1.0f;
// R_SetupProjection() builds the projection matrix with a slight crunch
// for depth, which keeps this w=0 division from rasterizing right at the
// wrap around point and causing depth fighting with the rear caps
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
vertexCache[outVerts+1][3] = 0.0f;
vertRemap[i] = outVerts;
outVerts += 2;
}
return outVerts;
#endif
}
/*
============
idSIMD_SSE::CreateVertexProgramShadowCache
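
  Writes every vertex to the cache twice: once with w = 1 and once with w = 0,
  leaving the projection away from the light to a vertex program.
  Returns numVerts * 2.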
============
*/
int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
#if 1
__asm {
movaps xmm4, SIMD_SP_lastOne
movaps xmm5, xmm4
movaps xmm6, xmm4
movaps xmm7, xmm4
mov esi, verts
mov edi, vertexCache
mov eax, numVerts
and eax, ~3
jz done4
shl eax, 5
add edi, eax
neg eax
loop4:
prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps [edi+eax+1*16], xmm0
orps xmm0, xmm4
movaps [edi+eax+0*16], xmm0
movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
movaps [edi+eax+3*16], xmm1
orps xmm1, xmm5
movaps [edi+eax+2*16], xmm1
movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps [edi+eax+5*16], xmm2
orps xmm2, xmm6
movaps [edi+eax+4*16], xmm2
movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
movaps [edi+eax+7*16], xmm3
orps xmm3, xmm7
movaps [edi+eax+6*16], xmm3
add esi, 4*DRAWVERT_SIZE
add eax, 4*8*4
jl loop4
done4:
mov eax, numVerts
and eax, 3
jz done1
shl eax, 5
add edi, eax
neg eax
loop1:
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps [edi+eax+1*16], xmm0
orps xmm0, xmm4
movaps [edi+eax+0*16], xmm0
add esi, DRAWVERT_SIZE
add eax, 8*4
jl loop1
done1:
}
return numVerts * 2;
#else
for ( int i = 0; i < numVerts; i++ ) {
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[i*2+0][0] = v[0];
vertexCache[i*2+0][1] = v[1];
vertexCache[i*2+0][2] = v[2];
vertexCache[i*2+0][3] = 1.0f;
vertexCache[i*2+1][0] = v[0];
vertexCache[i*2+1][1] = v[1];
vertexCache[i*2+1][2] = v[2];
vertexCache[i*2+1][3] = 0.0f;
}
return numVerts * 2;
#endif
}
/*
============
SSE_UpSample11kHzMonoPCMTo44kHz
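
  Converts each 16 bit sample to a float and writes it four times.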
============
*/
static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 2*4*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-2*4*4+0], xmm0
movhps [edi-2*4*4+8], xmm0
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-1*4*4+0], xmm1
movhps [edi-1*4*4+8], xmm1
add eax, 2*2
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movsx ecx, word ptr [esi]
cvtsi2ss xmm0, ecx
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi+0], xmm0
movhps [edi+8], xmm0
done:
}
}
/*
============
SSE_UpSample11kHzStereoPCMTo44kHz
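
  Converts each 16 bit ( left, right ) sample pair to floats and writes the pair four times.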
============
*/
static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
test eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 8*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
unpcklps xmm0, xmm1
movlps [edi-8*4+0], xmm0
movlps [edi-8*4+8], xmm0
movlps [edi-4*4+0], xmm0
movlps [edi-4*4+8], xmm0
add eax, 2*2
jl loop2
done2:
}
}
/*
============
SSE_UpSample22kHzMonoPCMTo44kHz
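
  Converts each 16 bit sample to a float and writes it twice.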
============
*/
static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 4*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-4*4+0], xmm0
movhps [edi-4*4+8], xmm0
add eax, 2*2
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movsx ecx, word ptr [esi]
cvtsi2ss xmm0, ecx
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi], xmm0
done:
}
}
/*
============
SSE_UpSample22kHzStereoPCMTo44kHz
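
  Converts each 16 bit ( left, right ) sample pair to floats and writes the pair twice.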
============
*/
static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
test eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 4*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movss [edi-4*4], xmm0
movss [edi-2*4], xmm0
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
movss [edi-3*4], xmm1
movss [edi-1*4], xmm1
add eax, 2*2
jl loop2
done2:
}
}
/*
============
SSE_UpSample44kHzMonoPCMTo44kHz
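
  Straight 1:1 conversion from 16 bit samples to floats.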
============
*/
static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
__asm {
mov esi, src
mov edi, dest
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add esi, eax
neg eax
align 16
loop2:
add edi, 2*4
movsx ecx, word ptr [esi+eax+0]
cvtsi2ss xmm0, ecx
movss [edi-2*4], xmm0
movsx edx, word ptr [esi+eax+2]
cvtsi2ss xmm1, edx
movss [edi-1*4], xmm1
add eax, 2*2
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movsx ecx, word ptr [esi]
cvtsi2ss xmm0, ecx
movss [edi], xmm0
done:
}
}
/*
============
idSIMD_SSE::UpSamplePCMTo44kHz
Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
if ( kHz == 11025 ) {
if ( numChannels == 1 ) {
SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
} else {
SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
}
} else if ( kHz == 22050 ) {
if ( numChannels == 1 ) {
SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
} else {
SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
}
} else if ( kHz == 44100 ) {
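		// the 44kHz conversion is 1:1, so the same routine works for any channel count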
SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
} else {
assert( 0 );
}
}
/*
============
SSE_UpSample11kHzMonoOGGTo44kHz
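
  Scales each decoded sample by 32768 and writes it four times.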
============
*/
static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 2
add esi, eax
neg eax
align 16
loop2:
add edi, 2*16
movss xmm0, [esi+eax+0]
mulss xmm0, xmm7
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-32], xmm0
movlps [edi-24], xmm0
movss xmm1, [esi+eax+4]
mulss xmm1, xmm7
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi-16], xmm1
movlps [edi- 8], xmm1
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [esi]
mulss xmm0, xmm7
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi+0], xmm0
movlps [edi+8], xmm0
done:
}
}
/*
============
SSE_UpSample11kHzStereoOGGTo44kHz
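
  Interleaves the two decoded channels, scales by 32768 and writes each sample pair four times.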
============
*/
static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov ecx, [esi+0]
mov edx, [esi+4]
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add ecx, eax
add edx, eax
neg eax
align 16
loop2:
add edi, 4*16
movlps xmm0, [ecx+eax]
movlps xmm1, [edx+eax]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi-8*8], xmm0
movlps [edi-7*8], xmm0
movlps [edi-6*8], xmm0
movlps [edi-5*8], xmm0
movhps [edi-4*8], xmm0
movhps [edi-3*8], xmm0
movhps [edi-2*8], xmm0
movhps [edi-1*8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [ecx]
movss xmm1, [edx]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi+0*8], xmm0
movlps [edi+1*8], xmm0
movlps [edi+2*8], xmm0
movlps [edi+3*8], xmm0
done:
}
}
/*
============
SSE_UpSample22kHzMonoOGGTo44kHz
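
  Scales each decoded sample by 32768 and writes it twice.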
============
*/
static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 2
add esi, eax
neg eax
align 16
loop2:
add edi, 2*8
movss xmm0, [esi+eax+0]
movss xmm1, [esi+eax+4]
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm0, xmm7
movlps [edi-16], xmm0
movhps [edi- 8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [esi]
mulss xmm0, xmm7
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
movlps [edi+0], xmm0
done:
}
}
/*
============
SSE_UpSample22kHzStereoOGGTo44kHz
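
  Interleaves the two decoded channels, scales by 32768 and writes each sample pair twice.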
============
*/
static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov ecx, [esi+0]
mov edx, [esi+4]
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add ecx, eax
add edx, eax
neg eax
align 16
loop2:
add edi, 2*16
movlps xmm0, [ecx+eax]
movlps xmm1, [edx+eax]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi-4*8], xmm0
movlps [edi-3*8], xmm0
movhps [edi-2*8], xmm0
movhps [edi-1*8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [ecx]
movss xmm1, [edx]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi+0*8], xmm0
movlps [edi+1*8], xmm0
done:
}
}
/*
============
SSE_UpSample44kHzMonoOGGTo44kHz
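
  Scales the decoded samples by 32768 ( 1:1, no duplication ).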
============
*/
static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
float constant = 32768.0f;
KFLOAT_CA( mul, dest, src, constant, numSamples )
}
/*
============
SSE_UpSample44kHzStereoOGGTo44kHz
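
  Interleaves the two decoded channels and scales by 32768 ( 1:1, no duplication ).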
============
*/
static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
float constant = 32768.0f;
__asm {
mov esi, src
mov ecx, [esi+0]
mov edx, [esi+4]
mov edi, dest
movss xmm7, constant
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
mov eax, numSamples
and eax, ~1
jz done2
shl eax, 1
add ecx, eax
add edx, eax
neg eax
align 16
loop2:
add edi, 16
movlps xmm0, [ecx+eax]
movlps xmm1, [edx+eax]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi-2*8], xmm0
movhps [edi-1*8], xmm0
add eax, 2*4
jl loop2
done2:
mov eax, numSamples
and eax, 1
jz done
movss xmm0, [ecx]
movss xmm1, [edx]
unpcklps xmm0, xmm1
mulps xmm0, xmm7
movlps [edi+0*8], xmm0
done:
}
}
/*
============
idSIMD_SSE::UpSampleOGGTo44kHz
Duplicate samples for 44kHz output.
============
*/
void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
if ( kHz == 11025 ) {
if ( numChannels == 1 ) {
SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
} else {
SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
}
} else if ( kHz == 22050 ) {
if ( numChannels == 1 ) {
SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
} else {
SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
}
} else if ( kHz == 44100 ) {
if ( numChannels == 1 ) {
SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
} else {
SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
}
} else {
assert( 0 );
}
}
/*
============
idSIMD_SSE::MixSoundTwoSpeakerMono
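
  Mixes a mono sample buffer into an interleaved stereo mix buffer. The left
  and right volumes are linearly ramped from lastV to currentV over the buffer.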
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
ALIGN16( float incs[2] );
assert( numSamples == MIXBUFFER_SAMPLES );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add esi, eax
neg eax
mov ecx, lastV
movlps xmm6, [ecx]
xorps xmm7, xmm7
movhps xmm7, incs
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
addps xmm6, xmm7
shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
addps xmm7, xmm7
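		// xmm6 = ramped ( L, R, L, R ) volumes for two output frames, xmm7 = per-iteration volume increment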
loop16:
add edi, 4*4*4
movaps xmm0, [esi+eax+0*4*4]
movaps xmm1, xmm0
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
mulps xmm0, xmm6
addps xmm0, [edi-4*4*4]
addps xmm6, xmm7
movaps [edi-4*4*4], xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
mulps xmm1, xmm6
addps xmm1, [edi-3*4*4]
addps xmm6, xmm7
movaps [edi-3*4*4], xmm1
movaps xmm2, [esi+eax+1*4*4]
movaps xmm3, xmm2
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
mulps xmm2, xmm6
addps xmm2, [edi-2*4*4]
addps xmm6, xmm7
movaps [edi-2*4*4], xmm2
shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
mulps xmm3, xmm6
addps xmm3, [edi-1*4*4]
addps xmm6, xmm7
movaps [edi-1*4*4], xmm3
add eax, 2*4*4
jl loop16
}
#else
int i;
float incL;
float incR;
float sL0, sL1;
float sR0, sR1;
assert( numSamples == MIXBUFFER_SAMPLES );
incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sR0 = lastV[1];
sL1 = lastV[0] + incL;
sR1 = lastV[1] + incR;
incL *= 2;
incR *= 2;
for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
mixBuffer[i*2+0] += samples[i+0] * sL0;
mixBuffer[i*2+1] += samples[i+0] * sR0;
mixBuffer[i*2+2] += samples[i+1] * sL1;
mixBuffer[i*2+3] += samples[i+1] * sR1;
sL0 += incL;
sR0 += incR;
sL1 += incL;
sR1 += incR;
}
#endif
}
/*
============
idSIMD_SSE::MixSoundTwoSpeakerStereo
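
  Mixes an interleaved stereo sample buffer into an interleaved stereo mix
  buffer with the volumes linearly ramped from lastV to currentV over the buffer.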
============
*/
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
#if 1
ALIGN16( float incs[2] );
assert( numSamples == MIXBUFFER_SAMPLES );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 3
add esi, eax
neg eax
mov ecx, lastV
movlps xmm6, [ecx]
xorps xmm7, xmm7
movhps xmm7, incs
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
addps xmm6, xmm7
shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
addps xmm7, xmm7
loop16:
add edi, 4*4*4
movaps xmm0, [esi+eax+0*4*4]
mulps xmm0, xmm6
addps xmm0, [edi-4*4*4]
addps xmm6, xmm7
movaps [edi-4*4*4], xmm0
movaps xmm2, [esi+eax+1*4*4]
mulps xmm2, xmm6
addps xmm2, [edi-3*4*4]
addps xmm6, xmm7
movaps [edi-3*4*4], xmm2
movaps xmm3, [esi+eax+2*4*4]
mulps xmm3, xmm6
addps xmm3, [edi-2*4*4]
addps xmm6, xmm7
movaps [edi-2*4*4], xmm3
movaps xmm4, [esi+eax+3*4*4]
mulps xmm4, xmm6
addps xmm4, [edi-1*4*4]
addps xmm6, xmm7
movaps [edi-1*4*4], xmm4
add eax, 4*4*4
jl loop16
}
#else
int i;
float incL;
float incR;
float sL0, sL1;
float sR0, sR1;
assert( numSamples == MIXBUFFER_SAMPLES );
incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sR0 = lastV[1];
sL1 = lastV[0] + incL;
sR1 = lastV[1] + incR;
incL *= 2;
incR *= 2;
for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
mixBuffer[i*2+0] += samples[i*2+0] * sL0;
mixBuffer[i*2+1] += samples[i*2+1] * sR0;
mixBuffer[i*2+2] += samples[i*2+2] * sL1;
mixBuffer[i*2+3] += samples[i*2+3] * sR1;
sL0 += incL;
sR0 += incR;
sL1 += incL;
sR1 += incR;
}
#endif
}
/*
============
idSIMD_SSE::MixSoundSixSpeakerMono
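
  Mixes a mono sample buffer into a 6 channel interleaved mix buffer with each
  channel volume linearly ramped from lastV to currentV over the buffer.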
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
ALIGN16( float incs[6] );
assert( numSamples == MIXBUFFER_SAMPLES );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add esi, eax
neg eax
mov ecx, lastV
movlps xmm2, [ecx+ 0]
movhps xmm2, [ecx+ 8]
movlps xmm3, [ecx+16]
movaps xmm4, xmm2
shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
xorps xmm5, xmm5
movhps xmm5, incs
movlps xmm7, incs+8
movhps xmm7, incs+16
addps xmm3, xmm5
addps xmm4, xmm7
shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps xmm6, xmm7
shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm5, xmm5
addps xmm6, xmm6
addps xmm7, xmm7
loop24:
add edi, 6*16
movaps xmm0, [esi+eax]
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
mulps xmm1, xmm2
addps xmm1, [edi-6*16]
addps xmm2, xmm5
movaps [edi-6*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
mulps xmm1, xmm3
addps xmm1, [edi-5*16]
addps xmm3, xmm6
movaps [edi-5*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
mulps xmm1, xmm4
addps xmm1, [edi-4*16]
addps xmm4, xmm7
movaps [edi-4*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
mulps xmm1, xmm2
addps xmm1, [edi-3*16]
addps xmm2, xmm5
movaps [edi-3*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
mulps xmm1, xmm3
addps xmm1, [edi-2*16]
addps xmm3, xmm6
movaps [edi-2*16], xmm1
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
mulps xmm0, xmm4
addps xmm0, [edi-1*16]
addps xmm4, xmm7
movaps [edi-1*16], xmm0
add eax, 4*4
jl loop24
}
#else
int i;
float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
float incL0, incL1, incL2, incL3, incL4, incL5;
assert( numSamples == MIXBUFFER_SAMPLES );
incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sL1 = lastV[1];
sL2 = lastV[2];
sL3 = lastV[3];
sL4 = lastV[4];
sL5 = lastV[5];
sL6 = lastV[0] + incL0;
sL7 = lastV[1] + incL1;
sL8 = lastV[2] + incL2;
sL9 = lastV[3] + incL3;
sL10 = lastV[4] + incL4;
sL11 = lastV[5] + incL5;
incL0 *= 2;
incL1 *= 2;
incL2 *= 2;
incL3 *= 2;
incL4 *= 2;
incL5 *= 2;
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
mixBuffer[i*6+ 0] += samples[i+0] * sL0;
mixBuffer[i*6+ 1] += samples[i+0] * sL1;
mixBuffer[i*6+ 2] += samples[i+0] * sL2;
mixBuffer[i*6+ 3] += samples[i+0] * sL3;
mixBuffer[i*6+ 4] += samples[i+0] * sL4;
mixBuffer[i*6+ 5] += samples[i+0] * sL5;
mixBuffer[i*6+ 6] += samples[i+1] * sL6;
mixBuffer[i*6+ 7] += samples[i+1] * sL7;
mixBuffer[i*6+ 8] += samples[i+1] * sL8;
mixBuffer[i*6+ 9] += samples[i+1] * sL9;
mixBuffer[i*6+10] += samples[i+1] * sL10;
mixBuffer[i*6+11] += samples[i+1] * sL11;
sL0 += incL0;
sL1 += incL1;
sL2 += incL2;
sL3 += incL3;
sL4 += incL4;
sL5 += incL5;
sL6 += incL0;
sL7 += incL1;
sL8 += incL2;
sL9 += incL3;
sL10 += incL4;
sL11 += incL5;
}
#endif
}
/*
============
idSIMD_SSE::MixSoundSixSpeakerStereo
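
  Mixes an interleaved stereo sample buffer into a 6 channel interleaved mix
  buffer; the left sample feeds channels 0, 2, 3 and 4 and the right sample
  feeds channels 1 and 5 (see the SPEAKER_RIGHT and SPEAKER_BACKRIGHT asserts).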
============
*/
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
#if 1
ALIGN16( float incs[6] );
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
__asm {
mov eax, MIXBUFFER_SAMPLES
mov edi, mixBuffer
mov esi, samples
shl eax, 3
add esi, eax
neg eax
mov ecx, lastV
movlps xmm2, [ecx+ 0]
movhps xmm2, [ecx+ 8]
movlps xmm3, [ecx+16]
movaps xmm4, xmm2
shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
xorps xmm5, xmm5
movhps xmm5, incs
movlps xmm7, incs+ 8
movhps xmm7, incs+16
addps xmm3, xmm5
addps xmm4, xmm7
shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
movaps xmm6, xmm7
shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
addps xmm5, xmm5
addps xmm6, xmm6
addps xmm7, xmm7
loop12:
add edi, 3*16
movaps xmm0, [esi+eax+0]
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
mulps xmm1, xmm2
addps xmm1, [edi-3*16]
addps xmm2, xmm5
movaps [edi-3*16], xmm1
movaps xmm1, xmm0
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
mulps xmm1, xmm3
addps xmm1, [edi-2*16]
addps xmm3, xmm6
movaps [edi-2*16], xmm1
add eax, 4*4
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
mulps xmm0, xmm4
addps xmm0, [edi-1*16]
addps xmm4, xmm7
movaps [edi-1*16], xmm0
jl loop12
emms
}
#else
int i;
float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
float incL0, incL1, incL2, incL3, incL4, incL5;
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
sL0 = lastV[0];
sL1 = lastV[1];
sL2 = lastV[2];
sL3 = lastV[3];
sL4 = lastV[4];
sL5 = lastV[5];
sL6 = lastV[0] + incL0;
sL7 = lastV[1] + incL1;
sL8 = lastV[2] + incL2;
sL9 = lastV[3] + incL3;
sL10 = lastV[4] + incL4;
sL11 = lastV[5] + incL5;
incL0 *= 2;
incL1 *= 2;
incL2 *= 2;
incL3 *= 2;
incL4 *= 2;
incL5 *= 2;
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
sL0 += incL0;
sL1 += incL1;
sL2 += incL2;
sL3 += incL3;
sL4 += incL4;
sL5 += incL5;
sL6 += incL0;
sL7 += incL1;
sL8 += incL2;
sL9 += incL3;
sL10 += incL4;
sL11 += incL5;
}
#endif
}
/*
============
idSIMD_SSE::MixedSoundToSamples
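
  Converts the floating point mix buffer to 16 bit samples, clamping to the
  [-32768, 32767] range.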
============
*/
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
#if 1
assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
__asm {
mov eax, numSamples
mov edi, mixBuffer
mov esi, samples
shl eax, 2
add edi, eax
neg eax
loop16:
movaps xmm0, [edi+eax+0*16]
movaps xmm2, [edi+eax+1*16]
movaps xmm4, [edi+eax+2*16]
movaps xmm6, [edi+eax+3*16]
add esi, 4*4*2
movhlps xmm1, xmm0
movhlps xmm3, xmm2
movhlps xmm5, xmm4
movhlps xmm7, xmm6
prefetchnta [edi+eax+64]
cvtps2pi mm0, xmm0
cvtps2pi mm2, xmm2
cvtps2pi mm4, xmm4
cvtps2pi mm6, xmm6
prefetchnta [edi+eax+128]
cvtps2pi mm1, xmm1
cvtps2pi mm3, xmm3
cvtps2pi mm5, xmm5
cvtps2pi mm7, xmm7
add eax, 4*16
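		// packssdw saturates the 32 bit results to the signed 16 bit range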
packssdw mm0, mm1
packssdw mm2, mm3
packssdw mm4, mm5
packssdw mm6, mm7
movq [esi-4*4*2], mm0
movq [esi-3*4*2], mm2
movq [esi-2*4*2], mm4
movq [esi-1*4*2], mm6
jl loop16
emms
}
#else
for ( int i = 0; i < numSamples; i++ ) {
if ( mixBuffer[i] <= -32768.0f ) {
samples[i] = -32768;
} else if ( mixBuffer[i] >= 32767.0f ) {
samples[i] = 32767;
} else {
samples[i] = (short) mixBuffer[i];
}
}
#endif
}
#endif /* _WIN32 */