mirror of
https://github.com/dhewm/dhewm3-sdk.git
synced 2024-12-01 16:52:10 +00:00
afebd7e1e5
Don't include the lazy precompiled.h everywhere, only what's required for the compilation unit. platform.h needs to be included instead to provide all essential defines and types. All includes use the relative path to the neo or the game specific root. Move all idlib related includes from idlib/Lib.h to precompiled.h. precompiled.h still exists for the MFC stuff in tools/. Add some missing header guards.
18077 lines
462 KiB
C++
18077 lines
462 KiB
C++
/*
===========================================================================

Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.

This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").

Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Doom 3 Source Code.  If not, see <http://www.gnu.org/licenses/>.

In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code.  If not, please request a copy in writing from id Software at the address below.

If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.

===========================================================================
*/
|
|
|
|
#include "sys/platform.h"
|
|
#include "idlib/geometry/DrawVert.h"
|
|
|
|
#include "idlib/math/Simd_SSE.h"
|
|
|
|
//===============================================================
//                                                        M
//  SSE implementation of idSIMDProcessor                MrE
//                                                        E
//===============================================================
|
|
|
|
// Byte size of idDrawVert and byte offsets of its members, as hard-coded into the
// SIMD routines in this file. The routines assert the size and the xyz offset against
// the real structure; the remaining offsets are named after the members they are
// expected to address. Offsets are written as a count of 4-byte elements times 4.
#define DRAWVERT_SIZE				60
#define DRAWVERT_XYZ_OFFSET			( 0 * 4 )
#define DRAWVERT_ST_OFFSET			( 3 * 4 )
#define DRAWVERT_NORMAL_OFFSET		( 5 * 4 )
#define DRAWVERT_TANGENT0_OFFSET	( 8 * 4 )
#define DRAWVERT_TANGENT1_OFFSET	( 11 * 4 )
#define DRAWVERT_COLOR_OFFSET		( 14 * 4 )
|
|
|
|
#if defined(__GNUC__) && defined(__SSE__)
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
// Build the 8-bit immediate for shufps / _mm_shuffle_ps from four 2-bit lane selectors.
// SHUFFLEPS puts its first argument in bits 7..6 and its last argument in bits 1..0.
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
// R_SHUFFLEPS takes the selectors in the reverse order: first argument in bits 1..0.
#define R_SHUFFLEPS( x, y, z, w )	SHUFFLEPS( w, z, y, x )
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_SSE::GetName( void ) const {
|
|
return "MMX & SSE";
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
|
|
// 0, 1, 2
|
|
// 3, 4, 5
|
|
// 6, 7, 8
|
|
// 9, 10, 11
|
|
|
|
/*
|
|
mov eax, count
|
|
mov edi, constant
|
|
mov edx, eax
|
|
mov esi, src
|
|
mov ecx, dst
|
|
*/
|
|
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // Declare 8 xmm registers.
|
|
int count_l4 = count; // count_l4 = eax
|
|
int count_l1 = count; // count_l1 = edx
|
|
char *constant_p = (char *)&constant; // constant_p = edi
|
|
char *src_p = (char *) src; // src_p = esi
|
|
char *dst_p = (char *) dst; // dst_p = ecx
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET );
|
|
|
|
/*
|
|
and eax, ~3
|
|
movss xmm4, [edi+0]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm5, [edi+4]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm6, [edi+8]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm7, [edi+12]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
*/
|
|
count_l4 = count_l4 & ~3;
|
|
xmm4 = _mm_load_ss((float *) (constant_p));
|
|
xmm4 = _mm_shuffle_ps(xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
xmm5 = _mm_load_ss((float *) (constant_p + 4));
|
|
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
xmm6 = _mm_load_ss((float *) (constant_p + 8));
|
|
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
xmm7 = _mm_load_ss((float *) (constant_p + 12));
|
|
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
|
|
/*
|
|
jz startVert1
|
|
*/
|
|
if(count_l4 != 0) {
|
|
/*
|
|
imul eax, DRAWVERT_SIZE
|
|
add esi, eax
|
|
neg eax
|
|
*/
|
|
count_l4 = count_l4 * DRAWVERT_SIZE;
|
|
src_p = src_p + count_l4;
|
|
count_l4 = -count_l4;
|
|
/*
|
|
loopVert4:
|
|
*/
|
|
do {
|
|
/*
|
|
movss xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, X, X
|
|
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 2, X, X, X
|
|
movhps xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 3, X, 0, 1
|
|
movaps xmm1, xmm0 // 3, X, 0, 1
|
|
*/
|
|
xmm0 = _mm_load_ss((float *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, X, X
|
|
xmm2 = _mm_load_ss((float *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 2, X, X, X
|
|
xmm0 = _mm_loadh_pi(xmm0, (__m64 *) (src_p+count_l4+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 3, X, 0, 1
|
|
xmm1 = xmm0; // 3, X, 0, 1
|
|
|
|
/*
|
|
movlps xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 4, 5, 0, 1
|
|
shufps xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) // 2, X, 4, 5
|
|
*/
|
|
xmm1 = _mm_loadl_pi(xmm1, (__m64 *) (src_p+count_l4+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 4, 5, 0, 1
|
|
xmm2 = _mm_shuffle_ps(xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )); // 2, X, 4, 5
|
|
|
|
/*
|
|
movss xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, X, X
|
|
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0] // 9, X, 6, 7
|
|
shufps xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 ) // 0, 3, 6, 9
|
|
*/
|
|
xmm3 = _mm_load_ss((float *) (src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, X, X
|
|
xmm3 = _mm_loadh_pi(xmm3, (__m64 *) (src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0)); // 9, X, 6, 7
|
|
xmm0 = _mm_shuffle_ps(xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )); // 0, 3, 6, 9
|
|
/*
|
|
movlps xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4] // 10, 11, 6, 7
|
|
shufps xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 1, 4, 7, 10
|
|
*/
|
|
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4)); // 10, 11, 6, 7
|
|
xmm1 = _mm_shuffle_ps(xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )); // 1, 4, 7, 10
|
|
/*
|
|
movhps xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8] // 10, 11, 8, X
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 ) // 2, 5, 8, 11
|
|
*/
|
|
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8)); // 10, 11, 8, X
|
|
xmm2 = _mm_shuffle_ps(xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )); // 2, 5, 8, 11
|
|
|
|
/*
|
|
add ecx, 16
|
|
add eax, 4*DRAWVERT_SIZE
|
|
*/
|
|
dst_p = dst_p + 16;
|
|
count_l4 = count_l4 + 4*DRAWVERT_SIZE;
|
|
|
|
/*
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
addps xmm0, xmm7
|
|
addps xmm0, xmm1
|
|
addps xmm0, xmm2
|
|
*/
|
|
xmm0 = _mm_mul_ps(xmm0, xmm4);
|
|
xmm1 = _mm_mul_ps(xmm1, xmm5);
|
|
xmm2 = _mm_mul_ps(xmm2, xmm6);
|
|
xmm0 = _mm_add_ps(xmm0, xmm7);
|
|
xmm0 = _mm_add_ps(xmm0, xmm1);
|
|
xmm0 = _mm_add_ps(xmm0, xmm2);
|
|
|
|
/*
|
|
movlps [ecx-16+0], xmm0
|
|
movhps [ecx-16+8], xmm0
|
|
jl loopVert4
|
|
*/
|
|
_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
|
|
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
|
|
} while(count_l4 < 0);
|
|
}
|
|
|
|
/*
|
|
startVert1:
|
|
and edx, 3
|
|
jz done
|
|
*/
|
|
count_l1 = count_l1 & 3;
|
|
if(count_l1 != 0) {
|
|
/*
|
|
loopVert1:
|
|
movss xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
|
|
movss xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
|
|
movss xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
|
|
mulss xmm0, xmm4
|
|
mulss xmm1, xmm5
|
|
mulss xmm2, xmm6
|
|
addss xmm0, xmm7
|
|
add ecx, 4
|
|
addss xmm0, xmm1
|
|
add eax, DRAWVERT_SIZE
|
|
addss xmm0, xmm2
|
|
dec edx
|
|
movss [ecx-4], xmm0
|
|
jnz loopVert1
|
|
*/
|
|
do {
|
|
xmm0 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+0));
|
|
xmm1 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+4));
|
|
xmm2 = _mm_load_ss((float *) (src_p+count_l4+DRAWVERT_XYZ_OFFSET+8));
|
|
xmm0 = _mm_mul_ss(xmm0, xmm4);
|
|
xmm1 = _mm_mul_ss(xmm1, xmm5);
|
|
xmm2 = _mm_mul_ss(xmm2, xmm6);
|
|
xmm0 = _mm_add_ss(xmm0, xmm7);
|
|
dst_p = dst_p + 4;
|
|
xmm0 = _mm_add_ss(xmm0, xmm1);
|
|
count_l4 = count_l4 + DRAWVERT_SIZE;
|
|
xmm0 = _mm_add_ss(xmm0, xmm2);
|
|
count_l1 = count_l1 - 1;
|
|
_mm_store_ss((float *) (dst_p-4), xmm0);
|
|
} while( count_l1 != 0);
|
|
}
|
|
/*
|
|
done:
|
|
*/
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( ptrdiff_t(&src->xyz) - ptrdiff_t(src) == DRAWVERT_XYZ_OFFSET );
|
|
|
|
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
|
char *indexes_p;
|
|
char *src_p;
|
|
int count_l;
|
|
int edx;
|
|
char *min_p;
|
|
char *max_p;
|
|
|
|
/*
|
|
movss xmm0, idMath::INFINITY
|
|
xorps xmm1, xmm1
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
subps xmm1, xmm0
|
|
movaps xmm2, xmm0
|
|
movaps xmm3, xmm1
|
|
*/
|
|
xmm0 = _mm_load_ss(&idMath::INFINITY);
|
|
// To satisfy the compiler use xmm0 instead.
|
|
xmm1 = _mm_xor_ps(xmm0, xmm0);
|
|
xmm0 = _mm_shuffle_ps(xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
xmm1 = _mm_sub_ps(xmm1, xmm0);
|
|
xmm2 = xmm0;
|
|
xmm3 = xmm1;
|
|
|
|
/*
|
|
mov edi, indexes
|
|
mov esi, src
|
|
mov eax, count
|
|
and eax, ~3
|
|
jz done4
|
|
*/
|
|
indexes_p = (char *) indexes;
|
|
src_p = (char *) src;
|
|
count_l = count;
|
|
count_l = count_l & ~3;
|
|
if(count_l != 0) {
|
|
/*
|
|
shl eax, 2
|
|
add edi, eax
|
|
neg eax
|
|
*/
|
|
count_l = count_l << 2;
|
|
indexes_p = indexes_p + count_l;
|
|
count_l = -count_l;
|
|
/*
|
|
loop4:
|
|
// prefetchnta [edi+128]
|
|
// prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
|
|
*/
|
|
do {
|
|
/*
|
|
mov edx, [edi+eax+0]
|
|
imul edx, DRAWVERT_SIZE
|
|
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
|
|
minps xmm0, xmm4
|
|
maxps xmm1, xmm4
|
|
*/
|
|
edx = *((int*)(indexes_p+count_l+0));
|
|
edx = edx * DRAWVERT_SIZE;
|
|
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
|
|
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
|
|
xmm0 = _mm_min_ps(xmm0, xmm4);
|
|
xmm1 = _mm_max_ps(xmm1, xmm4);
|
|
|
|
/*
|
|
mov edx, [edi+eax+4]
|
|
imul edx, DRAWVERT_SIZE
|
|
movss xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
|
|
minps xmm2, xmm5
|
|
maxps xmm3, xmm5
|
|
*/
|
|
edx = *((int*)(indexes_p+count_l+4));
|
|
edx = edx * DRAWVERT_SIZE;
|
|
xmm5 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
|
|
xmm5 = _mm_loadh_pi(xmm5, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
|
|
xmm2 = _mm_min_ps(xmm2, xmm5);
|
|
xmm3 = _mm_max_ps(xmm3, xmm5);
|
|
|
|
/*
|
|
mov edx, [edi+eax+8]
|
|
imul edx, DRAWVERT_SIZE
|
|
movss xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
|
|
minps xmm0, xmm6
|
|
maxps xmm1, xmm6
|
|
*/
|
|
edx = *((int*)(indexes_p+count_l+8));
|
|
edx = edx * DRAWVERT_SIZE;
|
|
xmm6 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
|
|
xmm6 = _mm_loadh_pi(xmm6, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
|
|
xmm0 = _mm_min_ps(xmm0, xmm6);
|
|
xmm1 = _mm_max_ps(xmm1, xmm6);
|
|
|
|
/*
|
|
mov edx, [edi+eax+12]
|
|
imul edx, DRAWVERT_SIZE
|
|
movss xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
|
|
minps xmm2, xmm7
|
|
maxps xmm3, xmm7
|
|
*/
|
|
edx = *((int*)(indexes_p+count_l+12));
|
|
edx = edx * DRAWVERT_SIZE;
|
|
xmm7 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0));
|
|
xmm7 = _mm_loadh_pi(xmm7, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+4) );
|
|
xmm2 = _mm_min_ps(xmm2, xmm7);
|
|
xmm3 = _mm_max_ps(xmm3, xmm7);
|
|
|
|
/*
|
|
add eax, 4*4
|
|
jl loop4
|
|
*/
|
|
count_l = count_l + 4*4;
|
|
} while (count_l < 0);
|
|
}
|
|
/*
|
|
done4:
|
|
mov eax, count
|
|
and eax, 3
|
|
jz done1
|
|
*/
|
|
count_l = count;
|
|
count_l = count_l & 3;
|
|
if(count_l != 0) {
|
|
/*
|
|
shl eax, 2
|
|
add edi, eax
|
|
neg eax
|
|
*/
|
|
count_l = count_l << 2;
|
|
indexes_p = indexes_p + count_l;
|
|
count_l = -count_l;
|
|
/*
|
|
loop1:
|
|
*/
|
|
do{
|
|
/*
|
|
mov edx, [edi+eax+0]
|
|
imul edx, DRAWVERT_SIZE;
|
|
movss xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
|
|
minps xmm0, xmm4
|
|
maxps xmm1, xmm4
|
|
*/
|
|
edx = *((int*)(indexes_p+count_l+0));
|
|
edx = edx * DRAWVERT_SIZE;
|
|
xmm4 = _mm_load_ss((float *) (src_p+edx+DRAWVERT_XYZ_OFFSET+8));
|
|
xmm4 = _mm_loadh_pi(xmm4, (__m64 *) (src_p+edx+DRAWVERT_XYZ_OFFSET+0) );
|
|
xmm0 = _mm_min_ps(xmm0, xmm4);
|
|
xmm1 = _mm_max_ps(xmm1, xmm4);
|
|
|
|
/*
|
|
add eax, 4
|
|
jl loop1
|
|
*/
|
|
count_l = count_l + 4;
|
|
} while (count_l < 0);
|
|
|
|
}
|
|
|
|
/*
|
|
done1:
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
|
|
minps xmm0, xmm2
|
|
maxps xmm1, xmm3
|
|
mov esi, min
|
|
movhps [esi], xmm0
|
|
movss [esi+8], xmm0
|
|
mov edi, max
|
|
movhps [edi], xmm1
|
|
movss [edi+8], xmm1
|
|
*/
|
|
xmm2 = _mm_shuffle_ps(xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 ));
|
|
xmm3 = _mm_shuffle_ps(xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 ));
|
|
xmm0 = _mm_min_ps(xmm0, xmm2);
|
|
xmm1 = _mm_max_ps(xmm1, xmm3);
|
|
min_p = (char *) &min;
|
|
_mm_storeh_pi((__m64 *)(min_p), xmm0);
|
|
_mm_store_ss((float *)(min_p+8), xmm0);
|
|
max_p = (char *) &max;
|
|
_mm_storeh_pi((__m64 *)(max_p), xmm1);
|
|
_mm_store_ss((float *)(max_p+8), xmm1);
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant * src[i].Normal() + src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
|
|
int count_l4;
|
|
int count_l1;
|
|
char *constant_p;
|
|
char *src_p;
|
|
char *dst_p;
|
|
__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
|
|
|
/*
|
|
mov eax, count
|
|
mov edi, constant
|
|
mov edx, eax
|
|
mov esi, src
|
|
mov ecx, dst
|
|
and eax, ~3
|
|
*/
|
|
count_l4 = count;
|
|
constant_p = (char *) &constant;
|
|
count_l1 = count_l4;
|
|
src_p = (char *) src;
|
|
dst_p = (char *) dst;
|
|
count_l4 = count_l4 & ~3;
|
|
|
|
/*
|
|
movss xmm5, [edi+0]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm6, [edi+4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm7, [edi+8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
*/
|
|
xmm5 = _mm_load_ss((float *) (constant_p+0));
|
|
xmm5 = _mm_shuffle_ps(xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
xmm6 = _mm_load_ss((float *) (constant_p+4));
|
|
xmm6 = _mm_shuffle_ps(xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
xmm7 = _mm_load_ss((float *) (constant_p+8));
|
|
xmm7 = _mm_shuffle_ps(xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ));
|
|
|
|
/*
|
|
jz startVert1
|
|
*/
|
|
if (count != 0) {
|
|
/*
|
|
imul eax, 16
|
|
add esi, eax
|
|
neg eax
|
|
*/
|
|
count_l4 = count_l4 * 16;
|
|
src_p = src_p + count_l4;
|
|
count_l4 = -count_l4;
|
|
/*
|
|
loopVert4:
|
|
*/
|
|
do {
|
|
/*
|
|
movlps xmm1, [esi+eax+ 0]
|
|
movlps xmm3, [esi+eax+ 8]
|
|
movhps xmm1, [esi+eax+16]
|
|
movhps xmm3, [esi+eax+24]
|
|
movlps xmm2, [esi+eax+32]
|
|
movlps xmm4, [esi+eax+40]
|
|
movhps xmm2, [esi+eax+48]
|
|
movhps xmm4, [esi+eax+56]
|
|
movaps xmm0, xmm1
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
movaps xmm2, xmm3
|
|
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
*/
|
|
xmm1 = _mm_loadl_pi(xmm1, (__m64 *)(src_p+count_l4+ 0));
|
|
xmm3 = _mm_loadl_pi(xmm3, (__m64 *)(src_p+count_l4+ 8));
|
|
xmm1 = _mm_loadh_pi(xmm1, (__m64 *)(src_p+count_l4+16));
|
|
xmm3 = _mm_loadh_pi(xmm3, (__m64 *)(src_p+count_l4+24));
|
|
xmm2 = _mm_loadl_pi(xmm2, (__m64 *)(src_p+count_l4+32));
|
|
xmm4 = _mm_loadl_pi(xmm4, (__m64 *)(src_p+count_l4+40));
|
|
xmm2 = _mm_loadh_pi(xmm2, (__m64 *)(src_p+count_l4+48));
|
|
xmm4 = _mm_loadh_pi(xmm4, (__m64 *)(src_p+count_l4+56));
|
|
|
|
xmm0 = xmm1;
|
|
xmm0 = _mm_shuffle_ps(xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 ));
|
|
xmm1 = _mm_shuffle_ps(xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 ));
|
|
xmm2 = xmm3;
|
|
xmm2 = _mm_shuffle_ps(xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ));
|
|
xmm3 = _mm_shuffle_ps(xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ));
|
|
|
|
/*
|
|
add ecx, 16
|
|
add eax, 4*16
|
|
*/
|
|
dst_p = dst_p + 16;
|
|
count_l4 = count_l4 + 4*16;
|
|
|
|
/*
|
|
mulps xmm0, xmm5
|
|
mulps xmm1, xmm6
|
|
mulps xmm2, xmm7
|
|
addps xmm0, xmm3
|
|
addps xmm0, xmm1
|
|
addps xmm0, xmm2
|
|
*/
|
|
xmm0 = _mm_mul_ps(xmm0, xmm5);
|
|
xmm1 = _mm_mul_ps(xmm1, xmm6);
|
|
xmm2 = _mm_mul_ps(xmm2, xmm7);
|
|
xmm0 = _mm_add_ps(xmm0, xmm3);
|
|
xmm0 = _mm_add_ps(xmm0, xmm1);
|
|
xmm0 = _mm_add_ps(xmm0, xmm2);
|
|
|
|
/*
|
|
movlps [ecx-16+0], xmm0
|
|
movhps [ecx-16+8], xmm0
|
|
jl loopVert4
|
|
*/
|
|
_mm_storel_pi((__m64 *) (dst_p-16+0), xmm0);
|
|
_mm_storeh_pi((__m64 *) (dst_p-16+8), xmm0);
|
|
} while (count_l4 < 0);
|
|
}
|
|
|
|
/*
|
|
startVert1:
|
|
and edx, 3
|
|
jz done
|
|
*/
|
|
count_l1 = count_l1 & 3;
|
|
|
|
if(count_l1 != 0) {
|
|
/*
|
|
loopVert1:
|
|
*/
|
|
do {
|
|
/*
|
|
movss xmm0, [esi+eax+0]
|
|
movss xmm1, [esi+eax+4]
|
|
movss xmm2, [esi+eax+8]
|
|
mulss xmm0, xmm5
|
|
mulss xmm1, xmm6
|
|
mulss xmm2, xmm7
|
|
addss xmm0, [esi+eax+12]
|
|
add ecx, 4
|
|
addss xmm0, xmm1
|
|
add eax, 16
|
|
addss xmm0, xmm2
|
|
dec edx
|
|
movss [ecx-4], xmm0
|
|
jnz loopVert1
|
|
*/
|
|
xmm0 = _mm_load_ss((float *) (src_p+count_l4+ 0));
|
|
xmm1 = _mm_load_ss((float *) (src_p+count_l4+ 4));
|
|
xmm2 = _mm_load_ss((float *) (src_p+count_l4+ 8));
|
|
xmm3 = _mm_load_ss((float *) (src_p+count_l4+12));
|
|
|
|
xmm0 = _mm_mul_ss(xmm0, xmm5);
|
|
xmm1 = _mm_mul_ss(xmm1, xmm6);
|
|
xmm2 = _mm_mul_ss(xmm2, xmm7);
|
|
|
|
xmm0 = _mm_add_ss(xmm0, xmm3);
|
|
dst_p = dst_p + 4;
|
|
xmm0 = _mm_add_ss(xmm0, xmm1);
|
|
count_l4 = count_l4 + 16;
|
|
xmm0 = _mm_add_ss(xmm0, xmm2);
|
|
count_l1 = count_l1 - 1;
|
|
_mm_store_ss((float *) (dst_p-4), xmm0);
|
|
} while (count_l1 != 0);
|
|
}
|
|
/*
|
|
done:
|
|
*/
|
|
}
|
|
|
|
#elif defined(_WIN32)
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
// Build the 8-bit shufps immediate from four 2-bit lane selectors.
// SHUFFLEPS puts its first argument in bits 7..6 and its last argument in bits 1..0;
// R_SHUFFLEPS takes the selectors in the reverse order (first argument in bits 1..0).
#define SHUFFLEPS( x, y, z, w )		( ( ( (x) & 3 ) << 6 ) | ( ( (y) & 3 ) << 4 ) | ( ( (z) & 3 ) << 2 ) | ( (w) & 3 ) )
#define R_SHUFFLEPS( x, y, z, w )	( ( ( (w) & 3 ) << 6 ) | ( ( (z) & 3 ) << 4 ) | ( ( (y) & 3 ) << 2 ) | ( (x) & 3 ) )
|
|
|
|
// Transposes, in place, a 4x4 float matrix held in four xmm registers
// ( row0 = 0..3, row1 = 4..7, row2 = 8..11, row3 = 12..15 ). tmp is overwritten as scratch.
// The numbers in the comments are element indices of the matrix before the transpose.
#define TRANSPOSE_4x4( row0, row1, row2, row3, tmp )												\
	__asm	movaps		tmp, row2									/* tmp  =  8,  9, 10, 11 */		\
	__asm	unpcklps	row2, row3									/* row2 =  8, 12,  9, 13 */		\
	__asm	unpckhps	tmp, row3									/* tmp  = 10, 14, 11, 15 */		\
	__asm	movaps		row3, row0									/* row3 =  0,  1,  2,  3 */		\
	__asm	unpcklps	row0, row1									/* row0 =  0,  4,  1,  5 */		\
	__asm	unpckhps	row3, row1									/* row3 =  2,  6,  3,  7 */		\
	__asm	movaps		row1, row0									/* row1 =  0,  4,  1,  5 */		\
	__asm	shufps		row0, row2, R_SHUFFLEPS( 0, 1, 0, 1 )		/* row0 =  0,  4,  8, 12 */		\
	__asm	shufps		row1, row2, R_SHUFFLEPS( 2, 3, 2, 3 )		/* row1 =  1,  5,  9, 13 */		\
	__asm	movaps		row2, row3									/* row2 =  2,  6,  3,  7 */		\
	__asm	shufps		row2, tmp, R_SHUFFLEPS( 0, 1, 0, 1 )		/* row2 =  2,  6, 10, 14 */		\
	__asm	shufps		row3, tmp, R_SHUFFLEPS( 2, 3, 2, 3 )		/* row3 =  3,  7, 11, 15 */
|
|
|
|
// Loads 16 consecutive floats starting at addr and leaves their 4x4 transpose in
// row0..row3; tmp is overwritten as scratch. Only 8-byte loads are used.
// The numbers in the comments are float indices relative to addr.
#define TRANPOSE_4x4_FROM_MEMORY( addr, row0, row1, row2, row3, tmp )								\
	__asm	movlps		row1, [addr+ 0]								/* row1 =  0,  1,  X,  X */		\
	__asm	movlps		row3, [addr+ 8]								/* row3 =  2,  3,  X,  X */		\
	__asm	movhps		row1, [addr+16]								/* row1 =  0,  1,  4,  5 */		\
	__asm	movhps		row3, [addr+24]								/* row3 =  2,  3,  6,  7 */		\
	__asm	movlps		row2, [addr+32]								/* row2 =  8,  9,  X,  X */		\
	__asm	movlps		tmp, [addr+40]								/* tmp  = 10, 11,  X,  X */		\
	__asm	movhps		row2, [addr+48]								/* row2 =  8,  9, 12, 13 */		\
	__asm	movhps		tmp, [addr+56]								/* tmp  = 10, 11, 14, 15 */		\
	__asm	movaps		row0, row1									/* row0 =  0,  1,  4,  5 */		\
	__asm	shufps		row0, row2, R_SHUFFLEPS( 0, 2, 0, 2 )		/* row0 =  0,  4,  8, 12 */		\
	__asm	shufps		row1, row2, R_SHUFFLEPS( 1, 3, 1, 3 )		/* row1 =  1,  5,  9, 13 */		\
	__asm	movaps		row2, row3									/* row2 =  2,  3,  6,  7 */		\
	__asm	shufps		row2, tmp, R_SHUFFLEPS( 0, 2, 0, 2 )		/* row2 =  2,  6, 10, 14 */		\
	__asm	shufps		row3, tmp, R_SHUFFLEPS( 1, 3, 1, 3 )		/* row3 =  3,  7, 11, 15 */
|
|
|
|
// Stores the transpose of the 4x4 matrix held in col0..col3
// ( col0 = 0,4,8,12  col1 = 1,5,9,13  col2 = 2,6,10,14  col3 = 3,7,11,15 )
// as 16 consecutive floats starting at addr. col0, col1, col2 and tmp are overwritten.
// Only 8-byte stores are used.
#define TRANPOSE_4x4_TO_MEMORY( addr, col0, col1, col2, col3, tmp )									\
	__asm	movaps		tmp, col0									/* tmp  =  0,  4,  8, 12 */		\
	__asm	unpcklps	col0, col1									/* col0 =  0,  1,  4,  5 */		\
	__asm	unpckhps	tmp, col1									/* tmp  =  8,  9, 12, 13 */		\
	__asm	movaps		col1, col2									/* col1 =  2,  6, 10, 14 */		\
	__asm	unpcklps	col2, col3									/* col2 =  2,  3,  6,  7 */		\
	__asm	unpckhps	col1, col3									/* col1 = 10, 11, 14, 15 */		\
	__asm	movlps		[addr+ 0], col0								/* floats  0,  1 */				\
	__asm	movlps		[addr+ 8], col2								/* floats  2,  3 */				\
	__asm	movhps		[addr+16], col0								/* floats  4,  5 */				\
	__asm	movhps		[addr+24], col2								/* floats  6,  7 */				\
	__asm	movlps		[addr+32], tmp								/* floats  8,  9 */				\
	__asm	movlps		[addr+40], col1								/* floats 10, 11 */				\
	__asm	movhps		[addr+48], tmp								/* floats 12, 13 */				\
	__asm	movhps		[addr+56], col1								/* floats 14, 15 */
|
|
|
|
// Regroups 12 floats held in three xmm registers ( r0 = 0..3, r1 = 4..7, r2 = 8..11 )
// so that afterwards r0 = 0,6,3,9  r1 = 1,7,4,10  r2 = 2,8,5,11. tmp is overwritten as scratch.
// Note the lane order of the result: elements 3 and 6 ( resp. 4/7, 5/8 ) are swapped
// compared to a plain stride-3 gather.
#define TRANSPOSE_4x3( r0, r1, r2, tmp )															\
	__asm	movaps		tmp, r2										/* tmp = 8,  9, 10, 11 */		\
	__asm	shufps		tmp, r1, R_SHUFFLEPS( 2, 3, 0, 1 )			/* tmp = 10, 11, 4,  5 */		\
	__asm	shufps		r2, r0, R_SHUFFLEPS( 0, 1, 2, 3 )			/* r2  = 8,  9,  2,  3 */		\
	__asm	shufps		r1, r0, R_SHUFFLEPS( 2, 3, 0, 1 )			/* r1  = 6,  7,  0,  1 */		\
	__asm	movaps		r0, r1										/* r0  = 6,  7,  0,  1 */		\
	__asm	shufps		r0, r2, R_SHUFFLEPS( 2, 0, 3, 1 )			/* r0  = 0,  6,  3,  9 */		\
	__asm	shufps		r1, tmp, R_SHUFFLEPS( 3, 1, 2, 0 )			/* r1  = 1,  7,  4, 10 */		\
	__asm	shufps		r2, tmp, R_SHUFFLEPS( 2, 0, 3, 1 )			/* r2  = 2,  8,  5, 11 */
|
|
|
|
// Loads 12 consecutive floats starting at addr and regroups them so that
// r0 = 0,6,3,9  r1 = 1,7,4,10  r2 = 2,8,5,11. tmp is overwritten as scratch.
// Only 8-byte loads are used. The numbers are float indices relative to addr.
#define TRANSPOSE_4x3_FROM_MEMORY( addr, r0, r1, r2, tmp )											\
	__asm	movlps		r1, [addr+ 0]								/* r1  = 0,  1,  X,  X */		\
	__asm	movlps		r2, [addr+ 8]								/* r2  = 2,  3,  X,  X */		\
	__asm	movlps		tmp, [addr+16]								/* tmp = 4,  5,  X,  X */		\
	__asm	movhps		r1, [addr+24]								/* r1  = 0,  1,  6,  7 */		\
	__asm	movhps		r2, [addr+32]								/* r2  = 2,  3,  8,  9 */		\
	__asm	movhps		tmp, [addr+40]								/* tmp = 4,  5, 10, 11 */		\
	__asm	movaps		r0, r1										/* r0  = 0,  1,  6,  7 */		\
	__asm	shufps		r0, r2, R_SHUFFLEPS( 0, 2, 1, 3 )			/* r0  = 0,  6,  3,  9 */		\
	__asm	shufps		r1, tmp, R_SHUFFLEPS( 1, 3, 0, 2 )			/* r1  = 1,  7,  4, 10 */		\
	__asm	shufps		r2, tmp, R_SHUFFLEPS( 0, 2, 1, 3 )			/* r2  = 2,  8,  5, 11 */
|
|
|
|
// Inverse of TRANSPOSE_4x3_FROM_MEMORY: takes r0 = 0,6,3,9  r1 = 1,7,4,10  r2 = 2,8,5,11
// and stores floats 0..11 consecutively starting at addr. All four registers are
// overwritten. Only 8-byte stores are used.
#define TRANSPOSE_4x3_TO_MEMORY( addr, r0, r1, r2, tmp )											\
	__asm	movhlps		tmp, r0										/* tmp = 3,  9,  X,  X */		\
	__asm	unpcklps	r0, r1										/* r0  = 0,  1,  6,  7 */		\
	__asm	unpckhps	r1, r2										/* r1  = 4,  5, 10, 11 */		\
	__asm	unpcklps	r2, tmp										/* r2  = 2,  3,  8,  9 */		\
	__asm	movlps		[addr+ 0], r0								/* floats  0,  1 */				\
	__asm	movlps		[addr+ 8], r2								/* floats  2,  3 */				\
	__asm	movlps		[addr+16], r1								/* floats  4,  5 */				\
	__asm	movhps		[addr+24], r0								/* floats  6,  7 */				\
	__asm	movhps		[addr+32], r2								/* floats  8,  9 */				\
	__asm	movhps		[addr+40], r1								/* floats 10, 11 */
|
|
|
|
|
|
// Loop set-up for the float array kernels, with alignment of the destination.
// The one- and two-array forms reuse the three-array form by repeating an operand.
#define KFLOATINITS(   SRC0, COUNT, PRE, POST )				KFLOATINITDSS( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD(   DST, COUNT, PRE, POST )				KFLOATINITDSS( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS(  DST, SRC0, COUNT, PRE, POST )		KFLOATINITDSS( DST,SRC0,SRC0,COUNT,PRE,POST )

// Splits COUNT floats into PRE leading elements, a main part that is a multiple of
// eight floats, and POST trailing elements:
//   PRE  = ( -( DST >> 2 ) ) & 3, clamped to COUNT (elements before DST reaches a
//          16-byte boundary, presuming DST is 4-byte aligned)
//   POST = ( COUNT - PRE ) & 7
// If the main part is empty the macro jumps to the label 'done', which the expanding
// code has to provide. Otherwise on exit:
//   edx, esi, edi = SRC0, SRC1, DST advanced to the end of the main part
//   ebx           = minus the byte size of the main part (counts up to zero)
//   eax           = edx | esi, for the caller's source alignment test
// ecx is used as scratch.
#define KFLOATINITDSS( DST, SRC0, SRC1, COUNT, PRE, POST )											\
	__asm	mov		ecx, DST																		\
	__asm	shr		ecx, 2																			\
	__asm	mov		ebx, COUNT																		\
	__asm	neg		ecx																				\
	__asm	mov		edx, SRC0																		\
	__asm	and		ecx, 3							/* ecx = floats up to the 16-byte boundary */	\
	__asm	mov		esi, SRC1																		\
	__asm	sub		ebx, ecx						/* ebx = floats left after the lead-in */		\
	__asm	jge		noUnderFlow																		\
	__asm	xor		ebx, ebx						/* fewer floats than the lead-in needs */		\
	__asm	mov		ecx, COUNT																		\
	__asm noUnderFlow:																				\
	__asm	mov		PRE, ecx																		\
	__asm	mov		eax, ebx																		\
	__asm	mov		edi, DST																		\
	__asm	and		eax, 8-1																		\
	__asm	mov		POST, eax																		\
	__asm	and		ebx, 0xfffffff8					/* main part: multiple of eight floats */		\
	__asm	jle		done																			\
	__asm	shl		ebx, 2							/* floats -> bytes */							\
	__asm	lea		ecx, [ecx*4+ebx]				/* byte size of lead-in plus main part */		\
	__asm	neg		ebx																				\
	__asm	add		edx, ecx																		\
	__asm	add		esi, ecx																		\
	__asm	add		edi, ecx																		\
	__asm	mov		eax, edx																		\
	__asm	or		eax, esi
|
|
|
|
// Loop set-up for the float array kernels without aligning the destination (PRE is always 0).
// The one- and two-array forms reuse the three-array form by repeating an operand.
#define KFLOATINITS_NA(   SRC0, COUNT, PRE, POST )			KFLOATINITDSS_NA( SRC0,SRC0,SRC0,COUNT,PRE,POST )
#define KFLOATINITD_NA(   DST, COUNT, PRE, POST )			KFLOATINITDSS_NA( DST,DST,DST,COUNT,PRE,POST )
#define KFLOATINITDS_NA(  DST, SRC0, COUNT, PRE, POST )		KFLOATINITDSS_NA( DST,SRC0,SRC0,COUNT,PRE,POST )

// Splits COUNT floats into a main part that is a multiple of eight floats and POST
// trailing elements ( POST = COUNT & 7, PRE = 0 ).
// If the main part is empty the macro jumps to the label 'done', which the expanding
// code has to provide. Otherwise on exit:
//   edx, esi, edi = SRC0, SRC1, DST advanced to the end of the main part
//   ebx           = minus the byte size of the main part (counts up to zero)
//   eax           = edx | esi | edi, for the caller's alignment test
// The last line deliberately carries no line continuation, so the macro ends here
// regardless of what follows it in the file.
#define KFLOATINITDSS_NA( DST, SRC0, SRC1, COUNT, PRE, POST )										\
	__asm	mov		eax, COUNT																		\
	__asm	mov		PRE, 0																			\
	__asm	and		eax, 8-1																		\
	__asm	mov		ebx, COUNT																		\
	__asm	mov		POST, eax																		\
	__asm	and		ebx, 0xfffffff8					/* main part: multiple of eight floats */		\
	__asm	je		done																			\
	__asm	shl		ebx, 2							/* floats -> bytes */							\
	__asm	mov		edx, SRC0																		\
	__asm	mov		esi, SRC1																		\
	__asm	mov		edi, DST																		\
	__asm	add		edx, ebx																		\
	__asm	add		esi, ebx																		\
	__asm	add		edi, ebx																		\
	__asm	mov		eax, edx																		\
	__asm	or		eax, esi																		\
	__asm	or		eax, edi																		\
	__asm	neg		ebx
|
|
|
|
/*
	Register contract for the element operations (OPER / OPER4) below:

		edx = first source array  (s0)
		esi = second source array (s1)
		edi = destination array   (d)
		ebx = byte offset of the current element (index*4)

	xmm0 and xmm1 must be preserved by the operation; xmm2 is used as scratch.
*/

// DST = SRC0 for one float
#define KMOVDS1( DST, SRC0 )			\
	__asm	movss	xmm2, SRC0			\
	__asm	movss	DST, xmm2

// DST = SRC0 for four floats, no alignment requirement
#define KMOVDS4( DST, SRC0 )			\
	__asm	movups	xmm2, SRC0			\
	__asm	movups	DST, xmm2

// DST = min( DST, SRC0 ) for one float; DST is the minss destination operand
#define KMINDS1( DST, SRC0 )			\
	__asm	movss	xmm2, SRC0			\
	__asm	minss	DST, xmm2

// DST = max( DST, SRC0 ) for one float; DST is the maxss destination operand
#define KMAXDS1( DST, SRC0 )			\
	__asm	movss	xmm2, SRC0			\
	__asm	maxss	DST, xmm2
|
|
|
|
// Generic two-source ALU step: DST = SRC0 <OP> SRC1.
// OP is an instruction stem ( add, sub, mul, ... ) that is pasted onto the
// scalar ( ss ) or packed ( ps ) suffix. xmm2 ( and xmm3 for the packed form ) is scratch.
#define KALUDSS1( OP, DST, SRC0, SRC1 )	\
	__asm	movss	xmm2, SRC0			\
	__asm	OP##ss	xmm2, SRC1			\
	__asm	movss	DST, xmm2

#define KALUDSS4( OP, DST, SRC0, SRC1 )	\
	__asm	movups	xmm2, SRC0			\
	__asm	movups	xmm3, SRC1			\
	__asm	OP##ps	xmm2, xmm3			\
	__asm	movups	DST, xmm2

// add / subtract / multiply, one float ( ...1 ) or four floats ( ...4 )
#define KADDDSS1( DST, SRC0, SRC1 )		KALUDSS1( add, DST,SRC0,SRC1 )
#define KADDDSS4( DST, SRC0, SRC1 )		KALUDSS4( add, DST,SRC0,SRC1 )
#define KSUBDSS1( DST, SRC0, SRC1 )		KALUDSS1( sub, DST,SRC0,SRC1 )
#define KSUBDSS4( DST, SRC0, SRC1 )		KALUDSS4( sub, DST,SRC0,SRC1 )
#define KMULDSS1( DST, SRC0, SRC1 )		KALUDSS1( mul, DST,SRC0,SRC1 )
#define KMULDSS4( DST, SRC0, SRC1 )		KALUDSS4( mul, DST,SRC0,SRC1 )
|
|
|
|
// DST = SRC0 / SRC1 without a divide instruction: the reciprocal estimate r of SRC1 is
// refined with one Newton-Raphson step, r' = 2*r - SRC1*r*r, and multiplied by SRC0.
// xmm2 and xmm3 are scratch.
#define KDIVDSS1( DST, SRC0, SRC1 )		\
	__asm	movss	xmm2, SRC1			\
	__asm	rcpss	xmm3, xmm2			/* xmm3 = r, estimate of 1 / SRC1 */	\
	__asm	mulss	xmm2, xmm3			\
	__asm	mulss	xmm2, xmm3			/* xmm2 = SRC1 * r * r */				\
	__asm	addss	xmm3, xmm3			/* xmm3 = 2 * r */						\
	__asm	subss	xmm3, xmm2			/* xmm3 = refined reciprocal */			\
	__asm	mulss	xmm3, SRC0			\
	__asm	movss	DST, xmm3

// four-float form of KDIVDSS1, using unaligned loads and stores
#define KDIVDSS4( DST, SRC0, SRC1 )		\
	__asm	movups	xmm2, SRC1			\
	__asm	rcpps	xmm3, xmm2			/* xmm3 = r, estimate of 1 / SRC1 */	\
	__asm	mulps	xmm2, xmm3			\
	__asm	mulps	xmm2, xmm3			/* xmm2 = SRC1 * r * r */				\
	__asm	addps	xmm3, xmm3			/* xmm3 = 2 * r */						\
	__asm	subps	xmm3, xmm2			/* xmm3 = refined reciprocal */			\
	__asm	movups	xmm2, SRC0			\
	__asm	mulps	xmm3, xmm2			\
	__asm	movups	DST, xmm3
|
|
// Float to integer conversion with truncation. The result always goes to [edi+ebx];
// there is no destination parameter. The conversion runs through MMX register mm2.
// NOTE(review): the expanding code presumably has to execute emms afterwards - confirm at the call sites.
#define KF2IDS1( SRC0 )					\
	__asm	movss		xmm2, SRC0		\
	__asm	cvttps2pi	mm2, xmm2		\
	__asm	movd		[edi+ebx], mm2

// four floats: the low pair is converted first, then the register halves are swapped
// and the former high pair is converted
#define KF2IDS4( SRC0 )					\
	__asm	movups		xmm2, SRC0		\
	__asm	cvttps2pi	mm2, xmm2		\
	__asm	movq		[edi+ebx+0], mm2	\
	__asm	shufps		xmm2, xmm2, SHUFFLEPS(1,0,3,2)	\
	__asm	cvttps2pi	mm2, xmm2		\
	__asm	movq		[edi+ebx+8], mm2
|
|
// Refined reciprocal square root: with y = rsqrt( SRC0 ) this stores
//   DST = ( y * xmm0 ) * ( SRC0 * y * y - xmm1 )
// xmm0 and xmm1 are constants set up by the expanding code.
// NOTE(review): presumably xmm0 = -0.5 and xmm1 = 3 ( one Newton-Raphson step ) - confirm at the call sites.
// xmm2 and xmm3 are scratch.
#define KISQRTDS1( DST,SRC0 )			\
	__asm	movss	xmm2, SRC0			\
	__asm	rsqrtss	xmm3, xmm2			/* xmm3 = y */					\
	__asm	mulss	xmm2, xmm3			\
	__asm	mulss	xmm2, xmm3			/* xmm2 = SRC0 * y * y */		\
	__asm	subss	xmm2, xmm1			\
	__asm	mulss	xmm3, xmm0			\
	__asm	mulss	xmm3, xmm2			\
	__asm	movss	DST, xmm3

// four-float form of KISQRTDS1, using unaligned loads and stores
#define KISQRTDS4( DST,SRC0 )			\
	__asm	movups	xmm2, SRC0			\
	__asm	rsqrtps	xmm3, xmm2			/* xmm3 = y */					\
	__asm	mulps	xmm2, xmm3			\
	__asm	mulps	xmm2, xmm3			/* xmm2 = SRC0 * y * y */		\
	__asm	subps	xmm2, xmm1			\
	__asm	mulps	xmm3, xmm0			\
	__asm	mulps	xmm3, xmm2			\
	__asm	movups	DST, xmm3
|
|
|
|
// DST = SRC0 & VALUE on integer registers; used by the vector4 implementation to shift constant V4
#define KANDREGDSV( DST, SRC0, VALUE )	\
	__asm	mov		DST, SRC0			\
	__asm	and		DST, VALUE

// loads one float from SRC and replicates it into all four lanes of DST;
// used by the vector4 code to operate with float arrays as sources
#define KEXPANDFLOAT( DST, SRC )		\
	__asm	movss	DST, SRC			\
	__asm	shufps	DST, DST, 0

// in-place forms: DST = DST <op> SRC, for one float ( ...1 ) or four floats ( ...4 )
#define KADDDS1( DST,SRC )		KADDDSS1( DST,DST,SRC )
#define KADDDS4( DST,SRC )		KADDDSS4( DST,DST,SRC )
#define KSUBDS1( DST,SRC )		KSUBDSS1( DST,DST,SRC )
#define KSUBDS4( DST,SRC )		KSUBDSS4( DST,DST,SRC )
#define KMULDS1( DST,SRC )		KMULDSS1( DST,DST,SRC )
#define KMULDS4( DST,SRC )		KMULDSS4( DST,DST,SRC )
#define KDIVDS1( DST,SRC )		KDIVDSS1( DST,DST,SRC )
#define KDIVDS4( DST,SRC )		KDIVDSS4( DST,DST,SRC )
|
|
|
|
// Processes the elements that the main loop of a kernel leaves over: the first
// min( pre, COUNT ) elements and the last 'post' elements of the arrays.
// 'pre' and 'post' are variables of the expanding code. OPER handles the element at
// byte offset ebx, OPER4 the four elements starting at byte offset ebx; ecx counts
// the elements still to do. OPER4 is used at most once, when 'post' is at least 4.
#define KFLOATOPER( OPER, OPER4, COUNT )															\
	__asm	mov		ecx, pre																		\
	__asm	mov		ebx, COUNT																		\
	__asm	cmp		ebx, ecx																		\
	__asm	cmovl	ecx, COUNT						/* ecx = min( pre, COUNT ) */					\
	__asm	test	ecx, ecx																		\
	__asm	je		preDone																			\
	__asm	xor		ebx, ebx						/* leading elements start at offset 0 */		\
	__asm lpPre:																					\
		OPER																						\
	__asm	add		ebx, 4																			\
	__asm	dec		ecx																				\
	__asm	jg		lpPre																			\
	__asm preDone:																					\
	__asm	mov		ecx, post																		\
	__asm	mov		ebx, COUNT																		\
	__asm	sub		ebx, ecx																		\
	__asm	shl		ebx, 2							/* byte offset of the first trailing element */	\
	__asm	cmp		ecx, 4																			\
	__asm	jl		post4Done																		\
		OPER4																						\
	__asm	sub		ecx, 4																			\
	__asm	add		ebx, 4*4																		\
	__asm post4Done:																				\
	__asm	test	ecx, ecx																		\
	__asm	je		postDone																		\
	__asm lpPost:																					\
		OPER																						\
	__asm	add		ebx, 4																			\
	__asm	dec		ecx																				\
	__asm	jg		lpPost																			\
	__asm postDone:
|
|
|
|
// Kernel body: DST[i] = CONSTANT <ALUOP> SRC[i] for COUNT floats.
// ALUOP is an instruction stem ( add, sub, mul, ... ). The constant is broadcast into
// xmm0. The main loop handles eight floats per iteration with aligned stores; it uses
// the lpA variant when the source passes the 16-byte alignment test on eax and the
// lpNA variant ( unaligned loads ) otherwise. The leading and trailing elements are
// then handled by KFLOATOPER. Declares the locals 'pre' and 'post'.
// KFLOATOPER and the KALUDSS* steps already start with __asm, so they are expanded
// here without an additional __asm prefix.
#define KFLOAT_CA( ALUOP, DST, SRC, CONSTANT, COUNT )												\
	int	pre,post;																					\
	__asm	movss	xmm0, CONSTANT																	\
	__asm	shufps	xmm0, xmm0, 0																	\
	KFLOATINITDS( DST, SRC, COUNT, pre, post )														\
	__asm	and		eax, 15																			\
	__asm	jne		lpNA																			\
	__asm	jmp		lpA																				\
	__asm	align	16																				\
	__asm lpA:																						\
	__asm	prefetchnta	[edx+ebx+64]																\
	__asm	movaps	xmm1, xmm0																		\
	__asm	movaps	xmm2, xmm0																		\
	__asm	ALUOP##ps	xmm1, [edx+ebx]																\
	__asm	ALUOP##ps	xmm2, [edx+ebx+16]															\
	__asm	movaps	[edi+ebx], xmm1																	\
	__asm	movaps	[edi+ebx+16], xmm2																\
	__asm	add		ebx, 16*2																		\
	__asm	jl		lpA																				\
	__asm	jmp		done																			\
	__asm	align	16																				\
	__asm lpNA:																						\
	__asm	prefetchnta	[edx+ebx+64]																\
	__asm	movaps	xmm1, xmm0																		\
	__asm	movaps	xmm2, xmm0																		\
	__asm	movups	xmm3, [edx+ebx]																	\
	__asm	movups	xmm4, [edx+ebx+16]																\
	__asm	ALUOP##ps	xmm1, xmm3																	\
	__asm	ALUOP##ps	xmm2, xmm4																	\
	__asm	movaps	[edi+ebx], xmm1																	\
	__asm	movaps	[edi+ebx+16], xmm2																\
	__asm	add		ebx, 16*2																		\
	__asm	jl		lpNA																			\
	__asm done:																						\
	__asm	mov		edx, SRC																		\
	__asm	mov		edi, DST																		\
	KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],xmm0,[edx+ebx] ),										\
				KALUDSS4( ALUOP, [edi+ebx],xmm0,[edx+ebx] ), COUNT )
|
|
|
|
// Applies 'SRC0[i] <ALUOP> SRC1[i]' to two float arrays and writes DST[i].
//   ALUOP     - SSE mnemonic stem (add, sub, mul); 'ps' is pasted on for the packed form
//   DST       - destination float array
//   SRC0/SRC1 - left and right operand arrays
//   COUNT     - number of floats
// Declares the locals 'pre' and 'post' in the enclosing scope.
// KFLOATINITDSS is defined elsewhere; it presumably sets up edx/esi/edi/ebx/eax
// and pre/post for the main loop -- TODO confirm against its definition.
// The main loop handles eight floats per iteration. lpA uses aligned loads,
// lpNA loads both sources with movups; both store with movaps.
#define KFLOAT_AA( ALUOP, DST, SRC0, SRC1, COUNT )								\
	int	pre,post;																\
		KFLOATINITDSS( DST, SRC0, SRC1, COUNT, pre, post )						\
	__asm	and		eax,15				/* non-zero -> unaligned source path */	\
	__asm	jne		lpNA														\
	__asm	jmp		lpA															\
	__asm	align	16															\
	__asm	lpA:																\
	__asm	movaps	xmm1,[edx+ebx]												\
	__asm	movaps	xmm2,[edx+ebx+16]											\
	__asm	ALUOP##ps	xmm1,[esi+ebx]											\
	__asm	ALUOP##ps	xmm2,[esi+ebx+16]										\
	__asm	prefetchnta	[edx+ebx+64]											\
	__asm	prefetchnta	[esi+ebx+64]											\
	__asm	movaps	[edi+ebx],xmm1												\
	__asm	movaps	[edi+ebx+16],xmm2											\
	__asm	add		ebx,16*2			/* eight floats per iteration */		\
	__asm	jl		lpA															\
	__asm	jmp		done														\
	__asm	align	16															\
	__asm	lpNA:																\
	__asm	movups	xmm1,[edx+ebx]												\
	__asm	movups	xmm2,[edx+ebx+16]											\
	__asm	movups	xmm3,[esi+ebx]												\
	__asm	movups	xmm4,[esi+ebx+16]											\
	__asm	prefetchnta	[edx+ebx+64]											\
	__asm	prefetchnta	[esi+ebx+64]											\
	__asm	ALUOP##ps	xmm1,xmm3												\
	__asm	ALUOP##ps	xmm2,xmm4												\
	__asm	movaps	[edi+ebx],xmm1												\
	__asm	movaps	[edi+ebx+16],xmm2											\
	__asm	add		ebx,16*2													\
	__asm	jl		lpNA														\
	__asm	done:																\
	__asm	mov		edx,SRC0			/* reload base pointers for leftovers */	\
	__asm	mov		esi,SRC1													\
	__asm	mov		edi,DST														\
		KFLOATOPER( KALUDSS1( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ),			\
					KALUDSS4( ALUOP, [edi+ebx],[edx+ebx],[esi+ebx] ), COUNT )
|
|
|
|
|
|
#define DRAWVERT_SIZE 60
|
|
|
|
#define JOINTQUAT_SIZE (7*4)
|
|
#define JOINTMAT_SIZE (4*3*4)
|
|
#define JOINTWEIGHT_SIZE (4*4)
|
|
|
|
|
|
#define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
|
|
#define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
|
|
#define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
|
|
|
|
ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
|
|
ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
|
|
|
|
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle0, (3<<0)|(2<<8)|(1<<16)|(0<<24) );
|
|
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle1, (0<<0)|(1<<8)|(2<<16)|(3<<24) );
|
|
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle2, (1<<0)|(0<<8)|(3<<16)|(2<<24) );
|
|
ALIGN4_INIT1( unsigned int SIMD_DW_mat2quatShuffle3, (2<<0)|(3<<8)|(0<<16)|(1<<24) );
|
|
|
|
ALIGN4_INIT4( unsigned int SIMD_SP_singleSignBitMask, (unsigned int) ( 1 << 31 ), 0, 0, 0 );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_signBitMask, (unsigned int) ( 1 << 31 ) );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_absMask, (unsigned int) ~( 1 << 31 ) );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_infinityMask, (unsigned int) ~( 1 << 23 ) );
|
|
ALIGN4_INIT1( unsigned int SIMD_SP_not, 0xFFFFFFFF );
|
|
|
|
ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_half, 0.5f );
|
|
ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
|
|
ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
|
|
ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
|
|
ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
|
|
ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
|
|
ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );
|
|
|
|
ALIGN4_INIT1( float SIMD_SP_rsqrt_c0, 3.0f );
|
|
ALIGN4_INIT1( float SIMD_SP_rsqrt_c1, -0.5f );
|
|
ALIGN4_INIT1( float SIMD_SP_mat2quat_rsqrt_c1, -0.5f*0.5f );
|
|
|
|
ALIGN4_INIT1( float SIMD_SP_sin_c0, -2.39e-08f );
|
|
ALIGN4_INIT1( float SIMD_SP_sin_c1, 2.7526e-06f );
|
|
ALIGN4_INIT1( float SIMD_SP_sin_c2, -1.98409e-04f );
|
|
ALIGN4_INIT1( float SIMD_SP_sin_c3, 8.3333315e-03f );
|
|
ALIGN4_INIT1( float SIMD_SP_sin_c4, -1.666666664e-01f );
|
|
|
|
ALIGN4_INIT1( float SIMD_SP_cos_c0, -2.605e-07f );
|
|
ALIGN4_INIT1( float SIMD_SP_cos_c1, 2.47609e-05f );
|
|
ALIGN4_INIT1( float SIMD_SP_cos_c2, -1.3888397e-03f );
|
|
ALIGN4_INIT1( float SIMD_SP_cos_c3, 4.16666418e-02f );
|
|
ALIGN4_INIT1( float SIMD_SP_cos_c4, -4.999999963e-01f );
|
|
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c0, 0.0028662257f );
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c1, -0.0161657367f );
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c2, 0.0429096138f );
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c3, -0.0752896400f );
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c4, 0.1065626393f );
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c5, -0.1420889944f );
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c6, 0.1999355085f );
|
|
ALIGN4_INIT1( float SIMD_SP_atan_c7, -0.3333314528f );
|
|
|
|
/*
|
|
============
|
|
SSE_InvSqrt
|
|
============
|
|
*/
|
|
float SSE_InvSqrt( float x ) {
|
|
float y;
|
|
|
|
__asm {
|
|
movss xmm0, x
|
|
rsqrtss xmm1, xmm0
|
|
mulss xmm0, xmm1
|
|
mulss xmm0, xmm1
|
|
subss xmm0, SIMD_SP_rsqrt_c0
|
|
mulss xmm1, SIMD_SP_rsqrt_c1
|
|
mulss xmm0, xmm1
|
|
movss y, xmm0
|
|
}
|
|
return y;
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_InvSqrt4
|
|
============
|
|
*/
|
|
void SSE_InvSqrt4( float x[4] ) {
|
|
__asm {
|
|
mov edi, x
|
|
movaps xmm0, [edi]
|
|
rsqrtps xmm1, xmm0
|
|
mulps xmm0, xmm1
|
|
mulps xmm0, xmm1
|
|
subps xmm0, SIMD_SP_rsqrt_c0
|
|
mulps xmm1, SIMD_SP_rsqrt_c1
|
|
mulps xmm0, xmm1
|
|
movaps [edi], xmm0
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_SinZeroHalfPI
|
|
|
|
The angle must be between zero and half PI.
|
|
============
|
|
*/
|
|
float SSE_SinZeroHalfPI( float a ) {
|
|
#if 1
|
|
|
|
float t;
|
|
|
|
assert( a >= 0.0f && a <= idMath::HALF_PI );
|
|
|
|
__asm {
|
|
movss xmm0, a
|
|
movss xmm1, xmm0
|
|
mulss xmm1, xmm1
|
|
movss xmm2, SIMD_SP_sin_c0
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c1
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c2
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c3
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c4
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_one
|
|
mulss xmm2, xmm0
|
|
movss t, xmm2
|
|
}
|
|
|
|
return t;
|
|
|
|
#else
|
|
|
|
float s, t;
|
|
|
|
assert( a >= 0.0f && a <= idMath::HALF_PI );
|
|
|
|
s = a * a;
|
|
t = -2.39e-08f;
|
|
t *= s;
|
|
t += 2.7526e-06f;
|
|
t *= s;
|
|
t += -1.98409e-04f;
|
|
t *= s;
|
|
t += 8.3333315e-03f;
|
|
t *= s;
|
|
t += -1.666666664e-01f;
|
|
t *= s;
|
|
t += 1.0f;
|
|
t *= a;
|
|
|
|
return t;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_Sin4ZeroHalfPI
|
|
|
|
The angle must be between zero and half PI.
|
|
============
|
|
*/
|
|
void SSE_Sin4ZeroHalfPI( float a[4], float s[4] ) {
|
|
__asm {
|
|
mov edi, a
|
|
mov esi, s
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, xmm0
|
|
mulps xmm1, xmm1
|
|
movaps xmm2, SIMD_SP_sin_c0
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c1
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c2
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c3
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c4
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_one
|
|
mulps xmm2, xmm0
|
|
movaps [esi], xmm2
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_Sin
|
|
============
|
|
*/
|
|
float SSE_Sin( float a ) {
|
|
#if 1
|
|
|
|
float t;
|
|
|
|
__asm {
|
|
movss xmm1, a
|
|
movss xmm2, xmm1
|
|
movss xmm3, xmm1
|
|
mulss xmm2, SIMD_SP_oneOverTwoPI
|
|
cvttss2si ecx, xmm2
|
|
cmpltss xmm3, SIMD_SP_zero
|
|
andps xmm3, SIMD_SP_one
|
|
cvtsi2ss xmm2, ecx
|
|
subss xmm2, xmm3
|
|
mulss xmm2, SIMD_SP_twoPI
|
|
subss xmm1, xmm2
|
|
|
|
movss xmm0, SIMD_SP_PI // xmm0 = PI
|
|
subss xmm0, xmm1 // xmm0 = PI - a
|
|
movss xmm1, xmm0 // xmm1 = PI - a
|
|
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
|
|
movss xmm2, xmm0 // xmm2 = PI - a
|
|
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
|
|
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
|
|
movss xmm3, SIMD_SP_PI // xmm3 = PI
|
|
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
|
|
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
|
|
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
|
|
xorps xmm0, xmm2
|
|
addps xmm0, xmm3
|
|
|
|
movss xmm1, xmm0
|
|
mulss xmm1, xmm1
|
|
movss xmm2, SIMD_SP_sin_c0
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c1
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c2
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c3
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_sin_c4
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_one
|
|
mulss xmm2, xmm0
|
|
movss t, xmm2
|
|
}
|
|
|
|
return t;
|
|
|
|
#else
|
|
|
|
float s, t;
|
|
|
|
if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
|
|
a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
|
|
}
|
|
|
|
a = idMath::PI - a;
|
|
if ( fabs( a ) >= idMath::HALF_PI ) {
|
|
a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
|
|
}
|
|
|
|
s = a * a;
|
|
t = -2.39e-08f;
|
|
t *= s;
|
|
t += 2.7526e-06f;
|
|
t *= s;
|
|
t += -1.98409e-04f;
|
|
t *= s;
|
|
t += 8.3333315e-03f;
|
|
t *= s;
|
|
t += -1.666666664e-01f;
|
|
t *= s;
|
|
t += 1.0f;
|
|
t *= a;
|
|
|
|
return t;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_Sin4
|
|
============
|
|
*/
|
|
void SSE_Sin4( float a[4], float s[4] ) {
|
|
__asm {
|
|
mov edi, a
|
|
mov esi, s
|
|
movaps xmm1, [edi]
|
|
movaps xmm2, xmm1
|
|
mulps xmm2, SIMD_SP_oneOverTwoPI
|
|
movhlps xmm3, xmm2
|
|
cvttss2si ecx, xmm2
|
|
cvtsi2ss xmm2, ecx
|
|
cvttss2si edx, xmm3
|
|
cvtsi2ss xmm3, edx
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
cvttss2si ecx, xmm2
|
|
cvtsi2ss xmm2, ecx
|
|
cvttss2si edx, xmm3
|
|
cvtsi2ss xmm3, edx
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
|
|
movaps xmm3, xmm1
|
|
cmpltps xmm3, SIMD_SP_zero
|
|
andps xmm3, SIMD_SP_one
|
|
subps xmm2, xmm3
|
|
mulps xmm2, SIMD_SP_twoPI
|
|
subps xmm1, xmm2
|
|
|
|
movaps xmm0, SIMD_SP_PI // xmm0 = PI
|
|
subps xmm0, xmm1 // xmm0 = PI - a
|
|
movaps xmm1, xmm0 // xmm1 = PI - a
|
|
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
|
|
movaps xmm2, xmm0 // xmm2 = PI - a
|
|
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
|
|
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
|
|
movaps xmm3, SIMD_SP_PI // xmm3 = PI
|
|
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
|
|
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
|
|
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
|
|
xorps xmm0, xmm2
|
|
addps xmm0, xmm3
|
|
|
|
movaps xmm1, xmm0
|
|
mulps xmm1, xmm1
|
|
movaps xmm2, SIMD_SP_sin_c0
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c1
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c2
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c3
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_sin_c4
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_one
|
|
mulps xmm2, xmm0
|
|
movaps [esi], xmm2
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_CosZeroHalfPI
|
|
|
|
The angle must be between zero and half PI.
|
|
============
|
|
*/
|
|
float SSE_CosZeroHalfPI( float a ) {
|
|
#if 1
|
|
|
|
float t;
|
|
|
|
assert( a >= 0.0f && a <= idMath::HALF_PI );
|
|
|
|
__asm {
|
|
movss xmm0, a
|
|
mulss xmm0, xmm0
|
|
movss xmm1, SIMD_SP_cos_c0
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c1
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c2
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c3
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c4
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_one
|
|
movss t, xmm1
|
|
}
|
|
|
|
return t;
|
|
|
|
#else
|
|
|
|
float s, t;
|
|
|
|
assert( a >= 0.0f && a <= idMath::HALF_PI );
|
|
|
|
s = a * a;
|
|
t = -2.605e-07f;
|
|
t *= s;
|
|
t += 2.47609e-05f;
|
|
t *= s;
|
|
t += -1.3888397e-03f;
|
|
t *= s;
|
|
t += 4.16666418e-02f;
|
|
t *= s;
|
|
t += -4.999999963e-01f;
|
|
t *= s;
|
|
t += 1.0f;
|
|
|
|
return t;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_Cos4ZeroHalfPI
|
|
|
|
The angle must be between zero and half PI.
|
|
============
|
|
*/
|
|
void SSE_Cos4ZeroHalfPI( float a[4], float c[4] ) {
|
|
__asm {
|
|
mov edi, a
|
|
mov esi, c
|
|
movaps xmm0, [edi]
|
|
mulps xmm0, xmm0
|
|
movaps xmm1, SIMD_SP_cos_c0
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c1
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c2
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c3
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c4
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_one
|
|
movaps [esi], xmm2
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_Cos
|
|
============
|
|
*/
|
|
float SSE_Cos( float a ) {
|
|
#if 1
|
|
|
|
float t;
|
|
|
|
__asm {
|
|
movss xmm1, a
|
|
movss xmm2, xmm1
|
|
movss xmm3, xmm1
|
|
mulss xmm2, SIMD_SP_oneOverTwoPI
|
|
cvttss2si ecx, xmm2
|
|
cmpltss xmm3, SIMD_SP_zero
|
|
andps xmm3, SIMD_SP_one
|
|
cvtsi2ss xmm2, ecx
|
|
subss xmm2, xmm3
|
|
mulss xmm2, SIMD_SP_twoPI
|
|
subss xmm1, xmm2
|
|
|
|
movss xmm0, SIMD_SP_PI // xmm0 = PI
|
|
subss xmm0, xmm1 // xmm0 = PI - a
|
|
movss xmm1, xmm0 // xmm1 = PI - a
|
|
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
|
|
movss xmm2, xmm0 // xmm2 = PI - a
|
|
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
|
|
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
|
|
movss xmm3, SIMD_SP_PI // xmm3 = PI
|
|
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
|
|
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
|
|
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
|
|
xorps xmm0, xmm2
|
|
addps xmm0, xmm3
|
|
|
|
mulss xmm0, xmm0
|
|
movss xmm1, SIMD_SP_cos_c0
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c1
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c2
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c3
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_cos_c4
|
|
mulss xmm1, xmm0
|
|
addss xmm1, SIMD_SP_one
|
|
xorps xmm2, SIMD_SP_signBitMask
|
|
xorps xmm1, xmm2
|
|
movss t, xmm1
|
|
}
|
|
|
|
return t;
|
|
|
|
#else
|
|
|
|
float s, t;
|
|
|
|
if ( ( a < 0.0f ) || ( a >= idMath::TWO_PI ) ) {
|
|
a -= floorf( a / idMath::TWO_PI ) * idMath::TWO_PI;
|
|
}
|
|
|
|
a = idMath::PI - a;
|
|
if ( fabs( a ) >= idMath::HALF_PI ) {
|
|
a = ( ( a < 0.0f ) ? -idMath::PI : idMath::PI ) - a;
|
|
d = 1.0f;
|
|
} else {
|
|
d = -1.0f;
|
|
}
|
|
|
|
s = a * a;
|
|
t = -2.605e-07f;
|
|
t *= s;
|
|
t += 2.47609e-05f;
|
|
t *= s;
|
|
t += -1.3888397e-03f;
|
|
t *= s;
|
|
t += 4.16666418e-02f;
|
|
t *= s;
|
|
t += -4.999999963e-01f;
|
|
t *= s;
|
|
t += 1.0f;
|
|
t *= d;
|
|
|
|
return t;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_Cos4
|
|
============
|
|
*/
|
|
void SSE_Cos4( float a[4], float c[4] ) {
|
|
__asm {
|
|
mov edi, a
|
|
mov esi, c
|
|
movaps xmm1, [edi]
|
|
movaps xmm2, xmm1
|
|
mulps xmm2, SIMD_SP_oneOverTwoPI
|
|
movhlps xmm3, xmm2
|
|
cvttss2si ecx, xmm2
|
|
cvtsi2ss xmm2, ecx
|
|
cvttss2si edx, xmm3
|
|
cvtsi2ss xmm3, edx
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
cvttss2si ecx, xmm2
|
|
cvtsi2ss xmm2, ecx
|
|
cvttss2si edx, xmm3
|
|
cvtsi2ss xmm3, edx
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
|
|
movaps xmm3, xmm1
|
|
cmpltps xmm3, SIMD_SP_zero
|
|
andps xmm3, SIMD_SP_one
|
|
subps xmm2, xmm3
|
|
mulps xmm2, SIMD_SP_twoPI
|
|
subps xmm1, xmm2
|
|
|
|
movaps xmm0, SIMD_SP_PI // xmm0 = PI
|
|
subps xmm0, xmm1 // xmm0 = PI - a
|
|
movaps xmm1, xmm0 // xmm1 = PI - a
|
|
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
|
|
movaps xmm2, xmm0 // xmm2 = PI - a
|
|
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
|
|
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
|
|
movaps xmm3, SIMD_SP_PI // xmm3 = PI
|
|
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
|
|
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
|
|
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
|
|
xorps xmm0, xmm2
|
|
addps xmm0, xmm3
|
|
|
|
mulps xmm0, xmm0
|
|
movaps xmm1, SIMD_SP_cos_c0
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c1
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c2
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c3
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_cos_c4
|
|
mulps xmm1, xmm0
|
|
addps xmm1, SIMD_SP_one
|
|
xorps xmm2, SIMD_SP_signBitMask
|
|
xorps xmm1, xmm2
|
|
movaps [esi], xmm1
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_SinCos
|
|
============
|
|
*/
|
|
void SSE_SinCos( float a, float &s, float &c ) {
|
|
__asm {
|
|
mov edi, s
|
|
mov esi, c
|
|
movss xmm1, a
|
|
movss xmm2, xmm1
|
|
movss xmm3, xmm1
|
|
mulss xmm2, SIMD_SP_oneOverTwoPI
|
|
cvttss2si ecx, xmm2
|
|
cmpltss xmm3, SIMD_SP_zero
|
|
andps xmm3, SIMD_SP_one
|
|
cvtsi2ss xmm2, ecx
|
|
subss xmm2, xmm3
|
|
mulss xmm2, SIMD_SP_twoPI
|
|
subss xmm1, xmm2
|
|
|
|
movss xmm0, SIMD_SP_PI // xmm0 = PI
|
|
subss xmm0, xmm1 // xmm0 = PI - a
|
|
movss xmm1, xmm0 // xmm1 = PI - a
|
|
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
|
|
movss xmm2, xmm0 // xmm2 = PI - a
|
|
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
|
|
cmpnltss xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
|
|
movss xmm3, SIMD_SP_PI // xmm3 = PI
|
|
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
|
|
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
|
|
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
|
|
xorps xmm0, xmm2
|
|
addps xmm0, xmm3
|
|
|
|
movss xmm1, xmm0
|
|
mulss xmm1, xmm1
|
|
movss xmm3, SIMD_SP_sin_c0
|
|
movss xmm4, SIMD_SP_cos_c0
|
|
mulss xmm3, xmm1
|
|
mulss xmm4, xmm1
|
|
addss xmm3, SIMD_SP_sin_c1
|
|
addss xmm4, SIMD_SP_cos_c1
|
|
mulss xmm3, xmm1
|
|
mulss xmm4, xmm1
|
|
addss xmm3, SIMD_SP_sin_c2
|
|
addss xmm4, SIMD_SP_cos_c2
|
|
mulss xmm3, xmm1
|
|
mulss xmm4, xmm1
|
|
addss xmm3, SIMD_SP_sin_c3
|
|
addss xmm4, SIMD_SP_cos_c3
|
|
mulss xmm3, xmm1
|
|
mulss xmm4, xmm1
|
|
addss xmm3, SIMD_SP_sin_c4
|
|
addss xmm4, SIMD_SP_cos_c4
|
|
mulss xmm3, xmm1
|
|
mulss xmm4, xmm1
|
|
addss xmm3, SIMD_SP_one
|
|
addss xmm4, SIMD_SP_one
|
|
mulss xmm3, xmm0
|
|
xorps xmm2, SIMD_SP_signBitMask
|
|
xorps xmm4, xmm2
|
|
movss [edi], xmm2
|
|
movss [esi], xmm3
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_SinCos4
|
|
============
|
|
*/
|
|
void SSE_SinCos4( float a[4], float s[4], float c[4] ) {
|
|
__asm {
|
|
mov eax, a
|
|
mov edi, s
|
|
mov esi, c
|
|
movaps xmm1, [eax]
|
|
movaps xmm2, xmm1
|
|
mulps xmm2, SIMD_SP_oneOverTwoPI
|
|
movhlps xmm3, xmm2
|
|
cvttss2si ecx, xmm2
|
|
cvtsi2ss xmm2, ecx
|
|
cvttss2si edx, xmm3
|
|
cvtsi2ss xmm3, edx
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
cvttss2si ecx, xmm2
|
|
cvtsi2ss xmm2, ecx
|
|
cvttss2si edx, xmm3
|
|
cvtsi2ss xmm3, edx
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 0, 1, 0 )
|
|
movaps xmm3, xmm1
|
|
cmpltps xmm3, SIMD_SP_zero
|
|
andps xmm3, SIMD_SP_one
|
|
subps xmm2, xmm3
|
|
mulps xmm2, SIMD_SP_twoPI
|
|
subps xmm1, xmm2
|
|
|
|
movaps xmm0, SIMD_SP_PI // xmm0 = PI
|
|
subps xmm0, xmm1 // xmm0 = PI - a
|
|
movaps xmm1, xmm0 // xmm1 = PI - a
|
|
andps xmm1, SIMD_SP_signBitMask // xmm1 = signbit( PI - a )
|
|
movaps xmm2, xmm0 // xmm2 = PI - a
|
|
xorps xmm2, xmm1 // xmm2 = fabs( PI - a )
|
|
cmpnltps xmm2, SIMD_SP_halfPI // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? 0xFFFFFFFF : 0x00000000
|
|
movaps xmm3, SIMD_SP_PI // xmm3 = PI
|
|
xorps xmm3, xmm1 // xmm3 = PI ^ signbit( PI - a )
|
|
andps xmm3, xmm2 // xmm3 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? ( PI ^ signbit( PI - a ) ) : 0.0f
|
|
andps xmm2, SIMD_SP_signBitMask // xmm2 = ( fabs( PI - a ) >= idMath::HALF_PI ) ? SIMD_SP_signBitMask : 0.0f
|
|
xorps xmm0, xmm2
|
|
addps xmm0, xmm3
|
|
|
|
movaps xmm0, [eax]
|
|
movaps xmm1, xmm0
|
|
mulps xmm1, xmm1
|
|
movaps xmm3, SIMD_SP_sin_c0
|
|
movaps xmm4, SIMD_SP_cos_c0
|
|
mulps xmm3, xmm1
|
|
mulps xmm4, xmm1
|
|
addps xmm3, SIMD_SP_sin_c1
|
|
addps xmm4, SIMD_SP_cos_c1
|
|
mulps xmm3, xmm1
|
|
mulps xmm4, xmm1
|
|
addps xmm3, SIMD_SP_sin_c2
|
|
addps xmm4, SIMD_SP_cos_c2
|
|
mulps xmm3, xmm1
|
|
mulps xmm4, xmm1
|
|
addps xmm3, SIMD_SP_sin_c3
|
|
addps xmm4, SIMD_SP_cos_c3
|
|
mulps xmm3, xmm1
|
|
mulps xmm4, xmm1
|
|
addps xmm3, SIMD_SP_sin_c4
|
|
addps xmm4, SIMD_SP_cos_c4
|
|
mulps xmm3, xmm1
|
|
mulps xmm4, xmm1
|
|
addps xmm3, SIMD_SP_one
|
|
addps xmm4, SIMD_SP_one
|
|
mulps xmm3, xmm0
|
|
xorps xmm2, SIMD_SP_signBitMask
|
|
xorps xmm4, xmm2
|
|
movaps [edi], xmm3
|
|
movaps [esi], xmm4
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_ATanPositive
|
|
|
|
Both 'x' and 'y' must be positive.
|
|
============
|
|
*/
|
|
float SSE_ATanPositive( float y, float x ) {
|
|
#if 1
|
|
|
|
float t;
|
|
|
|
assert( y >= 0.0f && x >= 0.0f );
|
|
|
|
__asm {
|
|
movss xmm0, x
|
|
movss xmm3, xmm0
|
|
movss xmm1, y
|
|
minss xmm0, xmm1
|
|
maxss xmm1, xmm3
|
|
cmpeqss xmm3, xmm0
|
|
rcpss xmm2, xmm1
|
|
mulss xmm1, xmm2
|
|
mulss xmm1, xmm2
|
|
addss xmm2, xmm2
|
|
subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
|
|
mulss xmm0, xmm2 // xmm0 = x / y or y / x
|
|
movss xmm1, xmm3
|
|
andps xmm1, SIMD_SP_signBitMask
|
|
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
|
|
andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
|
|
movss xmm1, xmm0
|
|
mulss xmm1, xmm1 // xmm1 = s
|
|
movss xmm2, SIMD_SP_atan_c0
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c1
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c2
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c3
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c4
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c5
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c6
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c7
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_one
|
|
mulss xmm2, xmm0
|
|
addss xmm2, xmm3
|
|
movss t, xmm2
|
|
}
|
|
|
|
return t;
|
|
|
|
#else
|
|
|
|
float a, d, s, t;
|
|
|
|
assert( y >= 0.0f && x >= 0.0f );
|
|
|
|
if ( y > x ) {
|
|
a = -x / y;
|
|
d = idMath::HALF_PI;
|
|
} else {
|
|
a = y / x;
|
|
d = 0.0f;
|
|
}
|
|
s = a * a;
|
|
t = 0.0028662257f;
|
|
t *= s;
|
|
t += -0.0161657367f;
|
|
t *= s;
|
|
t += 0.0429096138f;
|
|
t *= s;
|
|
t += -0.0752896400f;
|
|
t *= s;
|
|
t += 0.1065626393f;
|
|
t *= s;
|
|
t += -0.1420889944f;
|
|
t *= s;
|
|
t += 0.1999355085f;
|
|
t *= s;
|
|
t += -0.3333314528f;
|
|
t *= s;
|
|
t += 1.0f;
|
|
t *= a;
|
|
t += d;
|
|
|
|
return t;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_ATan4Positive
|
|
|
|
Both 'x' and 'y' must be positive.
|
|
============
|
|
*/
|
|
void SSE_ATan4Positive( float y[4], float x[4], float at[4] ) {
|
|
__asm {
|
|
mov esi, x
|
|
mov edi, y
|
|
mov edx, at
|
|
movaps xmm0, [esi]
|
|
movaps xmm3, xmm0
|
|
movaps xmm1, [edi]
|
|
minps xmm0, xmm1
|
|
maxps xmm1, xmm3
|
|
cmpeqps xmm3, xmm0
|
|
rcpps xmm2, xmm1
|
|
mulps xmm1, xmm2
|
|
mulps xmm1, xmm2
|
|
addps xmm2, xmm2
|
|
subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
|
|
mulps xmm0, xmm2 // xmm0 = x / y or y / x
|
|
movaps xmm1, xmm3
|
|
andps xmm1, SIMD_SP_signBitMask
|
|
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
|
|
andps xmm3, SIMD_SP_halfPI // xmm3 = HALF_PI or 0.0f
|
|
movaps xmm1, xmm0
|
|
mulps xmm1, xmm1 // xmm1 = s
|
|
movaps xmm2, SIMD_SP_atan_c0
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c1
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c2
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c3
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c4
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c5
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c6
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c7
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_one
|
|
mulps xmm2, xmm0
|
|
addps xmm2, xmm3
|
|
movaps [edx], xmm2
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_ATan
|
|
============
|
|
*/
|
|
float SSE_ATan( float y, float x ) {
|
|
#if 1
|
|
|
|
float t;
|
|
|
|
__asm {
|
|
movss xmm0, x
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm0
|
|
andps xmm0, SIMD_SP_absMask
|
|
movss xmm1, y
|
|
xorps xmm4, xmm1
|
|
andps xmm1, SIMD_SP_absMask
|
|
andps xmm4, SIMD_SP_signBitMask
|
|
minss xmm0, xmm1
|
|
maxss xmm1, xmm3
|
|
cmpeqss xmm3, xmm0
|
|
rcpss xmm2, xmm1
|
|
mulss xmm1, xmm2
|
|
mulss xmm1, xmm2
|
|
addss xmm2, xmm2
|
|
subss xmm2, xmm1 // xmm2 = 1 / y or 1 / x
|
|
mulss xmm0, xmm2 // xmm0 = x / y or y / x
|
|
xorps xmm0, xmm4
|
|
movss xmm1, xmm3
|
|
andps xmm1, SIMD_SP_signBitMask
|
|
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
|
|
orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
|
|
andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
|
|
movss xmm1, xmm0
|
|
mulss xmm1, xmm1 // xmm1 = s
|
|
movss xmm2, SIMD_SP_atan_c0
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c1
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c2
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c3
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c4
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c5
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c6
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_atan_c7
|
|
mulss xmm2, xmm1
|
|
addss xmm2, SIMD_SP_one
|
|
mulss xmm2, xmm0
|
|
addss xmm2, xmm3
|
|
movss t, xmm2
|
|
}
|
|
|
|
return t;
|
|
|
|
#else
|
|
|
|
float a, d, s, t;
|
|
|
|
if ( fabs( y ) > fabs( x ) ) {
|
|
a = -x / y;
|
|
d = idMath::HALF_PI;
|
|
*((unsigned int *)&d) ^= ( *((unsigned int *)&x) ^ *((unsigned int *)&y) ) & (1<<31);
|
|
} else {
|
|
a = y / x;
|
|
d = 0.0f;
|
|
}
|
|
|
|
s = a * a;
|
|
t = 0.0028662257f;
|
|
t *= s;
|
|
t += -0.0161657367f;
|
|
t *= s;
|
|
t += 0.0429096138f;
|
|
t *= s;
|
|
t += -0.0752896400f;
|
|
t *= s;
|
|
t += 0.1065626393f;
|
|
t *= s;
|
|
t += -0.1420889944f;
|
|
t *= s;
|
|
t += 0.1999355085f;
|
|
t *= s;
|
|
t += -0.3333314528f;
|
|
t *= s;
|
|
t += 1.0f;
|
|
t *= a;
|
|
t += d;
|
|
|
|
return t;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_ATan4
|
|
============
|
|
*/
|
|
void SSE_ATan4( float y[4], float x[4], float at[4] ) {
|
|
__asm {
|
|
mov esi, x
|
|
mov edi, y
|
|
mov edx, at
|
|
movaps xmm0, [esi]
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm0
|
|
andps xmm0, SIMD_SP_absMask
|
|
movaps xmm1, [edi]
|
|
xorps xmm4, xmm1
|
|
andps xmm1, SIMD_SP_absMask
|
|
andps xmm4, SIMD_SP_signBitMask
|
|
minps xmm0, xmm1
|
|
maxps xmm1, xmm3
|
|
cmpeqps xmm3, xmm0
|
|
rcpps xmm2, xmm1
|
|
mulps xmm1, xmm2
|
|
mulps xmm1, xmm2
|
|
addps xmm2, xmm2
|
|
subps xmm2, xmm1 // xmm2 = 1 / y or 1 / x
|
|
mulps xmm0, xmm2 // xmm0 = x / y or y / x
|
|
xorps xmm0, xmm4
|
|
movaps xmm1, xmm3
|
|
andps xmm1, SIMD_SP_signBitMask
|
|
xorps xmm0, xmm1 // xmm0 = -x / y or y / x
|
|
orps xmm4, SIMD_SP_halfPI // xmm4 = +/- HALF_PI
|
|
andps xmm3, xmm4 // xmm3 = +/- HALF_PI or 0.0f
|
|
movaps xmm1, xmm0
|
|
mulps xmm1, xmm1 // xmm1 = s
|
|
movaps xmm2, SIMD_SP_atan_c0
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c1
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c2
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c3
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c4
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c5
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c6
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_atan_c7
|
|
mulps xmm2, xmm1
|
|
addps xmm2, SIMD_SP_one
|
|
mulps xmm2, xmm0
|
|
addps xmm2, xmm3
|
|
movaps [edx], xmm2
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_TestTrigonometry
|
|
============
|
|
*/
|
|
void SSE_TestTrigonometry( void ) {
|
|
int i;
|
|
float a, s1, s2, c1, c2;
|
|
|
|
for ( i = 0; i < 100; i++ ) {
|
|
a = i * idMath::HALF_PI / 100.0f;
|
|
|
|
s1 = sin( a );
|
|
s2 = SSE_SinZeroHalfPI( a );
|
|
|
|
if ( fabs( s1 - s2 ) > 1e-7f ) {
|
|
assert( 0 );
|
|
}
|
|
|
|
c1 = cos( a );
|
|
c2 = SSE_CosZeroHalfPI( a );
|
|
|
|
if ( fabs( c1 - c2 ) > 1e-7f ) {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
for ( i = -200; i < 200; i++ ) {
|
|
a = i * idMath::TWO_PI / 100.0f;
|
|
|
|
s1 = sin( a );
|
|
s2 = SSE_Sin( a );
|
|
|
|
if ( fabs( s1 - s2 ) > 1e-6f ) {
|
|
assert( 0 );
|
|
}
|
|
|
|
c1 = cos( a );
|
|
c2 = SSE_Cos( a );
|
|
|
|
if ( fabs( c1 - c2 ) > 1e-6f ) {
|
|
assert( 0 );
|
|
}
|
|
|
|
SSE_SinCos( a, s2, c2 );
|
|
if ( fabs( s1 - s2 ) > 1e-6f || fabs( c1 - c2 ) > 1e-6f ) {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::GetName

  Returns the fixed string "MMX & SSE" that names this SIMD implementation.
  The returned pointer refers to a string literal and must not be freed.
============
*/
|
|
const char * idSIMD_SSE::GetName( void ) const {
|
|
return "MMX & SSE";
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::Add

  dst[i] = constant + src[i];

  The body is the KFLOAT_CA macro instantiated with the 'add' mnemonic stem
  (addps in the main loop). The macro declares the locals 'pre' and 'post'
  and emits inline assembly that references the parameters by name.
============
*/
|
|
void VPCALL idSIMD_SSE::Add( float *dst, const float constant, const float *src, const int count ) {
|
|
KFLOAT_CA( add, dst, src, constant, count )
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::Add

  dst[i] = src0[i] + src1[i];

  The body is the KFLOAT_AA macro instantiated with the 'add' mnemonic stem
  (addps in the main loop). The macro declares the locals 'pre' and 'post'
  and emits inline assembly that references the parameters by name.
============
*/
|
|
void VPCALL idSIMD_SSE::Add( float *dst, const float *src0, const float *src1, const int count ) {
|
|
KFLOAT_AA( add, dst, src0, src1, count )
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::Sub

  dst[i] = constant - src[i];

  The body is the KFLOAT_CA macro instantiated with the 'sub' mnemonic stem
  (subps in the main loop). The broadcast constant is the left operand, so
  the source array is subtracted from it.
============
*/
|
|
void VPCALL idSIMD_SSE::Sub( float *dst, const float constant, const float *src, const int count ) {
|
|
KFLOAT_CA( sub, dst, src, constant, count )
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::Sub

  dst[i] = src0[i] - src1[i];

  The body is the KFLOAT_AA macro instantiated with the 'sub' mnemonic stem
  (subps in the main loop); src0 is the left operand and src1 the right one.
============
*/
|
|
void VPCALL idSIMD_SSE::Sub( float *dst, const float *src0, const float *src1, const int count ) {
|
|
KFLOAT_AA( sub, dst, src0, src1, count )
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::Mul

  dst[i] = constant * src[i];

  The body is the KFLOAT_CA macro instantiated with the 'mul' mnemonic stem
  (mulps in the main loop). The macro declares the locals 'pre' and 'post'
  and emits inline assembly that references the parameters by name.
============
*/
|
|
void VPCALL idSIMD_SSE::Mul( float *dst, const float constant, const float *src, const int count ) {
|
|
KFLOAT_CA( mul, dst, src, constant, count )
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::Mul

  dst[i] = src0[i] * src1[i];

  The body is the KFLOAT_AA macro instantiated with the 'mul' mnemonic stem
  (mulps in the main loop). The macro declares the locals 'pre' and 'post'
  and emits inline assembly that references the parameters by name.
============
*/
|
|
void VPCALL idSIMD_SSE::Mul( float *dst, const float *src0, const float *src1, const int count ) {
|
|
KFLOAT_AA( mul, dst, src0, src1, count )
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Div
|
|
|
|
dst[i] = constant / src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Div( float *dst, const float constant, const float *src, const int count ) {
|
|
int pre, post;
|
|
|
|
// 1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
|
|
__asm
|
|
{
|
|
movss xmm1,constant
|
|
shufps xmm1,xmm1,0
|
|
|
|
KFLOATINITDS( dst, src, count, pre, post )
|
|
and eax,15
|
|
jne lpNA
|
|
jmp lpA
|
|
align 16
|
|
lpA:
|
|
movaps xmm2,[edx+ebx]
|
|
movaps xmm3,[edx+ebx+16]
|
|
rcpps xmm4,xmm2
|
|
rcpps xmm5,xmm3
|
|
prefetchnta [edx+ebx+64]
|
|
mulps xmm2,xmm4
|
|
mulps xmm2,xmm4
|
|
mulps xmm3,xmm5
|
|
mulps xmm3,xmm5
|
|
addps xmm4,xmm4
|
|
addps xmm5,xmm5
|
|
subps xmm4,xmm2
|
|
subps xmm5,xmm3
|
|
mulps xmm4,xmm1
|
|
mulps xmm5,xmm1
|
|
movaps [edi+ebx],xmm4
|
|
movaps [edi+ebx+16],xmm5
|
|
add ebx,16*2
|
|
jl lpA
|
|
jmp done
|
|
align 16
|
|
lpNA:
|
|
movups xmm2,[edx+ebx]
|
|
movups xmm3,[edx+ebx+16]
|
|
rcpps xmm4,xmm2
|
|
rcpps xmm5,xmm3
|
|
prefetchnta [edx+ebx+64]
|
|
mulps xmm2,xmm4
|
|
mulps xmm2,xmm4
|
|
mulps xmm3,xmm5
|
|
mulps xmm3,xmm5
|
|
addps xmm4,xmm4
|
|
addps xmm5,xmm5
|
|
subps xmm4,xmm2
|
|
subps xmm5,xmm3
|
|
mulps xmm4,xmm1
|
|
mulps xmm5,xmm1
|
|
movaps [edi+ebx],xmm4
|
|
movaps [edi+ebx+16],xmm5
|
|
add ebx,16*2
|
|
jl lpNA
|
|
done:
|
|
mov edx,src
|
|
mov edi,dst
|
|
KFLOATOPER( KDIVDSS1( [edi+ebx],xmm1,[edx+ebx] ),
|
|
KDIVDSS4( [edi+ebx],xmm1,[edx+ebx] ), count )
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Div
|
|
|
|
dst[i] = src0[i] / src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Div( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] = src0[i] / src1[i]
	//
	// Computed as src0[i] * (1 / src1[i]); the reciprocal is the rcpps estimate r
	// refined with one Newton-Raphson step:
	//
	//   1 / x = 2 * rcpps(x) - (x * rcpps(x) * rcpps(x));
	//
	// In the loops the divisor is read through esi and the dividend through edx.
	// NOTE(review): esi/edx/edi/ebx/eax, pre and post are initialised by KFLOATINITDSS and
	// the leftover elements are processed by KFLOATOPER; both macros live outside this
	// function - confirm their register contract there.
	int pre,post;

	__asm
	{
		KFLOATINITDSS( dst, src0, src1, count, pre, post )
		and			eax,15							// non-zero -> use unaligned loads
		jne			lpNA
		jmp			lpA

		// ---- aligned sources ----
		align		16
	lpA:
		movaps		xmm2,[esi+ebx]					// divisors
		movaps		xmm3,[esi+ebx+16]
		rcpps		xmm4,xmm2						// r
		rcpps		xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps		xmm2,xmm4
		mulps		xmm2,xmm4						// x * r * r
		mulps		xmm3,xmm5
		mulps		xmm3,xmm5
		addps		xmm4,xmm4						// 2 * r
		addps		xmm5,xmm5
		subps		xmm4,xmm2						// refined reciprocal
		subps		xmm5,xmm3
		mulps		xmm4,[edx+ebx]					// * dividend
		mulps		xmm5,[edx+ebx+16]
		movaps		[edi+ebx],xmm4
		movaps		[edi+ebx+16],xmm5
		add			ebx,16*2
		jl			lpA
		jmp			done

		// ---- unaligned sources (stores still use movaps) ----
		align		16
	lpNA:
		movups		xmm2,[esi+ebx]
		movups		xmm3,[esi+ebx+16]
		rcpps		xmm4,xmm2
		rcpps		xmm5,xmm3
		prefetchnta	[esi+ebx+64]
		mulps		xmm2,xmm4
		mulps		xmm2,xmm4
		mulps		xmm3,xmm5
		mulps		xmm3,xmm5
		addps		xmm4,xmm4
		addps		xmm5,xmm5
		subps		xmm4,xmm2
		subps		xmm5,xmm3
		movups		xmm2,[edx+ebx]					// dividends, loaded unaligned
		movups		xmm3,[edx+ebx+16]
		mulps		xmm4,xmm2
		mulps		xmm5,xmm3
		movaps		[edi+ebx],xmm4
		movaps		[edi+ebx+16],xmm5
		add			ebx,16*2
		jl			lpNA

	done:
		// elements not covered by the 8-wide loops
		mov			edx,src0
		mov			esi,src1
		mov			edi,dst
		KFLOATOPER( KDIVDSS1( [edi+ebx],[edx+ebx],[esi+ebx] ),
					KDIVDSS4( [edi+ebx],[edx+ebx],[esi+ebx] ), count )
	}
}
|
|
/*
|
|
============
|
|
Simd_MulAdd
|
|
|
|
assumes count >= 7
|
|
============
|
|
*/
|
|
static void Simd_MulAdd( float *dst, const float constant, const float *src, const int count ) {
|
|
__asm mov esi, dst
|
|
__asm mov edi, src
|
|
__asm mov eax, count
|
|
__asm shl eax, 2
|
|
__asm mov ecx, esi
|
|
__asm mov edx, eax
|
|
__asm or ecx, edi
|
|
__asm fld constant
|
|
__asm and ecx, 15
|
|
__asm jz SimdMulAdd16
|
|
__asm and ecx, 3
|
|
__asm jnz SimdMulAdd8
|
|
__asm mov ecx, esi
|
|
__asm xor ecx, edi
|
|
__asm and ecx, 15
|
|
__asm jnz MulAdd8
|
|
__asm mov ecx, esi
|
|
__asm and ecx, 15
|
|
__asm neg ecx
|
|
__asm add ecx, 16
|
|
__asm sub eax, ecx
|
|
__asm add edi, ecx
|
|
__asm add esi, ecx
|
|
__asm neg ecx
|
|
__asm mov edx, eax
|
|
__asm loopPreMulAdd16:
|
|
__asm fld st
|
|
__asm fmul dword ptr [edi+ecx]
|
|
__asm fadd dword ptr [esi+ecx]
|
|
__asm fstp dword ptr [esi+ecx]
|
|
__asm add ecx, 4
|
|
__asm jl loopPreMulAdd16
|
|
__asm SimdMulAdd16:
|
|
__asm and eax, ~15
|
|
__asm movss xmm1, constant
|
|
__asm shufps xmm1, xmm1, 0x00
|
|
__asm add esi, eax
|
|
__asm add edi, eax
|
|
__asm neg eax
|
|
__asm align 16
|
|
__asm loopMulAdd16:
|
|
__asm movaps xmm0, [edi+eax]
|
|
__asm mulps xmm0, xmm1
|
|
__asm addps xmm0, [esi+eax]
|
|
__asm movaps [esi+eax], xmm0
|
|
__asm add eax, 16
|
|
__asm jl loopMulAdd16
|
|
__asm jmp postMulAdd
|
|
__asm MulAdd8:
|
|
__asm mov ecx, esi
|
|
__asm and ecx, 7
|
|
__asm jz SimdMulAdd8
|
|
__asm sub eax, ecx
|
|
__asm add esi, ecx
|
|
__asm add edi, ecx
|
|
__asm neg ecx
|
|
__asm mov edx, eax
|
|
__asm loopPreMulAdd8:
|
|
__asm fld st
|
|
__asm fmul dword ptr [edi+ecx]
|
|
__asm fadd dword ptr [esi+ecx]
|
|
__asm fstp dword ptr [esi+ecx]
|
|
__asm add ecx, 4
|
|
__asm jl loopPreMulAdd8
|
|
__asm SimdMulAdd8:
|
|
__asm and eax, ~15
|
|
__asm movss xmm1, constant
|
|
__asm shufps xmm1, xmm1, 0x00
|
|
__asm add esi, eax
|
|
__asm add edi, eax
|
|
__asm neg eax
|
|
__asm align 16
|
|
__asm loopMulAdd8:
|
|
__asm movlps xmm0, [edi+eax]
|
|
__asm movhps xmm0, [edi+eax+8]
|
|
__asm mulps xmm0, xmm1
|
|
__asm movlps xmm2, [esi+eax]
|
|
__asm movhps xmm2, [esi+eax+8]
|
|
__asm addps xmm0, xmm2
|
|
__asm movlps [esi+eax], xmm0
|
|
__asm movhps [esi+eax+8], xmm0
|
|
__asm add eax, 16
|
|
__asm jl loopMulAdd8
|
|
__asm jmp postMulAdd
|
|
__asm postMulAdd:
|
|
__asm and edx, 15
|
|
__asm jz MulAddDone
|
|
__asm add esi, edx
|
|
__asm add edi, edx
|
|
__asm neg edx
|
|
__asm loopPostMulAdd:
|
|
__asm fld st
|
|
__asm fmul dword ptr [edi+edx]
|
|
__asm fadd dword ptr [esi+edx]
|
|
__asm fstp dword ptr [esi+edx]
|
|
__asm add edx, 4
|
|
__asm jl loopPostMulAdd
|
|
__asm MulAddDone:
|
|
__asm fstp st
|
|
}
|
|
|
|
// Small-count fast path shared by MulAdd / MulSub.
// Expects the locals 'dst', 'src', 'c' and 'count' to be in scope at the point of use.
// For 0 <= count <= 11 it applies "dst[i] OPER c * src[i]" to every element in ascending
// order and returns from the enclosing function; for any other count it does nothing and
// execution continues after the macro.
#define MULADD_FEW( OPER )																\
	if ( count >= 0 && count <= 11 ) {													\
		for ( int fewIndex = 0; fewIndex < count; fewIndex++ ) {						\
			dst[fewIndex] OPER c * src[fewIndex];										\
		}																				\
		return;																			\
	}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MulAdd
|
|
|
|
dst[i] += constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] += constant * src[i]
	// MULADD_FEW reads the local 'c' and returns early for counts 0..11;
	// every other count is handed to the SSE routine.
	const float c = constant;
	MULADD_FEW( += )
	Simd_MulAdd( dst, constant, src, count );
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MulAdd
|
|
|
|
dst[i] += src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] += src0[i] * src1[i]
	// Plain scalar loop (no SSE path for this overload). The previous body added
	// src0[i] + src1[i], which contradicted the documented multiply-accumulate contract.
	for ( int i = 0; i < count; i++ ) {
		dst[i] += src0[i] * src1[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MulSub
|
|
|
|
dst[i] -= constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MulSub( float *dst, const float constant, const float *src, const int count ) {
	// dst[i] -= constant * src[i]
	// MULADD_FEW reads the local 'c' and returns early for counts 0..11; every other
	// count reuses the multiply-add routine with the constant negated.
	const float c = constant;
	MULADD_FEW( -= )
	Simd_MulAdd( dst, -constant, src, count );
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MulSub
|
|
|
|
dst[i] -= src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
	// dst[i] -= src0[i] * src1[i]
	// Plain scalar loop (no SSE path for this overload). The previous body subtracted
	// src0[i] + src1[i], which contradicted the documented multiply-subtract contract.
	for ( int i = 0; i < count; i++ ) {
		dst[i] -= src0[i] * src1[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	// dst[i] = constant * src[i]  (3-component dot product, 12-byte source stride)
	//
	// Registers: esi = src, ecx = dst, eax = negative byte offset into src,
	// edx = count (later count & 3), xmm4/xmm5/xmm6 = constant x/y/z broadcast.
	__asm
	{
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3							// vectors handled four at a time; sets ZF for the jz below

		// the SSE moves/shuffles do not modify EFLAGS
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			done4							// fewer than four vectors
		imul		eax, 12
		add			esi, eax						// esi = end of the 4-wide region
		neg			eax

	loop4:
		// load 4 vectors (12 floats) as six 64-bit halves
		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm2, [esi+eax+ 8]
		movlps		xmm3, [esi+eax+16]
		movhps		xmm1, [esi+eax+24]
		movhps		xmm2, [esi+eax+32]
		movhps		xmm3, [esi+eax+40]
		// regroup into one register per component
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
		add			ecx, 16
		add			eax, 4*12						// flags consumed by jl at the bottom
		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )	// reorder results before storing
		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loop4

	done4:
		and			edx, 3							// remaining vectors, one at a time
		jz			done1

	loop1:
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant * src[i].Normal() + src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	// dst[i] = constant * src[i].Normal() + src[i][3]  (16-byte source stride)
	//
	// Registers: esi = src, ecx = dst, eax = negative byte offset into src,
	// edx = count (later count & 3), xmm5/xmm6/xmm7 = constant x/y/z broadcast.
	// xmm4 is used as a load temporary, hence the constants start at xmm5.
	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3							// planes handled four at a time; sets ZF for the jz below

		// the SSE moves/shuffles do not modify EFLAGS
		movss		xmm5, [edi+0]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+4]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+8]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1						// fewer than four planes
		imul		eax, 16
		add			esi, eax
		neg			eax

	loopVert4:
		// first two floats of each plane go to xmm1/xmm2, last two to xmm3/xmm4
		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm3, [esi+eax+ 8]
		movhps		xmm1, [esi+eax+16]
		movhps		xmm3, [esi+eax+24]
		movlps		xmm2, [esi+eax+32]
		movlps		xmm4, [esi+eax+40]
		movhps		xmm2, [esi+eax+48]
		movhps		xmm4, [esi+eax+56]
		// transpose into one register per plane component
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps		xmm1, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
		movaps		xmm2, xmm3
		shufps		xmm2, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
		shufps		xmm3, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )

		add			ecx, 16
		add			eax, 4*16						// flags consumed by jl at the bottom

		mulps		xmm0, xmm5
		mulps		xmm1, xmm6
		mulps		xmm2, xmm7
		addps		xmm0, xmm3						// + fourth plane component
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3							// remaining planes, one at a time
		jz			done

	loopVert1:
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm5
		mulss		xmm1, xmm6
		mulss		xmm2, xmm7
		addss		xmm0, [esi+eax+12]
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 16
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant * src[i].xyz;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
	// dst[i] = constant * src[i].xyz
	//
	// The hard-coded DRAWVERT_* offsets used by the assembly must match the real layout.
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// Float numbering used in the lane comments below (four vertices, xyz each):
	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	// Registers: esi = src, ecx = dst, eax = negative byte offset into src,
	// edx = count (later count & 3), xmm4/xmm5/xmm6 = constant x/y/z broadcast.
	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3							// vertices handled four at a time; sets ZF for the jz below

		// the SSE moves/shuffles do not modify EFLAGS
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1						// fewer than four vertices
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax

	loopVert4:
		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
		movaps		xmm1, xmm0												//  3,  X,  0,  1

		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5

		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9

		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10

		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11

		add			ecx, 16
		add			eax, 4*DRAWVERT_SIZE			// flags consumed by jl at the bottom

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3							// remaining vertices, one at a time
		jz			done

	loopVert1:
		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, DRAWVERT_SIZE
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i] + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
	// dst[i] = constant.Normal() * src[i] + constant[3]  (12-byte source stride)
	//
	// Registers: esi = src, ecx = dst, eax = negative byte offset into src,
	// edx = count (later count & 3), xmm4..xmm7 = the four plane components broadcast.
	__asm
	{
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3							// vectors handled four at a time; sets ZF for the jz below

		// the SSE moves/shuffles do not modify EFLAGS
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			done4							// fewer than four vectors
		imul		eax, 12
		add			esi, eax
		neg			eax

	loop4:
		// load 4 vectors (12 floats) as six 64-bit halves
		movlps		xmm1, [esi+eax+ 0]
		movlps		xmm2, [esi+eax+ 8]
		movlps		xmm3, [esi+eax+16]
		movhps		xmm1, [esi+eax+24]
		movhps		xmm2, [esi+eax+32]
		movhps		xmm3, [esi+eax+40]
		// regroup into one register per component
		movaps		xmm0, xmm1
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )
		shufps		xmm1, xmm3, R_SHUFFLEPS( 1, 3, 0, 2 )
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )

		add			ecx, 16
		add			eax, 4*12						// flags consumed by jl at the bottom

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm7						// + plane distance term
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 2, 1, 3 )	// reorder results before storing

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loop4

	done4:
		and			edx, 3							// remaining vectors, one at a time
		jz			done1

	loop1:
		movss		xmm0, [esi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm2, [esi+eax+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		addss		xmm0, xmm7
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
	// dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3]
	//
	// xmm4 holds the first two constant components duplicated into both halves and
	// xmm5 the last two, so one multiply handles the matching halves of two planes.
	// The helper macros below process one or two planes starting at [SRC], write the
	// result(s) to [DEST] and advance both registers.

// one plane: four products, then a horizontal add of the two partial sums
#define SINGLE_OP(SRC, DEST)							\
	__asm	movlps		xmm0,[SRC]						\
	__asm	movlps		xmm1,[SRC+8]					\
	__asm	mulps		xmm0,xmm4						\
	__asm	mulps		xmm1,xmm5						\
	__asm	addps		xmm0,xmm1						\
	__asm	movaps		xmm1,xmm0						\
	__asm	shufps		xmm1,xmm1,SHUFFLEPS(1,1,1,1)	\
	__asm	addss		xmm0,xmm1						\
	__asm	movss		[DEST],xmm0						\
	__asm	add			SRC,16							\
	__asm	add			DEST,4

// two planes at once; both results end up in the upper half of xmm0
#define DUAL_OP(SRC, DEST)								\
	__asm	movlps		xmm0,[SRC]						\
	__asm	movlps		xmm1,[SRC+8]					\
	__asm	movhps		xmm0,[SRC+16]					\
	__asm	movhps		xmm1,[SRC+24]					\
	__asm	mulps		xmm0,xmm4						\
	__asm	mulps		xmm1,xmm5						\
	__asm	addps		xmm0,xmm1						\
	__asm	shufps		xmm1,xmm0,SHUFFLEPS(2,0,1,0)	\
	__asm	shufps		xmm0,xmm0,SHUFFLEPS(3,1,2,0)	\
	__asm	addps		xmm0,xmm1						\
	__asm	movhps		[DEST],xmm0						\
	__asm	add			SRC,32							\
	__asm	add			DEST,8

	__asm {
		mov			edx, dst
		mov			eax, src
		mov			ebx, constant
		mov			ecx, count

		// Nothing to do for an empty (or negative) count. Without this guard an
		// unaligned dst would still run SINGLE_OP in the alignment loop below,
		// touching memory outside the arrays and driving the counter negative.
		test		ecx, ecx
		jle			_vpExit

		movlps		xmm4, [ebx]
		shufps		xmm4, xmm4, SHUFFLEPS(1,0,1,0)
		movlps		xmm5, [ebx+8]
		shufps		xmm5, xmm5, SHUFFLEPS(1,0,1,0)

		xorps		xmm0, xmm0
		xorps		xmm1, xmm1

		// process single planes until dst is 16-byte aligned
	_lpAlignDest:
		test		edx, 0x0f
		jz			_destAligned
		SINGLE_OP(eax,edx)
		dec			ecx
		jnz			_lpAlignDest
		jmp			_vpExit

	_destAligned:
		push		ecx								// remaining count, restored at _post

		cmp			ecx, 4
		jl			_post

		and			ecx, ~3
		shl			ecx, 2							// bytes of dst covered by the 4-wide loop
		lea			eax, [eax+ecx*4]				// src advances 16 bytes per plane
		add			edx, ecx
		neg			ecx

		// preload the first halves of the first four planes
		movlps		xmm0, [eax+ecx*4]
		movhps		xmm0, [eax+ecx*4+16]
		movlps		xmm2, [eax+ecx*4+32]
		movhps		xmm2, [eax+ecx*4+48]
		jmp			_lpStart

		align	16
	_lp:
		// finish and store the previous group while loading the next one
		prefetchnta	[eax+ecx*4+128]
		addps		xmm1, xmm0
		movlps		xmm0, [eax+ecx*4]
		movhps		xmm0, [eax+ecx*4+16]
		movlps		xmm2, [eax+ecx*4+32]
		movhps		xmm2, [eax+ecx*4+48]
		movaps		[edx+ecx-16],xmm1
	_lpStart:
		movlps		xmm1, [eax+ecx*4+8]
		movhps		xmm1, [eax+ecx*4+24]
		movlps		xmm3, [eax+ecx*4+40]
		movhps		xmm3, [eax+ecx*4+56]
		add			ecx, 16							// sign flag consumed by js below; SSE ops leave EFLAGS alone
		mulps		xmm1, xmm5
		mulps		xmm2, xmm4
		mulps		xmm3, xmm5
		addps		xmm2, xmm3						// y3+w3 x3+z3 y2+w2 x2+z2
		mulps		xmm0, xmm4
		addps		xmm0, xmm1						// y1+w1 x1+z1 y0+w0 x0+z0
		movaps		xmm1, xmm0
		shufps		xmm0, xmm2, SHUFFLEPS(2,0,2,0)	// x3+z3 x2+z2 x1+z1 x0+z0
		shufps		xmm1, xmm2, SHUFFLEPS(3,1,3,1)	// y3+w3 y2+w2 y1+w1 y0+w0
		js			_lp
		addps		xmm1, xmm0
		movaps		[edx+ecx-16], xmm1
	_post:
		// up to three planes left: one pair, then one single
		pop			ecx
		and			ecx, 0x3
		cmp			ecx, 2
		jl			_post1
		DUAL_OP(eax,edx)
		sub			ecx, 2
	_post1:
		cmp			ecx, 1
		jne			_vpExit
		SINGLE_OP(eax,edx)
	_vpExit:
	}

#undef DUAL_OP
#undef SINGLE_OP

}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	// dst[i] = constant.Normal() * src[i].xyz + constant[3]
	//
	// The hard-coded DRAWVERT_* offsets used by the assembly must match the real layout.
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	// Float numbering used in the lane comments below (four vertices, xyz each):
	// 0,  1,  2
	// 3,  4,  5
	// 6,  7,  8
	// 9, 10, 11

	// Registers: esi = src, ecx = dst, eax = negative byte offset into src,
	// edx = count (later count & 3), xmm4..xmm7 = the four plane components broadcast.
	__asm {
		mov			eax, count
		mov			edi, constant
		mov			edx, eax
		mov			esi, src
		mov			ecx, dst
		and			eax, ~3							// vertices handled four at a time; sets ZF for the jz below

		// the SSE moves/shuffles do not modify EFLAGS
		movss		xmm4, [edi+0]
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm5, [edi+4]
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm6, [edi+8]
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		movss		xmm7, [edi+12]
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )

		jz			startVert1						// fewer than four vertices
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax

	loopVert4:
		movss		xmm0, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  X,  X
		movss		xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	//  2,  X,  X,  X
		movhps		xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  3,  X,  0,  1
		movaps		xmm1, xmm0												//  3,  X,  0,  1

		movlps		xmm1, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	//  4,  5,  0,  1
		shufps		xmm2, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )					//  2,  X,  4,  5

		movss		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  X,  X
		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	//  9,  X,  6,  7
		shufps		xmm0, xmm3, R_SHUFFLEPS( 2, 0, 2, 0 )					//  0,  3,  6,  9

		movlps		xmm3, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// 10, 11,  6,  7
		shufps		xmm1, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 )					//  1,  4,  7, 10

		movhps		xmm3, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// 10, 11,  8,  X
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 3, 2, 1 )					//  2,  5,  8, 11

		add			ecx, 16
		add			eax, 4*DRAWVERT_SIZE			// flags consumed by jl at the bottom

		mulps		xmm0, xmm4
		mulps		xmm1, xmm5
		mulps		xmm2, xmm6
		addps		xmm0, xmm7						// + plane distance term
		addps		xmm0, xmm1
		addps		xmm0, xmm2

		movlps		[ecx-16+0], xmm0
		movhps		[ecx-16+8], xmm0
		jl			loopVert4

	startVert1:
		and			edx, 3							// remaining vertices, one at a time
		jz			done

	loopVert1:
		movss		xmm0, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
		mulss		xmm0, xmm4
		mulss		xmm1, xmm5
		mulss		xmm2, xmm6
		addss		xmm0, xmm7
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, DRAWVERT_SIZE
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loopVert1

	done:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dst[i] = src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
	// dst[i] = src0[i] * src1[i]  (3-component dot product of matching vectors)
	//
	// Registers: edi = src0, esi = src1, ecx = dst, eax = negative byte offset shared
	// by both sources, edx = count (later count & 3).
	// The lane comments number the 12 floats of four consecutive vectors 0..11.
	__asm
	{
		mov			eax, count
		mov			edi, src0
		mov			edx, eax
		mov			esi, src1
		mov			ecx, dst
		and			eax, ~3							// vectors handled four at a time

		jz			done4							// fewer than four vectors
		imul		eax, 12
		add			edi, eax
		add			esi, eax
		neg			eax

	loop4:
		movlps		xmm0, [esi+eax]					// 0, 1, X, X
		movlps		xmm3, [edi+eax]					// 0, 1, X, X
		movlps		xmm1, [esi+eax+8]				// 2, 3, X, X
		movlps		xmm4, [edi+eax+8]				// 2, 3, X, X
		movhps		xmm0, [esi+eax+24]				// 0, 1, 6, 7
		movhps		xmm3, [edi+eax+24]				// 0, 1, 6, 7
		movhps		xmm1, [esi+eax+32]				// 2, 3, 8, 9
		movhps		xmm4, [edi+eax+32]				// 2, 3, 8, 9
		movlps		xmm2, [esi+eax+16]				// 4, 5, X, X
		movlps		xmm5, [edi+eax+16]				// 4, 5, X, X
		movhps		xmm2, [esi+eax+40]				// 4, 5, 10, 11
		movhps		xmm5, [edi+eax+40]				// 4, 5, 10, 11

		add			ecx, 16
		add			eax, 48							// flags consumed by jl at the bottom

		// multiply component-wise first, then gather the x, y and z products
		mulps		xmm0, xmm3
		mulps		xmm1, xmm4
		mulps		xmm2, xmm5
		movaps		xmm7, xmm0
		shufps		xmm7, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )	// 0, 6, 3, 9
		shufps		xmm0, xmm2, R_SHUFFLEPS( 1, 3, 0, 2 )	// 1, 7, 4, 10
		shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 2, 1, 3 )	// 2, 8, 5, 11
		addps		xmm7, xmm0
		addps		xmm7, xmm1
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 2, 1, 3 )	// reorder results before storing

		movlps		[ecx-16+0], xmm7
		movhps		[ecx-16+8], xmm7
		jl			loop4

	done4:
		and			edx, 3							// remaining vectors, one at a time
		jz			done1

	loop1:
		movss		xmm0, [esi+eax+0]
		movss		xmm3, [edi+eax+0]
		movss		xmm1, [esi+eax+4]
		movss		xmm4, [edi+eax+4]
		movss		xmm2, [esi+eax+8]
		movss		xmm5, [edi+eax+8]
		mulss		xmm0, xmm3
		mulss		xmm1, xmm4
		mulss		xmm2, xmm5
		add			ecx, 4
		addss		xmm0, xmm1
		add			eax, 12
		addss		xmm0, xmm2
		dec			edx
		movss		[ecx-4], xmm0
		jnz			loop1

	done1:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Dot
|
|
|
|
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Dot( float &dot, const float *src1, const float *src2, const int count ) {
	// dot = src1[0] * src2[0] + src1[1] * src2[1] + ... over 'count' floats.
	//
	// Counts 0..3 are computed directly in C++. Larger counts accumulate four partial
	// sums in xmm0 (aligned or unaligned loop depending on both pointers), fold in the
	// 1..3 leftover elements, and finally sum the four lanes horizontally.
	//
	// NOTE(review): the assembly is split over several __asm blocks with a C++ switch in
	// between; it relies on ecx, edx and xmm0 surviving from one block to the next.
	// Confirm the compiler does not use those registers for the switch.
	switch( count ) {
		case 0:
			dot = 0.0f;
			return;
		case 1:
			dot = src1[0] * src2[0];
			return;
		case 2:
			dot = src1[0] * src2[0] + src1[1] * src2[1];
			return;
		case 3:
			dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
			return;
		default:
			__asm {
				mov			ecx, src1
				mov			edx, src2
				mov			eax, ecx
				or			eax, edx
				and			eax, 15
				jz			alignedDot				// both pointers 16-byte aligned

				// unaligned
				mov			eax, count
				shr			eax, 2
				shl			eax, 4					// bytes covered by whole groups of four
				add			ecx, eax				// ecx/edx end up at the leftover elements
				add			edx, eax
				neg			eax
				movups		xmm0, [ecx+eax]			// first group initialises the accumulator
				movups		xmm1, [edx+eax]
				mulps		xmm0, xmm1
				add			eax, 16
				jz			doneDot
			loopUnalignedDot:
				movups		xmm1, [ecx+eax]
				movups		xmm2, [edx+eax]
				mulps		xmm1, xmm2
				addps		xmm0, xmm1
				add			eax, 16
				jl			loopUnalignedDot
				jmp			doneDot

				// aligned
			alignedDot:
				mov			eax, count
				shr			eax, 2
				shl			eax, 4
				add			ecx, eax
				add			edx, eax
				neg			eax
				movaps		xmm0, [ecx+eax]
				movaps		xmm1, [edx+eax]
				mulps		xmm0, xmm1
				add			eax, 16
				jz			doneDot
			loopAlignedDot:
				movaps		xmm1, [ecx+eax]
				movaps		xmm2, [edx+eax]
				mulps		xmm1, xmm2
				addps		xmm0, xmm1
				add			eax, 16
				jl			loopAlignedDot
			doneDot:
			}
			// fold in the 1..3 elements that did not fill a group of four
			switch( count & 3 ) {
				case 1:
					__asm {
						movss		xmm1, [ecx]
						movss		xmm2, [edx]
						mulss		xmm1, xmm2
						addss		xmm0, xmm1
					}
					break;
				case 2:
					__asm {
						// movlps only writes the low half, so clear both registers first;
						// stale Inf/NaN in the upper lanes of xmm1 would otherwise turn
						// "stale * 0" into NaN and poison the sum
						xorps		xmm1, xmm1
						xorps		xmm2, xmm2
						movlps		xmm1, [ecx]
						movlps		xmm2, [edx]
						mulps		xmm1, xmm2
						addps		xmm0, xmm1
					}
					break;
				case 3:
					__asm {
						movss		xmm1, [ecx]		// loading from memory clears the upper lanes
						movhps		xmm1, [ecx+4]
						movss		xmm2, [edx]
						movhps		xmm2, [edx+4]
						mulps		xmm1, xmm2
						addps		xmm0, xmm1
					}
					break;
			}
			// horizontal sum of the four partial sums, then store through the reference
			__asm {
				movhlps		xmm1, xmm0
				addps		xmm0, xmm1
				movaps		xmm1, xmm0
				shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
				addss		xmm0, xmm1
				mov			eax, dot
				movss		[eax], xmm0
			}
			return;
	}
}
|
|
|
|
//
|
|
// cmpeqps == Equal
|
|
// cmpneqps != Not Equal
|
|
// cmpltps < Less Than
|
|
// cmpnltps >= Not Less Than
|
|
// cmpnleps > Not Less Or Equal
|
|
//
|
|
// Values for the DOFLIP argument of the compare macros below. The macros paste the
// argument right after "movmskps eax, xmm0": FLIP inverts the comparison mask held
// in al, NOFLIP expands to nothing and leaves the mask as computed.
#define FLIP			not al
#define NOFLIP
|
|
|
|
/*
	COMPARECONSTANT expands to the complete body of a routine computing

		DST[i] = SRC0[i] CMP CONSTANT;		for i in [0, COUNT), one result byte per float

	CMP is the C++ operator used for the scalar elements, CMPSIMD the packed compare
	instruction used for groups of four floats and DOFLIP either FLIP or NOFLIP.

	Groups of four floats are compared with CMPSIMD; movmskps collects the four result
	bits in al, which are then spread into four 0/1 bytes: ah receives al >> 1, so after
	shifting a copy of ax left by 14 and reloading bx, bits 0, 8, 16 and 24 of ebx hold
	comparison results 0..3. Elements before the first 16-byte boundary ('pre') and after
	the last full group ('post') are handled by the scalar loops at the end.

	The expansion declares locals (i, cnt, pre, post, aligned, c) and labels (loopNA,
	loopA, doneCmp), so it can be used only once per function. All operands are taken
	from the macro parameters; pass plain variable names, because they are also used as
	inline assembly operands.
*/
#define COMPARECONSTANT( DST, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
	int i, cnt, pre, post; \
	float *aligned; \
	\
	/* if the float array is not aligned on a 4 byte boundary */ \
	if ( ((int) SRC0) & 3 ) { \
		/* unaligned memory access */ \
		pre = 0; \
		cnt = COUNT >> 2; \
		post = COUNT - (cnt<<2); \
		__asm	mov			edx, cnt \
		__asm	test		edx, edx \
		__asm	je			doneCmp \
		__asm	push		ebx \
		__asm	neg			edx \
		__asm	mov			esi, SRC0 \
		__asm	prefetchnta	[esi+64] \
		__asm	movss		xmm1, CONSTANT \
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
		__asm	mov			edi, DST \
		__asm	mov			ecx, 0x01010101 \
		__asm loopNA: \
		__asm	movups		xmm0, [esi] \
		__asm	prefetchnta	[esi+128] \
		__asm	CMPSIMD		xmm0, xmm1 \
		__asm	movmskps	eax, xmm0 \
		__asm	DOFLIP \
		__asm	mov			ah, al \
		__asm	shr			ah, 1 \
		__asm	mov			bx, ax \
		__asm	shl			ebx, 14 \
		__asm	mov			bx, ax \
		__asm	and			ebx, ecx \
		__asm	mov			dword ptr [edi], ebx \
		__asm	add			esi, 16 \
		__asm	add			edi, 4 \
		__asm	inc			edx \
		__asm	jl			loopNA \
		__asm	pop			ebx \
	} \
	else { \
		/* aligned memory access */ \
		aligned = (float *) ((((int) SRC0) + 15) & ~15); \
		/* conservative test (byte distance against element count): if it holds, */ \
		/* everything is left to the scalar loop */ \
		if ( (int)aligned > ((int)SRC0) + COUNT ) { \
			pre = COUNT; \
			post = 0; \
		} \
		else { \
			pre = aligned - SRC0; \
			cnt = (COUNT - pre) >> 2; \
			post = COUNT - pre - (cnt<<2); \
			__asm	mov			edx, cnt \
			__asm	test		edx, edx \
			__asm	je			doneCmp \
			__asm	push		ebx \
			__asm	neg			edx \
			__asm	mov			esi, aligned \
			__asm	prefetchnta	[esi+64] \
			__asm	movss		xmm1, CONSTANT \
			__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
			__asm	mov			edi, DST \
			__asm	add			edi, pre \
			__asm	mov			ecx, 0x01010101 \
			__asm loopA: \
			__asm	movaps		xmm0, [esi] \
			__asm	prefetchnta	[esi+128] \
			__asm	CMPSIMD		xmm0, xmm1 \
			__asm	movmskps	eax, xmm0 \
			__asm	DOFLIP \
			__asm	mov			ah, al \
			__asm	shr			ah, 1 \
			__asm	mov			bx, ax \
			__asm	shl			ebx, 14 \
			__asm	mov			bx, ax \
			__asm	and			ebx, ecx \
			__asm	mov			dword ptr [edi], ebx \
			__asm	add			esi, 16 \
			__asm	add			edi, 4 \
			__asm	inc			edx \
			__asm	jl			loopA \
			__asm	pop			ebx \
		} \
	} \
	doneCmp: \
	double c = CONSTANT; \
	for ( i = 0; i < pre; i++ ) { \
		DST[i] = SRC0[i] CMP c; \
	} \
	for ( i = COUNT - post; i < COUNT; i++ ) { \
		DST[i] = SRC0[i] CMP c; \
	}
|
|
|
|
/*
	COMPAREBITCONSTANT expands to the complete body of a routine computing

		DST[i] |= ( SRC0[i] CMP CONSTANT ) << BITNUM;		for i in [0, COUNT)

	It has the same structure as COMPARECONSTANT, except that the four 0/1 result bytes
	of each group are shifted left by BITNUM (held in cl) and OR-ed into the bytes
	already stored in DST instead of overwriting them.

	The expansion declares locals (i, cnt, pre, post, aligned, c) and labels (loopNA,
	loopA, doneCmp), so it can be used only once per function. All operands are taken
	from the macro parameters; pass plain variable names, because they are also used as
	inline assembly operands.
*/
#define COMPAREBITCONSTANT( DST, BITNUM, SRC0, CONSTANT, COUNT, CMP, CMPSIMD, DOFLIP ) \
	int i, cnt, pre, post; \
	float *aligned; \
	\
	/* if the float array is not aligned on a 4 byte boundary */ \
	if ( ((int) SRC0) & 3 ) { \
		/* unaligned memory access */ \
		pre = 0; \
		cnt = COUNT >> 2; \
		post = COUNT - (cnt<<2); \
		__asm	mov			edx, cnt \
		__asm	test		edx, edx \
		__asm	je			doneCmp \
		__asm	push		ebx \
		__asm	neg			edx \
		__asm	mov			esi, SRC0 \
		__asm	prefetchnta	[esi+64] \
		__asm	movss		xmm1, CONSTANT \
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
		__asm	mov			edi, DST \
		__asm	mov			cl, BITNUM \
		__asm loopNA: \
		__asm	movups		xmm0, [esi] \
		__asm	prefetchnta	[esi+128] \
		__asm	CMPSIMD		xmm0, xmm1 \
		__asm	movmskps	eax, xmm0 \
		__asm	DOFLIP \
		__asm	mov			ah, al \
		__asm	shr			ah, 1 \
		__asm	mov			bx, ax \
		__asm	shl			ebx, 14 \
		__asm	mov			bx, ax \
		__asm	and			ebx, 0x01010101 \
		__asm	shl			ebx, cl \
		__asm	or			ebx, dword ptr [edi] \
		__asm	mov			dword ptr [edi], ebx \
		__asm	add			esi, 16 \
		__asm	add			edi, 4 \
		__asm	inc			edx \
		__asm	jl			loopNA \
		__asm	pop			ebx \
	} \
	else { \
		/* aligned memory access */ \
		aligned = (float *) ((((int) SRC0) + 15) & ~15); \
		/* conservative test (byte distance against element count): if it holds, */ \
		/* everything is left to the scalar loop */ \
		if ( (int)aligned > ((int)SRC0) + COUNT ) { \
			pre = COUNT; \
			post = 0; \
		} \
		else { \
			pre = aligned - SRC0; \
			cnt = (COUNT - pre) >> 2; \
			post = COUNT - pre - (cnt<<2); \
			__asm	mov			edx, cnt \
			__asm	test		edx, edx \
			__asm	je			doneCmp \
			__asm	push		ebx \
			__asm	neg			edx \
			__asm	mov			esi, aligned \
			__asm	prefetchnta	[esi+64] \
			__asm	movss		xmm1, CONSTANT \
			__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) \
			__asm	mov			edi, DST \
			__asm	add			edi, pre \
			__asm	mov			cl, BITNUM \
			__asm loopA: \
			__asm	movaps		xmm0, [esi] \
			__asm	prefetchnta	[esi+128] \
			__asm	CMPSIMD		xmm0, xmm1 \
			__asm	movmskps	eax, xmm0 \
			__asm	DOFLIP \
			__asm	mov			ah, al \
			__asm	shr			ah, 1 \
			__asm	mov			bx, ax \
			__asm	shl			ebx, 14 \
			__asm	mov			bx, ax \
			__asm	and			ebx, 0x01010101 \
			__asm	shl			ebx, cl \
			__asm	or			ebx, dword ptr [edi] \
			__asm	mov			dword ptr [edi], ebx \
			__asm	add			esi, 16 \
			__asm	add			edi, 4 \
			__asm	inc			edx \
			__asm	jl			loopA \
			__asm	pop			ebx \
		} \
	} \
	doneCmp: \
	float c = CONSTANT; \
	for ( i = 0; i < pre; i++ ) { \
		DST[i] |= ( SRC0[i] CMP c ) << BITNUM; \
	} \
	for ( i = COUNT - post; i < COUNT; i++ ) { \
		DST[i] |= ( SRC0[i] CMP c ) << BITNUM; \
	}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpGT
|
|
|
|
dst[i] = src0[i] > constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
	// dst[i] = src0[i] > constant
	// Packed predicate: cmpnleps ("not less-or-equal"), mask used as is.
	// NOTE(review): cmpnleps also reports true for unordered (NaN) operands, while the
	// scalar '>' used for the leading/trailing elements does not - confirm NaN inputs
	// cannot occur here.
	COMPARECONSTANT( dst, src0, constant, count, >, cmpnleps, NOFLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpGT
|
|
|
|
dst[i] |= ( src0[i] > constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// dst[i] |= ( src0[i] > constant ) << bitNum
	// Packed predicate: cmpnleps ("not less-or-equal"), mask used as is.
	// NOTE(review): cmpnleps also reports true for unordered (NaN) operands, while the
	// scalar '>' used for the leading/trailing elements does not - confirm NaN inputs
	// cannot occur here.
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >, cmpnleps, NOFLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpGE
|
|
|
|
dst[i] = src0[i] >= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
	// dst[i] = src0[i] >= constant
	// Packed predicate: cmpnltps ("not less-than"), mask used as is.
	// NOTE(review): cmpnltps also reports true for unordered (NaN) operands, while the
	// scalar '>=' used for the leading/trailing elements does not - confirm NaN inputs
	// cannot occur here.
	COMPARECONSTANT( dst, src0, constant, count, >=, cmpnltps, NOFLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpGE
|
|
|
|
dst[i] |= ( src0[i] >= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// dst[i] |= ( src0[i] >= constant ) << bitNum
	// Packed predicate: cmpnltps ("not less-than"), mask used as is.
	// NOTE(review): cmpnltps also reports true for unordered (NaN) operands, while the
	// scalar '>=' used for the leading/trailing elements does not - confirm NaN inputs
	// cannot occur here.
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, >=, cmpnltps, NOFLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpLT
|
|
|
|
dst[i] = src0[i] < constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
	// dst[i] = src0[i] < constant
	// Packed predicate: cmpltps, mask used as is.
	COMPARECONSTANT( dst, src0, constant, count, <, cmpltps, NOFLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpLT
|
|
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// dst[i] |= ( src0[i] < constant ) << bitNum
	// Packed predicate: cmpltps, mask used as is.
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <, cmpltps, NOFLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpLE
|
|
|
|
dst[i] = src0[i] <= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
	// dst[i] = src0[i] <= constant
	// Derived from the opposite packed predicate: cmpnleps ("not less-or-equal")
	// followed by FLIP, which inverts the resulting mask.
	COMPARECONSTANT( dst, src0, constant, count, <=, cmpnleps, FLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CmpLE
|
|
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// Contract, as stated in the header comment above:
	//   dst[i] |= ( src0[i] <= constant ) << bitNum;
	// The complete loop is produced by COMPAREBITCONSTANT, which is defined outside this
	// section. Unlike the other comparisons here it passes FLIP together with cmpnleps;
	// presumably the macro inverts the "not less-or-equal" mask to obtain <= -- TODO confirm
	// against the macro definition.
	COMPAREBITCONSTANT( dst, bitNum, src0, constant, count, <=, cmpnleps, FLIP )
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MinMax( float &min, float &max, const float *src, const int count ) {
	// Finds the smallest and largest value in src[0..count-1].
	// The bulk of the array is reduced with SSE; the first 'pre' and last 'post'
	// elements are folded in by the scalar loops at the bottom.
	int i, pre, post;

	// start from an empty range so any element replaces the initial values
	min = idMath::INFINITY;
	max = -idMath::INFINITY;

	__asm
	{
		push		ebx

		// min / max are references: load their addresses, then broadcast the
		// initial values into all four lanes of xmm0 (minima) and xmm1 (maxima)
		mov			eax, min
		mov			ebx, max
		movss		xmm0, [eax]
		movss		xmm1, [ebx]
		shufps		xmm0, xmm0, 0
		shufps		xmm1, xmm1, 0

		// KFLOATINITS is defined outside this section. From its use here it is expected
		// to fill in pre / post, leave edx+ebx addressing the vector part of src (ebx being
		// a negative offset counting up to zero), leave the address to test in eax and to
		// branch to 'done' when there is nothing to vectorize -- TODO confirm against the macro.
		KFLOATINITS( src, count, pre, post )
		and			eax, 15
		jz			lpA
		jmp			lpNA
		align		16

		// unaligned source: 8 floats per iteration
	lpNA:
		movups		xmm2, [edx+ebx]
		movups		xmm3, [edx+ebx+16]
		minps		xmm0, xmm2
		maxps		xmm1, xmm2
		prefetchnta	[edx+ebx+64]
		minps		xmm0, xmm3
		maxps		xmm1, xmm3
		add			ebx, 16*2
		jl			lpNA
		jmp			done2

		// 16-byte aligned source: 8 floats per iteration
	lpA:
		movaps		xmm2, [edx+ebx]
		movaps		xmm3, [edx+ebx+16]
		minps		xmm0, xmm2
		maxps		xmm1, xmm2
		prefetchnta	[edx+ebx+64]
		minps		xmm0, xmm3
		maxps		xmm1, xmm3
		add			ebx, 16*2
		jl			lpA
		jmp			done2
		align		16

		// horizontal reduction: rotate a copy of the accumulator by one lane three
		// times and fold each rotation into lane 0 with a scalar min / max
	done2:
		movaps		xmm2, xmm0
		movaps		xmm3, xmm1
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
		minss		xmm0, xmm2
		maxss		xmm1, xmm3
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
		minss		xmm0, xmm2
		maxss		xmm1, xmm3
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
		minss		xmm0, xmm2
		maxss		xmm1, xmm3

		// write the vector results back through the references
		mov			eax, min
		mov			ebx, max
		movss		[eax], xmm0
		movss		[ebx], xmm1
	done:
		pop			ebx
	}

	// leading elements not covered by the vector loop
	for ( i = 0; i < pre; i++ ) {
		const float value = src[i];
		if ( value > max ) {
			max = value;
		}
		if ( value < min ) {
			min = value;
		}
	}

	// trailing elements not covered by the vector loop
	for ( i = count - post; i < count; i++ ) {
		const float value = src[i];
		if ( value > max ) {
			max = value;
		}
		if ( value < min ) {
			min = value;
		}
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
	// Computes the component-wise bounds of src[0..count-1].
	//   min receives the smallest x and y, max the largest x and y.
	//   For count == 0 the results are ( +INFINITY, +INFINITY ) and ( -INFINITY, -INFINITY ).
	// Two vectors are processed per loop iteration; an odd leading vector is handled first.
	__asm {
		mov			eax, count
		test		eax, eax						// ZF is consumed by the 'jz done' below; the SSE setup does not touch flags
		movss		xmm0, idMath::INFINITY
		xorps		xmm1, xmm1
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm0 = +INF in all lanes (running minima)
		subps		xmm1, xmm0						// xmm1 = 0 - INF = -INF in all lanes (running maxima)
		jz			done
		mov			ecx, eax
		and			ecx, 1
		mov			esi, src
		jz			startLoop						// even count: go straight to the pair loop

		// odd count: fold the first vector in on its own, duplicated into both halves
		movlps		xmm2, [esi]
		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		dec			eax
		add			esi, 2*4
		minps		xmm0, xmm2
		maxps		xmm1, xmm2

	startLoop:
		imul		eax, 2*4						// remaining bytes (8 bytes per vector)
		add			esi, eax						// esi = end of the array
		neg			eax								// eax = negative offset counting up to zero
		// When count == 1 nothing is left after the odd vector. Without this check the
		// loop below would still run once, read 16 bytes past the end of the array and
		// fold those values into the result. 'neg' sets ZF when the result is zero.
		jz			done

	loopVert:
		movlps		xmm2, [esi+eax]					// xmm2 = ( x0, y0, -, - )
		movhps		xmm2, [esi+eax+8]				// xmm2 = ( x0, y0, x1, y1 )
		add			eax, 4*4
		minps		xmm0, xmm2
		maxps		xmm1, xmm2
		jl			loopVert

	done:
		// combine the two halves of each accumulator and store the low ( x, y ) pair
		movaps		xmm2, xmm0
		shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 )
		minps		xmm0, xmm2
		mov			esi, min
		movlps		[esi], xmm0
		movaps		xmm3, xmm1
		shufps		xmm3, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
		maxps		xmm1, xmm3
		mov			edi, max
		movlps		[edi], xmm1
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
	// Computes the component-wise bounds of src[0..count-1] (12 bytes per vector).
	// Two accumulator pairs are used with different lane layouts:
	//   xmm0 / xmm1 (min / max) hold ( z, -, x, y )
	//   xmm2 / xmm3 (min / max) hold ( x, -, y, z )
	// and are merged at the end.
	__asm {

		// minima start at +INF, maxima at 0 - INF = -INF
		movss		xmm0, idMath::INFINITY
		xorps		xmm1, xmm1
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		subps		xmm1, xmm0
		movaps		xmm2, xmm0
		movaps		xmm3, xmm1

		mov			esi, src
		mov			eax, count
		and			eax, ~3							// number of vectors handled four at a time
		jz			doneFour
		imul		eax, 12
		add			esi, eax						// esi = end of the four-at-a-time part
		neg			eax								// eax = negative byte offset counting up to zero

	nextFour:
//		prefetchnta	[esi+4*12]

		movss		xmm4, [esi+eax+0*12+8]			// z
		movhps		xmm4, [esi+eax+0*12+0]			// x, y in the upper half
		minps		xmm0, xmm4
		maxps		xmm1, xmm4

		movss		xmm5, [esi+eax+1*12+0]			// x
		movhps		xmm5, [esi+eax+1*12+4]			// y, z in the upper half
		minps		xmm2, xmm5
		maxps		xmm3, xmm5

		movss		xmm6, [esi+eax+2*12+8]
		movhps		xmm6, [esi+eax+2*12+0]
		minps		xmm0, xmm6
		maxps		xmm1, xmm6

		movss		xmm7, [esi+eax+3*12+0]
		movhps		xmm7, [esi+eax+3*12+4]
		minps		xmm2, xmm7
		maxps		xmm3, xmm7

		add			eax, 4*12
		jl			nextFour

	doneFour:
		mov			eax, count
		and			eax, 3							// 0..3 left-over vectors
		jz			doneOne
		imul		eax, 12
		add			esi, eax
		neg			eax

	nextOne:
		movss		xmm4, [esi+eax+0*12+8]
		movhps		xmm4, [esi+eax+0*12+0]
		minps		xmm0, xmm4
		maxps		xmm1, xmm4

		add			eax, 12
		jl			nextOne

	doneOne:
		// bring ( x, -, y, z ) into the ( z, -, x, y ) layout and merge
		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
		minps		xmm0, xmm2
		maxps		xmm1, xmm3

		// upper half = x, y; lane 0 = z
		mov			esi, min
		movhps		[esi], xmm0
		movss		[esi+8], xmm0
		mov			edi, max
		movhps		[edi], xmm1
		movss		[edi+8], xmm1
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
	// Computes the component-wise bounds of the xyz member of src[0..count-1].
	// Two accumulator pairs are used with different lane layouts:
	//   xmm0 / xmm1 (min / max) hold ( z, -, x, y )
	//   xmm2 / xmm3 (min / max) hold ( x, -, y, z )
	// and are merged at the end.

	// the hard-coded stride and member offset used below must match the real layout
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	__asm {

		// minima start at +INF, maxima at 0 - INF = -INF
		movss		xmm0, idMath::INFINITY
		xorps		xmm1, xmm1
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		subps		xmm1, xmm0
		movaps		xmm2, xmm0
		movaps		xmm3, xmm1

		mov			esi, src
		mov			eax, count
		and			eax, ~3							// number of vertices handled four at a time
		jz			doneFour
		imul		eax, DRAWVERT_SIZE
		add			esi, eax						// esi = end of the four-at-a-time part
		neg			eax								// eax = negative byte offset counting up to zero

	nextFour:
//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		movss		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]	// z
		movhps		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// x, y in the upper half
		minps		xmm0, xmm4
		maxps		xmm1, xmm4

		movss		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]	// x
		movhps		xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]	// y, z in the upper half
		minps		xmm2, xmm5
		maxps		xmm3, xmm5

		movss		xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm6, [esi+eax+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		minps		xmm0, xmm6
		maxps		xmm1, xmm6

		movss		xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm7, [esi+eax+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		minps		xmm2, xmm7
		maxps		xmm3, xmm7

		add			eax, 4*DRAWVERT_SIZE
		jl			nextFour

	doneFour:
		mov			eax, count
		and			eax, 3							// 0..3 left-over vertices
		jz			doneOne
		imul		eax, DRAWVERT_SIZE
		add			esi, eax
		neg			eax

	nextOne:
		movss		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm4, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		minps		xmm0, xmm4
		maxps		xmm1, xmm4

		add			eax, DRAWVERT_SIZE
		jl			nextOne

	doneOne:
		// bring ( x, -, y, z ) into the ( z, -, x, y ) layout and merge
		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
		minps		xmm0, xmm2
		maxps		xmm1, xmm3

		// upper half = x, y; lane 0 = z
		mov			esi, min
		movhps		[esi], xmm0
		movss		[esi+8], xmm0
		mov			edi, max
		movhps		[edi], xmm1
		movss		[edi+8], xmm1
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
	// Computes the component-wise bounds of src[indexes[i]].xyz for i in [0, count).
	// Two accumulator pairs are used with different lane layouts:
	//   xmm0 / xmm1 (min / max) hold ( z, -, x, y )
	//   xmm2 / xmm3 (min / max) hold ( x, -, y, z )
	// and are merged at the end.

	// the hard-coded stride and member offset used below must match the real layout
	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	__asm {

		// minima start at +INF, maxima at 0 - INF = -INF
		movss		xmm0, idMath::INFINITY
		xorps		xmm1, xmm1
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		subps		xmm1, xmm0
		movaps		xmm2, xmm0
		movaps		xmm3, xmm1

		mov			edi, indexes
		mov			esi, src
		mov			eax, count
		and			eax, ~3							// number of indexes handled four at a time
		jz			doneFour
		shl			eax, 2							// 4 bytes per index
		add			edi, eax						// edi = end of the four-at-a-time part of indexes
		neg			eax								// eax = negative byte offset counting up to zero

	nextFour:
//		prefetchnta	[edi+128]
//		prefetchnta	[esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]

		mov			edx, [edi+eax+0]
		imul		edx, DRAWVERT_SIZE				// edx = byte offset of the indexed vertex
		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]	// z
		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]	// x, y in the upper half
		minps		xmm0, xmm4
		maxps		xmm1, xmm4

		mov			edx, [edi+eax+4]
		imul		edx, DRAWVERT_SIZE
		movss		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+0]	// x
		movhps		xmm5, [esi+edx+DRAWVERT_XYZ_OFFSET+4]	// y, z in the upper half
		minps		xmm2, xmm5
		maxps		xmm3, xmm5

		mov			edx, [edi+eax+8]
		imul		edx, DRAWVERT_SIZE
		movss		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm6, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps		xmm0, xmm6
		maxps		xmm1, xmm6

		mov			edx, [edi+eax+12]
		imul		edx, DRAWVERT_SIZE
		movss		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		movhps		xmm7, [esi+edx+DRAWVERT_XYZ_OFFSET+4]
		minps		xmm2, xmm7
		maxps		xmm3, xmm7

		add			eax, 4*4
		jl			nextFour

	doneFour:
		mov			eax, count
		and			eax, 3							// 0..3 left-over indexes
		jz			doneOne
		shl			eax, 2
		add			edi, eax
		neg			eax

	nextOne:
		mov			edx, [edi+eax+0]
		imul		edx, DRAWVERT_SIZE
		movss		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+8]
		movhps		xmm4, [esi+edx+DRAWVERT_XYZ_OFFSET+0]
		minps		xmm0, xmm4
		maxps		xmm1, xmm4

		add			eax, 4
		jl			nextOne

	doneOne:
		// bring ( x, -, y, z ) into the ( z, -, x, y ) layout and merge
		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 1, 0, 2 )
		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 1, 0, 2 )
		minps		xmm0, xmm2
		maxps		xmm1, xmm3

		// upper half = x, y; lane 0 = z
		mov			esi, min
		movhps		[esi], xmm0
		movss		[esi+8], xmm0
		mov			edi, max
		movhps		[edi], xmm1
		movss		[edi+8], xmm1
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Clamp
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
	// Writes src[i] limited to the range [min, max] into dst[i].
	// The bulk of the array is processed with SSE, 8 floats per iteration; the first
	// 'pre' and last 'post' elements are handled by the scalar loops at the bottom.
	int i, pre, post;

	__asm
	{
		// xmm0 = min in all lanes, xmm1 = max in all lanes
		movss		xmm0,min
		movss		xmm1,max
		shufps		xmm0,xmm0,0
		shufps		xmm1,xmm1,0

		// KFLOATINITDS is defined outside this section. From its use here it is expected
		// to fill in pre / post, leave edx+ebx (source) and edi+ebx (destination) addressing
		// the vector part with ebx a negative offset counting up to zero, and leave the
		// address to test in eax -- TODO confirm against the macro.
		KFLOATINITDS( dst, src, count, pre, post )
		and			eax,15
		jne			lpNA
		jmp			lpA
		align		16

		// aligned loads
	lpA:
		movaps		xmm2,[edx+ebx]
		movaps		xmm3,[edx+ebx+16]
		maxps		xmm2,xmm0
		maxps		xmm3,xmm0
		prefetchnta	[edx+ebx+64]
		minps		xmm2,xmm1
		minps		xmm3,xmm1
		movaps		[edi+ebx],xmm2
		movaps		[edi+ebx+16],xmm3
		add			ebx,16*2
		jl			lpA
		jmp			done

		align		16

		// unaligned loads, stores are still aligned
	lpNA:
		movups		xmm2,[edx+ebx]
		movups		xmm3,[edx+ebx+16]
		maxps		xmm2,xmm0
		maxps		xmm3,xmm0
		prefetchnta	[edx+ebx+64]
		minps		xmm2,xmm1
		minps		xmm3,xmm1
		movaps		[edi+ebx],xmm2
		movaps		[edi+ebx+16],xmm3
		add			ebx,16*2
		jl			lpNA
	done:
	}

	// leading elements not covered by the vector loop
	for ( i = 0; i < pre; i++ ) {
		dst[i] = ( src[i] < min ) ? min : ( ( src[i] > max ) ? max : src[i] );
	}

	// trailing elements not covered by the vector loop
	for ( i = count - post; i < count; i++ ) {
		dst[i] = ( src[i] < min ) ? min : ( ( src[i] > max ) ? max : src[i] );
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::ClampMin
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::ClampMin( float *dst, const float *src, const float min, const int count ) {
	// Writes src[i] into dst[i], raising values below 'min' to 'min'.
	// The bulk of the array is processed with SSE, 8 floats per iteration; the first
	// 'pre' and last 'post' elements are handled by the scalar loops at the bottom.
	int i, pre, post;

	__asm
	{
		// xmm0 = min in all lanes
		movss		xmm0,min
		shufps		xmm0,xmm0,0

		// KFLOATINITDS is defined outside this section. From its use here it is expected
		// to fill in pre / post, leave edx+ebx (source) and edi+ebx (destination) addressing
		// the vector part with ebx a negative offset counting up to zero, and leave the
		// address to test in eax -- TODO confirm against the macro.
		KFLOATINITDS( dst, src, count, pre, post )
		and			eax,15
		jne			lpNA
		jmp			lpA
		align		16

		// aligned loads
	lpA:
		movaps		xmm2,[edx+ebx]
		movaps		xmm3,[edx+ebx+16]
		maxps		xmm2,xmm0
		prefetchnta	[edx+ebx+64]
		maxps		xmm3,xmm0
		movaps		[edi+ebx],xmm2
		movaps		[edi+ebx+16],xmm3
		add			ebx,16*2
		jl			lpA
		jmp			done

		align		16

		// unaligned loads, stores are still aligned
	lpNA:
		movups		xmm2,[edx+ebx]
		movups		xmm3,[edx+ebx+16]
		maxps		xmm2,xmm0
		prefetchnta	[edx+ebx+64]
		maxps		xmm3,xmm0
		movaps		[edi+ebx],xmm2
		movaps		[edi+ebx+16],xmm3
		add			ebx,16*2
		jl			lpNA
	done:
	}

	// leading elements not covered by the vector loop
	for ( i = 0; i < pre; i++ ) {
		dst[i] = ( src[i] < min ) ? min : src[i];
	}

	// trailing elements not covered by the vector loop
	for ( i = count - post; i < count; i++ ) {
		dst[i] = ( src[i] < min ) ? min : src[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::ClampMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::ClampMax( float *dst, const float *src, const float max, const int count ) {
	// Writes src[i] into dst[i], lowering values above 'max' to 'max'.
	// The bulk of the array is processed with SSE, 8 floats per iteration; the first
	// 'pre' and last 'post' elements are handled by the scalar loops at the bottom.
	int i, pre, post;

	__asm
	{
		// xmm1 = max in all lanes
		movss		xmm1,max
		shufps		xmm1,xmm1,0

		// KFLOATINITDS is defined outside this section. From its use here it is expected
		// to fill in pre / post, leave edx+ebx (source) and edi+ebx (destination) addressing
		// the vector part with ebx a negative offset counting up to zero, and leave the
		// address to test in eax -- TODO confirm against the macro.
		KFLOATINITDS( dst, src, count, pre, post )
		and			eax,15
		jne			lpNA
		jmp			lpA
		align		16

		// aligned loads
	lpA:
		movaps		xmm2,[edx+ebx]
		movaps		xmm3,[edx+ebx+16]
		minps		xmm2,xmm1
		prefetchnta	[edx+ebx+64]
		minps		xmm3,xmm1
		movaps		[edi+ebx],xmm2
		movaps		[edi+ebx+16],xmm3
		add			ebx,16*2
		jl			lpA
		jmp			done

		align		16

		// unaligned loads, stores are still aligned
	lpNA:
		movups		xmm2,[edx+ebx]
		movups		xmm3,[edx+ebx+16]
		minps		xmm2,xmm1
		prefetchnta	[edx+ebx+64]
		minps		xmm3,xmm1
		movaps		[edi+ebx],xmm2
		movaps		[edi+ebx+16],xmm3
		add			ebx,16*2
		jl			lpNA
	done:
	}

	// leading elements not covered by the vector loop
	for ( i = 0; i < pre; i++ ) {
		dst[i] = ( src[i] > max ) ? max : src[i];
	}

	// trailing elements not covered by the vector loop
	for ( i = count - post; i < count; i++ ) {
		dst[i] = ( src[i] > max ) ? max : src[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Zero16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Zero16( float *dst, const int count ) {
	// Sets dst to zero in groups of four floats. count is rounded up to a multiple of
	// four, so up to three floats past 'count' are written as well; the aligned stores
	// (movaps) require dst to be 16-byte aligned.
	__asm {
		mov			edx, dst
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			zero16Exit
		shl			eax, 4							// eax = bytes to process
		add			edx, eax						// edx = end of the buffer
		neg			eax								// eax = negative offset counting up to zero
		xorps		xmm0, xmm0
	zero16Next:
		movaps		[edx+eax], xmm0
		add			eax, 16
		jl			zero16Next
	zero16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Negate16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Negate16( float *dst, const int count ) {
	// Flips the sign of dst in place, in groups of four floats. count is rounded up to a
	// multiple of four, so up to three floats past 'count' are touched as well; the
	// aligned accesses (movaps) require dst to be 16-byte aligned.
	__asm {
		mov			edx, dst
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			negate16Exit
		shl			eax, 4							// eax = bytes to process
		add			edx, eax						// edx = end of the buffer
		neg			eax								// eax = negative offset counting up to zero
		// broadcast the mask that is xor-ed into every element
		movss		xmm0, SIMD_SP_signBitMask
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
	negate16Next:
		movaps		xmm1, [edx+eax]
		xorps		xmm1, xmm0
		movaps		[edx+eax], xmm1
		add			eax, 16
		jl			negate16Next
	negate16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Copy16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Copy16( float *dst, const float *src, const int count ) {
	// Copies src to dst in groups of four floats. count is rounded up to a multiple of
	// four, so up to three floats past 'count' are copied as well; the aligned accesses
	// (movaps) require both pointers to be 16-byte aligned.
	__asm {
		mov			ecx, src
		mov			edx, dst
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			copy16Exit
		shl			eax, 4							// eax = bytes to process
		add			ecx, eax						// ecx / edx = end of source / destination
		add			edx, eax
		neg			eax								// eax = negative offset counting up to zero
	copy16Next:
		movaps		xmm0, [ecx+eax]
		movaps		[edx+eax], xmm0
		add			eax, 16
		jl			copy16Next
	copy16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Add16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Add16( float *dst, const float *src1, const float *src2, const int count ) {
	// dst = src1 + src2, in groups of four floats. count is rounded up to a multiple of
	// four, so up to three floats past 'count' are processed as well; the aligned
	// accesses (movaps, addps with a memory operand) require all three pointers to be
	// 16-byte aligned.
	__asm {
		mov			ecx, src1
		mov			edx, src2
		mov			esi, dst
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			add16Exit
		shl			eax, 4							// eax = bytes to process
		add			esi, eax						// all three pointers now address the end
		add			ecx, eax
		add			edx, eax
		neg			eax								// eax = negative offset counting up to zero
	add16Next:
		movaps		xmm0, [ecx+eax]
		addps		xmm0, [edx+eax]
		movaps		[esi+eax], xmm0
		add			eax, 16
		jl			add16Next
	add16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Sub16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
	// dst = src1 - src2, in groups of four floats. count is rounded up to a multiple of
	// four, so up to three floats past 'count' are processed as well; the aligned
	// accesses (movaps, subps with a memory operand) require all three pointers to be
	// 16-byte aligned.
	__asm {
		mov			ecx, src1
		mov			edx, src2
		mov			esi, dst
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			sub16Exit
		shl			eax, 4							// eax = bytes to process
		add			esi, eax						// all three pointers now address the end
		add			ecx, eax
		add			edx, eax
		neg			eax								// eax = negative offset counting up to zero
	sub16Next:
		movaps		xmm0, [ecx+eax]
		subps		xmm0, [edx+eax]
		movaps		[esi+eax], xmm0
		add			eax, 16
		jl			sub16Next
	sub16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::Mul16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::Mul16( float *dst, const float *src1, const float constant, const int count ) {
	// dst = src1 * constant, in groups of four floats. count is rounded up to a multiple
	// of four, so up to three floats past 'count' are processed as well; the aligned
	// accesses (movaps) require both pointers to be 16-byte aligned.
	__asm {
		mov			ecx, dst
		mov			edx, src1
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			mul16Exit
		movss		xmm1, constant
		shl			eax, 4							// eax = bytes to process
		add			ecx, eax						// ecx / edx = end of destination / source
		add			edx, eax
		neg			eax								// eax = negative offset counting up to zero
		shufps		xmm1, xmm1, 0x00				// xmm1 = constant in all lanes
	mul16Next:
		movaps		xmm0, [edx+eax]
		mulps		xmm0, xmm1
		movaps		[ecx+eax], xmm0
		add			eax, 16
		jl			mul16Next
	mul16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::AddAssign16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::AddAssign16( float *dst, const float *src, const int count ) {
	// dst += src, in groups of four floats. count is rounded up to a multiple of four,
	// so up to three floats past 'count' are processed as well; the aligned accesses
	// (movaps, addps with a memory operand) require both pointers to be 16-byte aligned.
	__asm {
		mov			ecx, dst
		mov			edx, src
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			addAssign16Exit
		shl			eax, 4							// eax = bytes to process
		add			ecx, eax						// ecx / edx = end of destination / source
		add			edx, eax
		neg			eax								// eax = negative offset counting up to zero
	addAssign16Next:
		movaps		xmm0, [ecx+eax]
		addps		xmm0, [edx+eax]
		movaps		[ecx+eax], xmm0
		add			eax, 16
		jl			addAssign16Next
	addAssign16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::SubAssign16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::SubAssign16( float *dst, const float *src, const int count ) {
	// dst -= src, in groups of four floats. count is rounded up to a multiple of four,
	// so up to three floats past 'count' are processed as well; the aligned accesses
	// (movaps, subps with a memory operand) require both pointers to be 16-byte aligned.
	__asm {
		mov			ecx, dst
		mov			edx, src
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			subAssign16Exit
		shl			eax, 4							// eax = bytes to process
		add			ecx, eax						// ecx / edx = end of destination / source
		add			edx, eax
		neg			eax								// eax = negative offset counting up to zero
	subAssign16Next:
		movaps		xmm0, [ecx+eax]
		subps		xmm0, [edx+eax]
		movaps		[ecx+eax], xmm0
		add			eax, 16
		jl			subAssign16Next
	subAssign16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MulAssign16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MulAssign16( float *dst, const float constant, const int count ) {
	// dst *= constant, in groups of four floats. count is rounded up to a multiple of
	// four, so up to three floats past 'count' are processed as well; the aligned
	// accesses (movaps) require dst to be 16-byte aligned.
	__asm {
		mov			ecx, dst
		mov			eax, count
		add			eax, 3
		shr			eax, 2							// eax = number of 4-float groups, rounded up
		jz			mulAssign16Exit
		movss		xmm1, constant
		shl			eax, 4							// eax = bytes to process
		add			ecx, eax						// ecx = end of the buffer
		neg			eax								// eax = negative offset counting up to zero
		shufps		xmm1, xmm1, 0x00				// xmm1 = constant in all lanes
	mulAssign16Next:
		movaps		xmm0, [ecx+eax]
		mulps		xmm0, xmm1
		movaps		[ecx+eax], xmm0
		add			eax, 16
		jl			mulAssign16Next
	mulAssign16Exit:
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MatX_MultiplyVecX
|
|
|
|
optimizes the following matrix multiplications:
|
|
|
|
NxN * Nx1
|
|
Nx6 * 6x1
|
|
6xN * Nx1
|
|
|
|
with N in the range [1-6]
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
#define STORE1( offset, reg1, reg2 ) \
|
|
__asm movss [eax+offset], reg1
|
|
#define STORE2LO( offset, reg1, reg2 ) \
|
|
__asm movlps [eax+offset], reg1
|
|
#define STORE2HI( offset, reg1, reg2 ) \
|
|
__asm movhps [eax+offset], reg1
|
|
#define STORE4( offset, reg1, reg2 ) \
|
|
__asm movlps [eax+offset], reg1 \
|
|
__asm movhps [eax+offset+8], reg1
|
|
#define STOREC =
|
|
|
|
int numRows;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumColumns() );
|
|
assert( dst.GetSize() >= mat.GetNumRows() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numRows = mat.GetNumRows();
|
|
switch( mat.GetNumColumns() ) {
|
|
case 1: {
|
|
switch( numRows ) {
|
|
case 1: { // 1x1 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
mulss xmm0, [edi]
|
|
STORE1( 0, xmm0, xmm1 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x1 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm1, xmm0
|
|
mulps xmm0, [edi]
|
|
mulps xmm1, [edi+16]
|
|
STORE4( 0, xmm0, xmm2 )
|
|
STORE2LO( 16, xmm1, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 2: {
|
|
switch( numRows ) {
|
|
case 2: { // 2x2 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
movss xmm1, [esi+4]
|
|
movss xmm2, [edi]
|
|
mulss xmm2, xmm0
|
|
movss xmm3, [edi+4]
|
|
mulss xmm3, xmm1
|
|
addss xmm2, xmm3
|
|
STORE1( 0, xmm2, xmm4 )
|
|
mulss xmm0, [edi+8]
|
|
mulss xmm1, [edi+8+4]
|
|
addss xmm0, xmm1
|
|
STORE1( 4, xmm0, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x2 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm7, [esi]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movaps xmm0, [edi]
|
|
mulps xmm0, xmm7
|
|
movaps xmm1, [edi+16]
|
|
mulps xmm1, xmm7
|
|
movaps xmm2, xmm0
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
movaps xmm3, [edi+32]
|
|
addps xmm0, xmm2
|
|
mulps xmm3, xmm7
|
|
STORE4( 0, xmm0, xmm4 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm1, xmm3
|
|
addps xmm3, xmm1
|
|
STORE2LO( 16, xmm3, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
|
|
mPtr += 2;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 3: {
|
|
switch( numRows ) {
|
|
case 3: { // 3x3 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
movss xmm4, [edi]
|
|
mulss xmm4, xmm0
|
|
movss xmm1, [esi+4]
|
|
movss xmm5, [edi+4]
|
|
mulss xmm5, xmm1
|
|
addss xmm4, xmm5
|
|
movss xmm2, [esi+8]
|
|
movss xmm6, [edi+8]
|
|
mulss xmm6, xmm2
|
|
addss xmm4, xmm6
|
|
movss xmm3, [edi+12]
|
|
mulss xmm3, xmm0
|
|
STORE1( 0, xmm4, xmm7 );
|
|
movss xmm5, [edi+12+4]
|
|
mulss xmm5, xmm1
|
|
addss xmm3, xmm5
|
|
movss xmm6, [edi+12+8]
|
|
mulss xmm6, xmm2
|
|
addss xmm3, xmm6
|
|
mulss xmm0, [edi+24]
|
|
mulss xmm1, [edi+24+4]
|
|
STORE1( 4, xmm3, xmm7 );
|
|
addss xmm0, xmm1
|
|
mulss xmm2, [edi+24+8]
|
|
addss xmm0, xmm2
|
|
STORE1( 8, xmm0, xmm7 );
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x3 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm5, [esi]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm6, [esi+4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm7, [esi+8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
|
|
movlps xmm1, [edi+4*4]
|
|
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
|
|
movlps xmm2, [edi+6*4]
|
|
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
|
|
mulps xmm0, xmm5
|
|
movlps xmm3, [edi+10*4]
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
|
|
movaps xmm3, xmm1
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
|
|
mulps xmm1, xmm6
|
|
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
|
|
mulps xmm3, xmm7
|
|
addps xmm0, xmm1
|
|
addps xmm0, xmm3
|
|
STORE4( 0, xmm0, xmm4 )
|
|
movss xmm1, [edi+12*4]
|
|
mulss xmm1, xmm5
|
|
movss xmm2, [edi+13*4]
|
|
mulss xmm2, xmm6
|
|
movss xmm3, [edi+14*4]
|
|
mulss xmm3, xmm7
|
|
addss xmm1, xmm2
|
|
addss xmm1, xmm3
|
|
STORE1( 16, xmm1, xmm4 )
|
|
mulss xmm5, [edi+15*4]
|
|
mulss xmm6, [edi+16*4]
|
|
mulss xmm7, [edi+17*4]
|
|
addss xmm5, xmm6
|
|
addss xmm5, xmm7
|
|
STORE1( 20, xmm5, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
|
|
mPtr += 3;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 4: {
|
|
switch( numRows ) {
|
|
case 4: { // 4x4 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, qword ptr [esi ]
|
|
movlps xmm0, qword ptr [edi ]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm0, qword ptr [edi+16]
|
|
mulps xmm0, xmm6
|
|
movlps xmm7, qword ptr [esi+ 8]
|
|
movlps xmm2, qword ptr [edi+ 8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm2, qword ptr [edi+24]
|
|
mulps xmm2, xmm7
|
|
movlps xmm1, qword ptr [edi+32]
|
|
movhps xmm1, qword ptr [edi+48]
|
|
mulps xmm1, xmm6
|
|
movlps xmm3, qword ptr [edi+40]
|
|
addps xmm0, xmm2
|
|
movhps xmm3, qword ptr [edi+56]
|
|
mulps xmm3, xmm7
|
|
movaps xmm4, xmm0
|
|
addps xmm1, xmm3
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm4
|
|
STORE4( 0, xmm0, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x4 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, qword ptr [esi+ 0]
|
|
movlps xmm0, qword ptr [edi+ 0]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm0, qword ptr [edi+16]
|
|
mulps xmm0, xmm6
|
|
movlps xmm7, qword ptr [esi+ 8]
|
|
movlps xmm2, qword ptr [edi+ 8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm2, qword ptr [edi+24]
|
|
mulps xmm2, xmm7
|
|
movlps xmm1, qword ptr [edi+32]
|
|
movhps xmm1, qword ptr [edi+48]
|
|
mulps xmm1, xmm6
|
|
movlps xmm3, qword ptr [edi+40]
|
|
addps xmm0, xmm2
|
|
movhps xmm3, qword ptr [edi+56]
|
|
mulps xmm3, xmm7
|
|
movaps xmm4, xmm0
|
|
addps xmm1, xmm3
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm4
|
|
movlps xmm1, qword ptr [edi+64]
|
|
movhps xmm1, qword ptr [edi+80]
|
|
STORE4( 0, xmm0, xmm4 )
|
|
mulps xmm1, xmm6
|
|
movlps xmm2, qword ptr [edi+72]
|
|
movhps xmm2, qword ptr [edi+88]
|
|
mulps xmm2, xmm7
|
|
addps xmm1, xmm2
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm3, xmm1
|
|
addps xmm1, xmm3
|
|
STORE2LO( 16, xmm1, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
|
|
mPtr += 4;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 5: {
|
|
switch( numRows ) {
|
|
case 5: { // 5x5 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
|
|
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
|
|
movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X
|
|
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
|
|
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
|
|
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
|
|
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
|
|
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
|
|
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
|
|
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
|
|
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
|
|
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
|
|
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
|
|
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
|
|
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
|
|
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
|
|
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
|
|
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
|
|
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
|
|
movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15
|
|
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
|
|
movss xmm7, [esi+0*4]
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm0, xmm7
|
|
movss xmm5, [esi+1*4]
|
|
shufps xmm5, xmm5, 0
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movss xmm6, [esi+2*4]
|
|
shufps xmm6, xmm6, 0
|
|
mulps xmm2, xmm6
|
|
addps xmm0, xmm2
|
|
movss xmm1, [esi+3*4]
|
|
shufps xmm1, xmm1, 0
|
|
mulps xmm3, xmm1
|
|
addps xmm0, xmm3
|
|
movss xmm2, [esi+4*4]
|
|
shufps xmm2, xmm2, 0
|
|
mulps xmm4, xmm2
|
|
addps xmm0, xmm4
|
|
mulss xmm7, [edi+20*4]
|
|
mulss xmm5, [edi+21*4]
|
|
addps xmm7, xmm5
|
|
mulss xmm6, [edi+22*4]
|
|
addps xmm7, xmm6
|
|
mulss xmm1, [edi+23*4]
|
|
addps xmm7, xmm1
|
|
mulss xmm2, [edi+24*4]
|
|
addps xmm7, xmm2
|
|
STORE4( 0, xmm0, xmm3 )
|
|
STORE1( 16, xmm7, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x5 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, [esi]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movlps xmm7, [esi+8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movlps xmm0, [edi]
|
|
movhps xmm3, [edi+8]
|
|
movaps xmm1, [edi+16]
|
|
movlps xmm2, [edi+32]
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
|
|
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
|
|
mulps xmm0, xmm6
|
|
mulps xmm3, xmm7
|
|
movlps xmm2, [edi+40]
|
|
addps xmm0, xmm3 // xmm0 + xmm1
|
|
movhps xmm5, [edi+40+8]
|
|
movlps xmm3, [edi+40+16]
|
|
movhps xmm3, [edi+40+24]
|
|
movlps xmm4, [edi+40+32]
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
|
|
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
|
|
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
|
|
mulps xmm2, xmm6
|
|
mulps xmm5, xmm7
|
|
addps xmm2, xmm5 // xmm2 + xmm3
|
|
movss xmm5, [esi+16]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm4, xmm0
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
|
|
addps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
STORE4( 0, xmm0, xmm2 )
|
|
movlps xmm4, [edi+80]
|
|
movhps xmm3, [edi+80+8]
|
|
movaps xmm1, [edi+80+16]
|
|
movlps xmm2, [edi+80+32]
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
|
|
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
|
|
mulps xmm4, xmm6
|
|
mulps xmm3, xmm7
|
|
mulps xmm1, xmm5
|
|
addps xmm4, xmm3 // xmm4 + xmm1
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
|
|
addps xmm4, xmm1
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
addps xmm4, xmm1
|
|
STORE2LO( 16, xmm4, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
|
|
mPtr += 5;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 6: {
|
|
switch( numRows ) {
|
|
case 1: { // 1x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
mulss xmm0, [edi]
|
|
movss xmm1, [esi+4]
|
|
mulss xmm1, [edi+4]
|
|
movss xmm2, [esi+8]
|
|
addss xmm0, xmm1
|
|
mulss xmm2, [edi+8]
|
|
movss xmm3, [esi+12]
|
|
addss xmm0, xmm2
|
|
mulss xmm3, [edi+12]
|
|
movss xmm4, [esi+16]
|
|
addss xmm0, xmm3
|
|
mulss xmm4, [edi+16]
|
|
movss xmm5, [esi+20]
|
|
addss xmm0, xmm4
|
|
mulss xmm5, [edi+20]
|
|
movss xmm6, [esi+24]
|
|
addss xmm0, xmm5
|
|
mulss xmm6, [edi+24]
|
|
addss xmm0, xmm6
|
|
STORE1( 0, xmm0, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 2: { // 2x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm0, xmm1
|
|
addps xmm0, xmm1
|
|
STORE2LO( 0, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 3: { // 3x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm0, xmm1
|
|
addps xmm0, xmm1
|
|
STORE2LO( 0, xmm0, xmm3 )
|
|
// row 2
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movhlps xmm1, xmm0
|
|
addps xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
addss xmm0, xmm1
|
|
STORE1( 8, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 4: { // 4x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm7, xmm0
|
|
movlhps xmm7, xmm2
|
|
addps xmm7, xmm1
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm7, xmm0
|
|
// row 2 and 3
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
movaps xmm2, [edi+48+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
// last 4 additions for the first 4 rows and store result
|
|
movaps xmm0, xmm7
|
|
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm7
|
|
STORE4( 0, xmm0, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 5: { // 5x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm7, xmm0
|
|
movlhps xmm7, xmm2
|
|
addps xmm7, xmm1
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm7, xmm0
|
|
// row 2 and 3
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
movaps xmm2, [edi+48+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
// last 4 additions for the first 4 rows and store result
|
|
movaps xmm0, xmm7
|
|
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm7
|
|
STORE4( 0, xmm0, xmm3 )
|
|
// row 5
|
|
movaps xmm0, [edi+96]
|
|
movaps xmm1, [edi+96+16]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movhlps xmm1, xmm0
|
|
addps xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, 0x01
|
|
addss xmm0, xmm1
|
|
STORE1( 16, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm7, qword ptr [esi]
|
|
movlps xmm6, qword ptr [esi+8]
|
|
shufps xmm7, xmm7, 0x44
|
|
shufps xmm6, xmm6, 0x44
|
|
movlps xmm0, qword ptr [edi ]
|
|
movhps xmm0, qword ptr [edi+ 24]
|
|
mulps xmm0, xmm7
|
|
movlps xmm3, qword ptr [edi+ 8]
|
|
movhps xmm3, qword ptr [edi+ 32]
|
|
mulps xmm3, xmm6
|
|
movlps xmm1, qword ptr [edi+ 48]
|
|
movhps xmm1, qword ptr [edi+ 72]
|
|
mulps xmm1, xmm7
|
|
movlps xmm2, qword ptr [edi+ 96]
|
|
movhps xmm2, qword ptr [edi+120]
|
|
mulps xmm2, xmm7
|
|
movlps xmm4, qword ptr [edi+ 56]
|
|
movhps xmm4, qword ptr [edi+ 80]
|
|
movlps xmm5, qword ptr [edi+104]
|
|
movhps xmm5, qword ptr [edi+128]
|
|
mulps xmm4, xmm6
|
|
movlps xmm7, qword ptr [esi+16]
|
|
addps xmm0, xmm3
|
|
shufps xmm7, xmm7, 0x44
|
|
mulps xmm5, xmm6
|
|
addps xmm1, xmm4
|
|
movlps xmm3, qword ptr [edi+ 16]
|
|
movhps xmm3, qword ptr [edi+ 40]
|
|
addps xmm2, xmm5
|
|
movlps xmm4, qword ptr [edi+ 64]
|
|
movhps xmm4, qword ptr [edi+ 88]
|
|
mulps xmm3, xmm7
|
|
movlps xmm5, qword ptr [edi+112]
|
|
movhps xmm5, qword ptr [edi+136]
|
|
addps xmm0, xmm3
|
|
mulps xmm4, xmm7
|
|
mulps xmm5, xmm7
|
|
addps xmm1, xmm4
|
|
addps xmm2, xmm5
|
|
movaps xmm6, xmm0
|
|
shufps xmm0, xmm1, 0x88
|
|
shufps xmm6, xmm1, 0xDD
|
|
movaps xmm7, xmm2
|
|
shufps xmm7, xmm2, 0x88
|
|
shufps xmm2, xmm2, 0xDD
|
|
addps xmm0, xmm6
|
|
addps xmm2, xmm7
|
|
STORE4( 0, xmm0, xmm3 )
|
|
STORE2LO( 16, xmm2, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
|
|
mPtr += 6;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
int numColumns = mat.GetNumColumns();
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( int j = 1; j < numColumns; j++ ) {
|
|
sum += mPtr[j] * vPtr[j];
|
|
}
|
|
dstPtr[i] STOREC sum;
|
|
mPtr += numColumns;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
#undef STOREC
|
|
#undef STORE4
|
|
#undef STORE2HI
|
|
#undef STORE2LO
|
|
#undef STORE1
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MatX_MultiplyAddVecX
|
|
|
|
optimizes the following matrix multiplications:
|
|
|
|
NxN * Nx1
|
|
Nx6 * 6x1
|
|
6xN * Nx1
|
|
|
|
with N in the range [1-6]
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
#define STORE1( offset, reg1, reg2 ) \
|
|
__asm movss reg2, [eax+offset] \
|
|
__asm addss reg2, reg1 \
|
|
__asm movss [eax+offset], reg2
|
|
#define STORE2LO( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm addps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2
|
|
#define STORE2HI( offset, reg1, reg2 ) \
|
|
__asm movhps reg2, [eax+offset] \
|
|
__asm addps reg2, reg1 \
|
|
__asm movhps [eax+offset], reg2
|
|
#define STORE4( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm movhps reg2, [eax+offset+8] \
|
|
__asm addps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2 \
|
|
__asm movhps [eax+offset+8], reg2
|
|
#define STOREC +=
|
|
|
|
int numRows;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumColumns() );
|
|
assert( dst.GetSize() >= mat.GetNumRows() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numRows = mat.GetNumRows();
|
|
switch( mat.GetNumColumns() ) {
|
|
case 1: {
|
|
switch( numRows ) {
|
|
case 1: { // 1x1 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
mulss xmm0, [edi]
|
|
STORE1( 0, xmm0, xmm1 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x1 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm1, xmm0
|
|
mulps xmm0, [edi]
|
|
mulps xmm1, [edi+16]
|
|
STORE4( 0, xmm0, xmm2 )
|
|
STORE2LO( 16, xmm1, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 2: {
|
|
switch( numRows ) {
|
|
case 2: { // 2x2 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
movss xmm1, [esi+4]
|
|
movss xmm2, [edi]
|
|
mulss xmm2, xmm0
|
|
movss xmm3, [edi+4]
|
|
mulss xmm3, xmm1
|
|
addss xmm2, xmm3
|
|
STORE1( 0, xmm2, xmm4 )
|
|
mulss xmm0, [edi+8]
|
|
mulss xmm1, [edi+8+4]
|
|
addss xmm0, xmm1
|
|
STORE1( 4, xmm0, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x2 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm7, [esi]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movaps xmm0, [edi]
|
|
mulps xmm0, xmm7
|
|
movaps xmm1, [edi+16]
|
|
mulps xmm1, xmm7
|
|
movaps xmm2, xmm0
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
movaps xmm3, [edi+32]
|
|
addps xmm0, xmm2
|
|
mulps xmm3, xmm7
|
|
STORE4( 0, xmm0, xmm4 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm1, xmm3
|
|
addps xmm3, xmm1
|
|
STORE2LO( 16, xmm3, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
|
|
mPtr += 2;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 3: {
|
|
switch( numRows ) {
|
|
case 3: { // 3x3 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
movss xmm4, [edi]
|
|
mulss xmm4, xmm0
|
|
movss xmm1, [esi+4]
|
|
movss xmm5, [edi+4]
|
|
mulss xmm5, xmm1
|
|
addss xmm4, xmm5
|
|
movss xmm2, [esi+8]
|
|
movss xmm6, [edi+8]
|
|
mulss xmm6, xmm2
|
|
addss xmm4, xmm6
|
|
movss xmm3, [edi+12]
|
|
mulss xmm3, xmm0
|
|
STORE1( 0, xmm4, xmm7 );
|
|
movss xmm5, [edi+12+4]
|
|
mulss xmm5, xmm1
|
|
addss xmm3, xmm5
|
|
movss xmm6, [edi+12+8]
|
|
mulss xmm6, xmm2
|
|
addss xmm3, xmm6
|
|
mulss xmm0, [edi+24]
|
|
mulss xmm1, [edi+24+4]
|
|
STORE1( 4, xmm3, xmm7 );
|
|
addss xmm0, xmm1
|
|
mulss xmm2, [edi+24+8]
|
|
addss xmm0, xmm2
|
|
STORE1( 8, xmm0, xmm7 );
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x3 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm5, [esi]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm6, [esi+4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm7, [esi+8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
|
|
movlps xmm1, [edi+4*4]
|
|
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
|
|
movlps xmm2, [edi+6*4]
|
|
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
|
|
mulps xmm0, xmm5
|
|
movlps xmm3, [edi+10*4]
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
|
|
movaps xmm3, xmm1
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
|
|
mulps xmm1, xmm6
|
|
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
|
|
mulps xmm3, xmm7
|
|
addps xmm0, xmm1
|
|
addps xmm0, xmm3
|
|
STORE4( 0, xmm0, xmm4 )
|
|
movss xmm1, [edi+12*4]
|
|
mulss xmm1, xmm5
|
|
movss xmm2, [edi+13*4]
|
|
mulss xmm2, xmm6
|
|
movss xmm3, [edi+14*4]
|
|
mulss xmm3, xmm7
|
|
addss xmm1, xmm2
|
|
addss xmm1, xmm3
|
|
STORE1( 16, xmm1, xmm4 )
|
|
mulss xmm5, [edi+15*4]
|
|
mulss xmm6, [edi+16*4]
|
|
mulss xmm7, [edi+17*4]
|
|
addss xmm5, xmm6
|
|
addss xmm5, xmm7
|
|
STORE1( 20, xmm5, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
|
|
mPtr += 3;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 4: {
|
|
switch( numRows ) {
|
|
case 4: { // 4x4 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, qword ptr [esi ]
|
|
movlps xmm0, qword ptr [edi ]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm0, qword ptr [edi+16]
|
|
mulps xmm0, xmm6
|
|
movlps xmm7, qword ptr [esi+ 8]
|
|
movlps xmm2, qword ptr [edi+ 8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm2, qword ptr [edi+24]
|
|
mulps xmm2, xmm7
|
|
movlps xmm1, qword ptr [edi+32]
|
|
movhps xmm1, qword ptr [edi+48]
|
|
mulps xmm1, xmm6
|
|
movlps xmm3, qword ptr [edi+40]
|
|
addps xmm0, xmm2
|
|
movhps xmm3, qword ptr [edi+56]
|
|
mulps xmm3, xmm7
|
|
movaps xmm4, xmm0
|
|
addps xmm1, xmm3
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm4
|
|
STORE4( 0, xmm0, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x4 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, qword ptr [esi+ 0]
|
|
movlps xmm0, qword ptr [edi+ 0]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm0, qword ptr [edi+16]
|
|
mulps xmm0, xmm6
|
|
movlps xmm7, qword ptr [esi+ 8]
|
|
movlps xmm2, qword ptr [edi+ 8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm2, qword ptr [edi+24]
|
|
mulps xmm2, xmm7
|
|
movlps xmm1, qword ptr [edi+32]
|
|
movhps xmm1, qword ptr [edi+48]
|
|
mulps xmm1, xmm6
|
|
movlps xmm3, qword ptr [edi+40]
|
|
addps xmm0, xmm2
|
|
movhps xmm3, qword ptr [edi+56]
|
|
mulps xmm3, xmm7
|
|
movaps xmm4, xmm0
|
|
addps xmm1, xmm3
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm4
|
|
movlps xmm1, qword ptr [edi+64]
|
|
movhps xmm1, qword ptr [edi+80]
|
|
STORE4( 0, xmm0, xmm4 )
|
|
mulps xmm1, xmm6
|
|
movlps xmm2, qword ptr [edi+72]
|
|
movhps xmm2, qword ptr [edi+88]
|
|
mulps xmm2, xmm7
|
|
addps xmm1, xmm2
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm3, xmm1
|
|
addps xmm1, xmm3
|
|
STORE2LO( 16, xmm1, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
|
|
mPtr += 4;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 5: {
|
|
switch( numRows ) {
|
|
case 5: { // 5x5 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
|
|
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
|
|
movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X
|
|
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
|
|
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
|
|
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
|
|
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
|
|
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
|
|
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
|
|
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
|
|
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
|
|
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
|
|
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
|
|
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
|
|
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
|
|
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
|
|
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
|
|
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
|
|
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
|
|
movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15
|
|
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
|
|
movss xmm7, [esi+0*4]
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm0, xmm7
|
|
movss xmm5, [esi+1*4]
|
|
shufps xmm5, xmm5, 0
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movss xmm6, [esi+2*4]
|
|
shufps xmm6, xmm6, 0
|
|
mulps xmm2, xmm6
|
|
addps xmm0, xmm2
|
|
movss xmm1, [esi+3*4]
|
|
shufps xmm1, xmm1, 0
|
|
mulps xmm3, xmm1
|
|
addps xmm0, xmm3
|
|
movss xmm2, [esi+4*4]
|
|
shufps xmm2, xmm2, 0
|
|
mulps xmm4, xmm2
|
|
addps xmm0, xmm4
|
|
mulss xmm7, [edi+20*4]
|
|
mulss xmm5, [edi+21*4]
|
|
addps xmm7, xmm5
|
|
mulss xmm6, [edi+22*4]
|
|
addps xmm7, xmm6
|
|
mulss xmm1, [edi+23*4]
|
|
addps xmm7, xmm1
|
|
mulss xmm2, [edi+24*4]
|
|
addps xmm7, xmm2
|
|
STORE4( 0, xmm0, xmm3 )
|
|
STORE1( 16, xmm7, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x5 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, [esi]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movlps xmm7, [esi+8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movlps xmm0, [edi]
|
|
movhps xmm3, [edi+8]
|
|
movaps xmm1, [edi+16]
|
|
movlps xmm2, [edi+32]
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
|
|
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
|
|
mulps xmm0, xmm6
|
|
mulps xmm3, xmm7
|
|
movlps xmm2, [edi+40]
|
|
addps xmm0, xmm3 // xmm0 + xmm1
|
|
movhps xmm5, [edi+40+8]
|
|
movlps xmm3, [edi+40+16]
|
|
movhps xmm3, [edi+40+24]
|
|
movlps xmm4, [edi+40+32]
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
|
|
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
|
|
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
|
|
mulps xmm2, xmm6
|
|
mulps xmm5, xmm7
|
|
addps xmm2, xmm5 // xmm2 + xmm3
|
|
movss xmm5, [esi+16]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm4, xmm0
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
|
|
addps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
STORE4( 0, xmm0, xmm2 )
|
|
movlps xmm4, [edi+80]
|
|
movhps xmm3, [edi+80+8]
|
|
movaps xmm1, [edi+80+16]
|
|
movlps xmm2, [edi+80+32]
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
|
|
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
|
|
mulps xmm4, xmm6
|
|
mulps xmm3, xmm7
|
|
mulps xmm1, xmm5
|
|
addps xmm4, xmm3 // xmm4 + xmm1
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
|
|
addps xmm4, xmm1
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
addps xmm4, xmm1
|
|
STORE2LO( 16, xmm4, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
|
|
mPtr += 5;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 6: {
|
|
switch( numRows ) {
|
|
case 1: { // 1x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
mulss xmm0, [edi]
|
|
movss xmm1, [esi+4]
|
|
mulss xmm1, [edi+4]
|
|
movss xmm2, [esi+8]
|
|
addss xmm0, xmm1
|
|
mulss xmm2, [edi+8]
|
|
movss xmm3, [esi+12]
|
|
addss xmm0, xmm2
|
|
mulss xmm3, [edi+12]
|
|
movss xmm4, [esi+16]
|
|
addss xmm0, xmm3
|
|
mulss xmm4, [edi+16]
|
|
movss xmm5, [esi+20]
|
|
addss xmm0, xmm4
|
|
mulss xmm5, [edi+20]
|
|
movss xmm6, [esi+24]
|
|
addss xmm0, xmm5
|
|
mulss xmm6, [edi+24]
|
|
addss xmm0, xmm6
|
|
STORE1( 0, xmm0, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 2: { // 2x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm0, xmm1
|
|
addps xmm0, xmm1
|
|
STORE2LO( 0, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 3: { // 3x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm0, xmm1
|
|
addps xmm0, xmm1
|
|
STORE2LO( 0, xmm0, xmm3 )
|
|
// row 2
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movhlps xmm1, xmm0
|
|
addps xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
addss xmm0, xmm1
|
|
STORE1( 8, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 4: { // 4x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm7, xmm0
|
|
movlhps xmm7, xmm2
|
|
addps xmm7, xmm1
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm7, xmm0
|
|
// row 2 and 3
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
movaps xmm2, [edi+48+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
// last 4 additions for the first 4 rows and store result
|
|
movaps xmm0, xmm7
|
|
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm7
|
|
STORE4( 0, xmm0, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 5: { // 5x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm7, xmm0
|
|
movlhps xmm7, xmm2
|
|
addps xmm7, xmm1
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm7, xmm0
|
|
// row 2 and 3
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
movaps xmm2, [edi+48+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
// last 4 additions for the first 4 rows and store result
|
|
movaps xmm0, xmm7
|
|
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm7
|
|
STORE4( 0, xmm0, xmm3 )
|
|
// row 5
|
|
movaps xmm0, [edi+96]
|
|
movaps xmm1, [edi+96+16]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movhlps xmm1, xmm0
|
|
addps xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, 0x01
|
|
addss xmm0, xmm1
|
|
STORE1( 16, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm7, qword ptr [esi]
|
|
movlps xmm6, qword ptr [esi+8]
|
|
shufps xmm7, xmm7, 0x44
|
|
shufps xmm6, xmm6, 0x44
|
|
movlps xmm0, qword ptr [edi ]
|
|
movhps xmm0, qword ptr [edi+ 24]
|
|
mulps xmm0, xmm7
|
|
movlps xmm3, qword ptr [edi+ 8]
|
|
movhps xmm3, qword ptr [edi+ 32]
|
|
mulps xmm3, xmm6
|
|
movlps xmm1, qword ptr [edi+ 48]
|
|
movhps xmm1, qword ptr [edi+ 72]
|
|
mulps xmm1, xmm7
|
|
movlps xmm2, qword ptr [edi+ 96]
|
|
movhps xmm2, qword ptr [edi+120]
|
|
mulps xmm2, xmm7
|
|
movlps xmm4, qword ptr [edi+ 56]
|
|
movhps xmm4, qword ptr [edi+ 80]
|
|
movlps xmm5, qword ptr [edi+104]
|
|
movhps xmm5, qword ptr [edi+128]
|
|
mulps xmm4, xmm6
|
|
movlps xmm7, qword ptr [esi+16]
|
|
addps xmm0, xmm3
|
|
shufps xmm7, xmm7, 0x44
|
|
mulps xmm5, xmm6
|
|
addps xmm1, xmm4
|
|
movlps xmm3, qword ptr [edi+ 16]
|
|
movhps xmm3, qword ptr [edi+ 40]
|
|
addps xmm2, xmm5
|
|
movlps xmm4, qword ptr [edi+ 64]
|
|
movhps xmm4, qword ptr [edi+ 88]
|
|
mulps xmm3, xmm7
|
|
movlps xmm5, qword ptr [edi+112]
|
|
movhps xmm5, qword ptr [edi+136]
|
|
addps xmm0, xmm3
|
|
mulps xmm4, xmm7
|
|
mulps xmm5, xmm7
|
|
addps xmm1, xmm4
|
|
addps xmm2, xmm5
|
|
movaps xmm6, xmm0
|
|
shufps xmm0, xmm1, 0x88
|
|
shufps xmm6, xmm1, 0xDD
|
|
movaps xmm7, xmm2
|
|
shufps xmm7, xmm2, 0x88
|
|
shufps xmm2, xmm2, 0xDD
|
|
addps xmm0, xmm6
|
|
addps xmm2, xmm7
|
|
STORE4( 0, xmm0, xmm3 )
|
|
STORE2LO( 16, xmm2, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
|
|
mPtr += 6;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
int numColumns = mat.GetNumColumns();
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( int j = 1; j < numColumns; j++ ) {
|
|
sum += mPtr[j] * vPtr[j];
|
|
}
|
|
dstPtr[i] STOREC sum;
|
|
mPtr += numColumns;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
#undef STOREC
|
|
#undef STORE4
|
|
#undef STORE2HI
|
|
#undef STORE2LO
|
|
#undef STORE1
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MatX_MultiplySubVecX
|
|
|
|
optimizes the following matrix multiplications:
|
|
|
|
NxN * Nx1
|
|
Nx6 * 6x1
|
|
6xN * Nx1
|
|
|
|
with N in the range [1-6]
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
#define STORE1( offset, reg1, reg2 ) \
|
|
__asm movss reg2, [eax+offset] \
|
|
__asm subss reg2, reg1 \
|
|
__asm movss [eax+offset], reg2
|
|
#define STORE2LO( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm subps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2
|
|
#define STORE2HI( offset, reg1, reg2 ) \
|
|
__asm movhps reg2, [eax+offset] \
|
|
__asm subps reg2, reg1 \
|
|
__asm movhps [eax+offset], reg2
|
|
#define STORE4( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm movhps reg2, [eax+offset+8] \
|
|
__asm subps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2 \
|
|
__asm movhps [eax+offset+8], reg2
|
|
#define STOREC -=
|
|
|
|
int numRows;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumColumns() );
|
|
assert( dst.GetSize() >= mat.GetNumRows() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numRows = mat.GetNumRows();
|
|
switch( mat.GetNumColumns() ) {
|
|
case 1: {
|
|
switch( numRows ) {
|
|
case 1: { // 1x1 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
mulss xmm0, [edi]
|
|
STORE1( 0, xmm0, xmm1 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x1 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm1, xmm0
|
|
mulps xmm0, [edi]
|
|
mulps xmm1, [edi+16]
|
|
STORE4( 0, xmm0, xmm2 )
|
|
STORE2LO( 16, xmm1, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 2: {
|
|
switch( numRows ) {
|
|
case 2: { // 2x2 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
movss xmm1, [esi+4]
|
|
movss xmm2, [edi]
|
|
mulss xmm2, xmm0
|
|
movss xmm3, [edi+4]
|
|
mulss xmm3, xmm1
|
|
addss xmm2, xmm3
|
|
STORE1( 0, xmm2, xmm4 )
|
|
mulss xmm0, [edi+8]
|
|
mulss xmm1, [edi+8+4]
|
|
addss xmm0, xmm1
|
|
STORE1( 4, xmm0, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x2 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm7, [esi]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movaps xmm0, [edi]
|
|
mulps xmm0, xmm7
|
|
movaps xmm1, [edi+16]
|
|
mulps xmm1, xmm7
|
|
movaps xmm2, xmm0
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm2, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
movaps xmm3, [edi+32]
|
|
addps xmm0, xmm2
|
|
mulps xmm3, xmm7
|
|
STORE4( 0, xmm0, xmm4 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm1, xmm3
|
|
addps xmm3, xmm1
|
|
STORE2LO( 16, xmm3, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
|
|
mPtr += 2;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 3: {
|
|
switch( numRows ) {
|
|
case 3: { // 3x3 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
movss xmm4, [edi]
|
|
mulss xmm4, xmm0
|
|
movss xmm1, [esi+4]
|
|
movss xmm5, [edi+4]
|
|
mulss xmm5, xmm1
|
|
addss xmm4, xmm5
|
|
movss xmm2, [esi+8]
|
|
movss xmm6, [edi+8]
|
|
mulss xmm6, xmm2
|
|
addss xmm4, xmm6
|
|
movss xmm3, [edi+12]
|
|
mulss xmm3, xmm0
|
|
STORE1( 0, xmm4, xmm7 );
|
|
movss xmm5, [edi+12+4]
|
|
mulss xmm5, xmm1
|
|
addss xmm3, xmm5
|
|
movss xmm6, [edi+12+8]
|
|
mulss xmm6, xmm2
|
|
addss xmm3, xmm6
|
|
mulss xmm0, [edi+24]
|
|
mulss xmm1, [edi+24+4]
|
|
STORE1( 4, xmm3, xmm7 );
|
|
addss xmm0, xmm1
|
|
mulss xmm2, [edi+24+8]
|
|
addss xmm0, xmm2
|
|
STORE1( 8, xmm0, xmm7 );
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x3 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm5, [esi]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm6, [esi+4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movss xmm7, [esi+8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm0, [edi] // xmm0 = 0, 1, 2, 3
|
|
movlps xmm1, [edi+4*4]
|
|
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm1 = 4, 5, 1, 2
|
|
movlps xmm2, [edi+6*4]
|
|
movhps xmm2, [edi+8*4] // xmm2 = 6, 7, 8, 9
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 3, 0, 3 ) // xmm0 = 0, 3, 6, 9
|
|
mulps xmm0, xmm5
|
|
movlps xmm3, [edi+10*4]
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 1, 2, 0, 1 ) // xmm2 = 7, 8, 10, 11
|
|
movaps xmm3, xmm1
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 2, 0, 0, 2 ) // xmm1 = 1, 4, 7, 10
|
|
mulps xmm1, xmm6
|
|
shufps xmm3, xmm2, R_SHUFFLEPS( 3, 1, 1, 3 ) // xmm3 = 2, 5, 8, 11
|
|
mulps xmm3, xmm7
|
|
addps xmm0, xmm1
|
|
addps xmm0, xmm3
|
|
STORE4( 0, xmm0, xmm4 )
|
|
movss xmm1, [edi+12*4]
|
|
mulss xmm1, xmm5
|
|
movss xmm2, [edi+13*4]
|
|
mulss xmm2, xmm6
|
|
movss xmm3, [edi+14*4]
|
|
mulss xmm3, xmm7
|
|
addss xmm1, xmm2
|
|
addss xmm1, xmm3
|
|
STORE1( 16, xmm1, xmm4 )
|
|
mulss xmm5, [edi+15*4]
|
|
mulss xmm6, [edi+16*4]
|
|
mulss xmm7, [edi+17*4]
|
|
addss xmm5, xmm6
|
|
addss xmm5, xmm7
|
|
STORE1( 20, xmm5, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
|
|
mPtr += 3;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 4: {
|
|
switch( numRows ) {
|
|
case 4: { // 4x4 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, qword ptr [esi ]
|
|
movlps xmm0, qword ptr [edi ]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm0, qword ptr [edi+16]
|
|
mulps xmm0, xmm6
|
|
movlps xmm7, qword ptr [esi+ 8]
|
|
movlps xmm2, qword ptr [edi+ 8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm2, qword ptr [edi+24]
|
|
mulps xmm2, xmm7
|
|
movlps xmm1, qword ptr [edi+32]
|
|
movhps xmm1, qword ptr [edi+48]
|
|
mulps xmm1, xmm6
|
|
movlps xmm3, qword ptr [edi+40]
|
|
addps xmm0, xmm2
|
|
movhps xmm3, qword ptr [edi+56]
|
|
mulps xmm3, xmm7
|
|
movaps xmm4, xmm0
|
|
addps xmm1, xmm3
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm4
|
|
STORE4( 0, xmm0, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x4 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, qword ptr [esi+ 0]
|
|
movlps xmm0, qword ptr [edi+ 0]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm0, qword ptr [edi+16]
|
|
mulps xmm0, xmm6
|
|
movlps xmm7, qword ptr [esi+ 8]
|
|
movlps xmm2, qword ptr [edi+ 8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movhps xmm2, qword ptr [edi+24]
|
|
mulps xmm2, xmm7
|
|
movlps xmm1, qword ptr [edi+32]
|
|
movhps xmm1, qword ptr [edi+48]
|
|
mulps xmm1, xmm6
|
|
movlps xmm3, qword ptr [edi+40]
|
|
addps xmm0, xmm2
|
|
movhps xmm3, qword ptr [edi+56]
|
|
mulps xmm3, xmm7
|
|
movaps xmm4, xmm0
|
|
addps xmm1, xmm3
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm4
|
|
movlps xmm1, qword ptr [edi+64]
|
|
movhps xmm1, qword ptr [edi+80]
|
|
STORE4( 0, xmm0, xmm4 )
|
|
mulps xmm1, xmm6
|
|
movlps xmm2, qword ptr [edi+72]
|
|
movhps xmm2, qword ptr [edi+88]
|
|
mulps xmm2, xmm7
|
|
addps xmm1, xmm2
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm3, xmm1
|
|
addps xmm1, xmm3
|
|
STORE2LO( 16, xmm1, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3];
|
|
mPtr += 4;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 5: {
|
|
switch( numRows ) {
|
|
case 5: { // 5x5 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [edi+5*4] // xmm0 = 5, X, X, X
|
|
movhps xmm0, [edi+0*4] // xmm0 = 5, X, 0, 1
|
|
movss xmm5, [edi+15*4] // xmm4 = 15, X, X, X
|
|
movhps xmm5, [edi+10*4] // xmm5 = 15, X, 10, 11
|
|
movaps xmm1, xmm0 // xmm1 = 5, X, 0, 1
|
|
shufps xmm0, xmm5, R_SHUFFLEPS( 2, 0, 2, 0 ) // xmm0 = 0, 5, 10, 15
|
|
movlps xmm1, [edi+6*4] // xmm1 = 6, 7, 0, 1
|
|
movlps xmm5, [edi+16*4] // xmm5 = 16, 17, 10, 11
|
|
movaps xmm2, xmm1 // xmm2 = 6, 7, 0, 1
|
|
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm1 = 1, 6, 11, 16
|
|
movhps xmm2, [edi+2*4] // xmm2 = 6, 7, 2, 3
|
|
movhps xmm5, [edi+12*4] // xmm5 = 16, 17, 12, 13
|
|
movaps xmm3, xmm2 // xmm3 = 6, 7, 2, 3
|
|
shufps xmm2, xmm5, R_SHUFFLEPS( 2, 1, 2, 1 ) // xmm2 = 2, 7, 12, 17
|
|
movlps xmm3, [edi+8*4] // xmm3 = 8, 9, 2, 3
|
|
movlps xmm5, [edi+18*4] // xmm5 = 18, 19, 12, 13
|
|
movss xmm4, [edi+4*4] // xmm4 = 4, X, X, X
|
|
movlhps xmm4, xmm3 // xmm4 = 4, X, 8, 9
|
|
shufps xmm3, xmm5, R_SHUFFLEPS( 3, 0, 3, 0 ) // xmm3 = 3, 8, 13, 18
|
|
movhps xmm5, [edi+14*4] // xmm6 = 18, 19, 14, 15
|
|
shufps xmm4, xmm5, R_SHUFFLEPS( 0, 3, 2, 1 ) // xmm4 = 4, 9, 14, 19
|
|
movss xmm7, [esi+0*4]
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm0, xmm7
|
|
movss xmm5, [esi+1*4]
|
|
shufps xmm5, xmm5, 0
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movss xmm6, [esi+2*4]
|
|
shufps xmm6, xmm6, 0
|
|
mulps xmm2, xmm6
|
|
addps xmm0, xmm2
|
|
movss xmm1, [esi+3*4]
|
|
shufps xmm1, xmm1, 0
|
|
mulps xmm3, xmm1
|
|
addps xmm0, xmm3
|
|
movss xmm2, [esi+4*4]
|
|
shufps xmm2, xmm2, 0
|
|
mulps xmm4, xmm2
|
|
addps xmm0, xmm4
|
|
mulss xmm7, [edi+20*4]
|
|
mulss xmm5, [edi+21*4]
|
|
addps xmm7, xmm5
|
|
mulss xmm6, [edi+22*4]
|
|
addps xmm7, xmm6
|
|
mulss xmm1, [edi+23*4]
|
|
addps xmm7, xmm1
|
|
mulss xmm2, [edi+24*4]
|
|
addps xmm7, xmm2
|
|
STORE4( 0, xmm0, xmm3 )
|
|
STORE1( 16, xmm7, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x5 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm6, [esi]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movlps xmm7, [esi+8]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
movlps xmm0, [edi]
|
|
movhps xmm3, [edi+8]
|
|
movaps xmm1, [edi+16]
|
|
movlps xmm2, [edi+32]
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm0 = 0, 1, 5, 6
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 4, 7, 8, 9
|
|
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 2, 3, 7, 8
|
|
mulps xmm0, xmm6
|
|
mulps xmm3, xmm7
|
|
movlps xmm2, [edi+40]
|
|
addps xmm0, xmm3 // xmm0 + xmm1
|
|
movhps xmm5, [edi+40+8]
|
|
movlps xmm3, [edi+40+16]
|
|
movhps xmm3, [edi+40+24]
|
|
movlps xmm4, [edi+40+32]
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm2 = 10, 11, 15, 16
|
|
shufps xmm3, xmm4, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm3 = 14, 17, 18, 19
|
|
shufps xmm5, xmm3, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm5 = 12, 13, 17, 18
|
|
mulps xmm2, xmm6
|
|
mulps xmm5, xmm7
|
|
addps xmm2, xmm5 // xmm2 + xmm3
|
|
movss xmm5, [esi+16]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm4, xmm0
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm4, xmm2, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
shufps xmm1, xmm3, R_SHUFFLEPS( 0, 3, 0, 3 )
|
|
addps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
STORE4( 0, xmm0, xmm2 )
|
|
movlps xmm4, [edi+80]
|
|
movhps xmm3, [edi+80+8]
|
|
movaps xmm1, [edi+80+16]
|
|
movlps xmm2, [edi+80+32]
|
|
shufps xmm4, xmm1, R_SHUFFLEPS( 0, 1, 1, 2 ) // xmm4 = 20, 21, 25, 26
|
|
shufps xmm1, xmm2, R_SHUFFLEPS( 0, 3, 0, 1 ) // xmm1 = 24, 27, 28, 29
|
|
shufps xmm3, xmm1, R_SHUFFLEPS( 2, 3, 1, 2 ) // xmm3 = 22, 23, 27, 28
|
|
mulps xmm4, xmm6
|
|
mulps xmm3, xmm7
|
|
mulps xmm1, xmm5
|
|
addps xmm4, xmm3 // xmm4 + xmm1
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 3, 0, 2 )
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 1, 3, 0, 0 )
|
|
addps xmm4, xmm1
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
addps xmm4, xmm1
|
|
STORE2LO( 16, xmm4, xmm2 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] + mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
|
|
mPtr += 5;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case 6: {
|
|
switch( numRows ) {
|
|
case 1: { // 1x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movss xmm0, [esi]
|
|
mulss xmm0, [edi]
|
|
movss xmm1, [esi+4]
|
|
mulss xmm1, [edi+4]
|
|
movss xmm2, [esi+8]
|
|
addss xmm0, xmm1
|
|
mulss xmm2, [edi+8]
|
|
movss xmm3, [esi+12]
|
|
addss xmm0, xmm2
|
|
mulss xmm3, [edi+12]
|
|
movss xmm4, [esi+16]
|
|
addss xmm0, xmm3
|
|
mulss xmm4, [edi+16]
|
|
movss xmm5, [esi+20]
|
|
addss xmm0, xmm4
|
|
mulss xmm5, [edi+20]
|
|
movss xmm6, [esi+24]
|
|
addss xmm0, xmm5
|
|
mulss xmm6, [edi+24]
|
|
addss xmm0, xmm6
|
|
STORE1( 0, xmm0, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 2: { // 2x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm0, xmm1
|
|
addps xmm0, xmm1
|
|
STORE2LO( 0, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 3: { // 3x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 1, 3 )
|
|
movhlps xmm0, xmm1
|
|
addps xmm0, xmm1
|
|
STORE2LO( 0, xmm0, xmm3 )
|
|
// row 2
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movhlps xmm1, xmm0
|
|
addps xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
addss xmm0, xmm1
|
|
STORE1( 8, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 4: { // 4x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm7, xmm0
|
|
movlhps xmm7, xmm2
|
|
addps xmm7, xmm1
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm7, xmm0
|
|
// row 2 and 3
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
movaps xmm2, [edi+48+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
// last 4 additions for the first 4 rows and store result
|
|
movaps xmm0, xmm7
|
|
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm7
|
|
STORE4( 0, xmm0, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
case 5: { // 5x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// load idVecX
|
|
movlps xmm4, [esi]
|
|
movhps xmm4, [esi+8]
|
|
movlps xmm5, [esi+16]
|
|
movlhps xmm5, xmm4
|
|
movhlps xmm6, xmm4
|
|
movlhps xmm6, xmm5
|
|
// row 0 and 1
|
|
movaps xmm0, [edi]
|
|
movaps xmm1, [edi+16]
|
|
movaps xmm2, [edi+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm7, xmm0
|
|
movlhps xmm7, xmm2
|
|
addps xmm7, xmm1
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm7, xmm0
|
|
// row 2 and 3
|
|
movaps xmm0, [edi+48]
|
|
movaps xmm1, [edi+48+16]
|
|
movaps xmm2, [edi+48+32]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
mulps xmm2, xmm6
|
|
movhlps xmm3, xmm0
|
|
movlhps xmm3, xmm2
|
|
addps xmm1, xmm3
|
|
shufps xmm0, xmm2, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
addps xmm1, xmm0
|
|
// last 4 additions for the first 4 rows and store result
|
|
movaps xmm0, xmm7
|
|
shufps xmm7, xmm1, R_SHUFFLEPS( 0, 2, 0, 2 )
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 1, 3, 1, 3 )
|
|
addps xmm0, xmm7
|
|
STORE4( 0, xmm0, xmm3 )
|
|
// row 5
|
|
movaps xmm0, [edi+96]
|
|
movaps xmm1, [edi+96+16]
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
movhlps xmm1, xmm0
|
|
addps xmm0, xmm1
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, 0x01
|
|
addss xmm0, xmm1
|
|
STORE1( 16, xmm0, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
movlps xmm7, qword ptr [esi]
|
|
movlps xmm6, qword ptr [esi+8]
|
|
shufps xmm7, xmm7, 0x44
|
|
shufps xmm6, xmm6, 0x44
|
|
movlps xmm0, qword ptr [edi ]
|
|
movhps xmm0, qword ptr [edi+ 24]
|
|
mulps xmm0, xmm7
|
|
movlps xmm3, qword ptr [edi+ 8]
|
|
movhps xmm3, qword ptr [edi+ 32]
|
|
mulps xmm3, xmm6
|
|
movlps xmm1, qword ptr [edi+ 48]
|
|
movhps xmm1, qword ptr [edi+ 72]
|
|
mulps xmm1, xmm7
|
|
movlps xmm2, qword ptr [edi+ 96]
|
|
movhps xmm2, qword ptr [edi+120]
|
|
mulps xmm2, xmm7
|
|
movlps xmm4, qword ptr [edi+ 56]
|
|
movhps xmm4, qword ptr [edi+ 80]
|
|
movlps xmm5, qword ptr [edi+104]
|
|
movhps xmm5, qword ptr [edi+128]
|
|
mulps xmm4, xmm6
|
|
movlps xmm7, qword ptr [esi+16]
|
|
addps xmm0, xmm3
|
|
shufps xmm7, xmm7, 0x44
|
|
mulps xmm5, xmm6
|
|
addps xmm1, xmm4
|
|
movlps xmm3, qword ptr [edi+ 16]
|
|
movhps xmm3, qword ptr [edi+ 40]
|
|
addps xmm2, xmm5
|
|
movlps xmm4, qword ptr [edi+ 64]
|
|
movhps xmm4, qword ptr [edi+ 88]
|
|
mulps xmm3, xmm7
|
|
movlps xmm5, qword ptr [edi+112]
|
|
movhps xmm5, qword ptr [edi+136]
|
|
addps xmm0, xmm3
|
|
mulps xmm4, xmm7
|
|
mulps xmm5, xmm7
|
|
addps xmm1, xmm4
|
|
addps xmm2, xmm5
|
|
movaps xmm6, xmm0
|
|
shufps xmm0, xmm1, 0x88
|
|
shufps xmm6, xmm1, 0xDD
|
|
movaps xmm7, xmm2
|
|
shufps xmm7, xmm2, 0x88
|
|
shufps xmm2, xmm2, 0xDD
|
|
addps xmm0, xmm6
|
|
addps xmm2, xmm7
|
|
STORE4( 0, xmm0, xmm3 )
|
|
STORE2LO( 16, xmm2, xmm4 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] STOREC mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
|
|
mPtr += 6;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
int numColumns = mat.GetNumColumns();
|
|
for ( int i = 0; i < numRows; i++ ) {
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( int j = 1; j < numColumns; j++ ) {
|
|
sum += mPtr[j] * vPtr[j];
|
|
}
|
|
dstPtr[i] STOREC sum;
|
|
mPtr += numColumns;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
#undef STOREC
|
|
#undef STORE4
|
|
#undef STORE2HI
|
|
#undef STORE2LO
|
|
#undef STORE1
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MatX_TransposeMultiplyVecX
|
|
|
|
optimizes the following matrix multiplications:
|
|
|
|
Nx6 * Nx1
|
|
6xN * 6x1
|
|
|
|
with N in the range [1-6]
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	// Computes dst = transpose( mat ) * vec, i.e. for every column c of mat:
	//
	//     dst[c] = sum over all rows r of mat[r][c] * vec[r]
	//
	// mat is read as a flat float array in row-major order with a row stride of
	// mat.GetNumColumns() floats. In the comments below m[i] is the i-th float of
	// that array and v[i] is the i-th float of vec.
	//
	// Hand written SSE paths exist for Nx6 and 6xN matrices with N in [1-6];
	// every other shape is handled by plain C++ loops.
	//
	// Inside the __asm blocks: esi = vec data, edi = mat data, eax = dst data.
	// R_SHUFFLEPS( x, y, z, w ) selects lanes x and y from the first shufps operand
	// for result lanes 0 and 1, and lanes z and w from the second operand for
	// result lanes 2 and 3.
	//
	// NOTE(review): the SSE paths use aligned accesses (movaps, and mulps with a
	// memory operand) on the matrix data, and a few of them load a full 16 bytes
	// that reach past the last logical element (e.g. m[4..7] of a 1x6 matrix).
	// This presumably relies on idMatX keeping its storage 16-byte aligned and
	// padded - confirm before feeding raw buffers through this code.

	// The STORE* macros write a result register to dst at the given byte offset.
	// reg2 is unused by this plain-store flavour of the macros.
#define STORE1( offset, reg1, reg2 )		\
	__asm movss		[eax+offset], reg1
#define STORE2LO( offset, reg1, reg2 )		\
	__asm movlps	[eax+offset], reg1
#define STORE2HI( offset, reg1, reg2 )		\
	__asm movhps	[eax+offset], reg1
#define STORE4( offset, reg1, reg2 )		\
	__asm movlps	[eax+offset], reg1		\
	__asm movhps	[eax+offset+8], reg1
#define STOREC		=

	int numColumns;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numColumns = mat.GetNumColumns();
	switch( mat.GetNumRows() ) {
		case 1:
			switch( numColumns ) {
				case 6: {		// 1x6 * 1x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movss		xmm0, [esi]
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm0 = v0, v0, v0, v0
						movaps		xmm1, xmm0
						mulps		xmm0, [edi]								// xmm0 = v0 * m[0..3]
						mulps		xmm1, [edi+16]							// xmm1 = v0 * m[4..7], only lanes 0-1 are stored
						STORE4( 0, xmm0, xmm2 )
						STORE2LO( 16, xmm1, xmm3 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
						dstPtr[i] STOREC *(mPtr) * vPtr[0];
						mPtr++;
					}
					return;
				}
			}
			break;
		case 2:
			switch( numColumns ) {
				case 6: {		// 2x6 * 2x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm0, [esi]								// lanes 0-1 = v0, v1
						movaps		xmm1, xmm0
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm0 = v0, v0, v0, v0
						shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm1 = v1, v1, v1, v1
						// columns 0-3
						movaps		xmm2, [edi]								// xmm2 = m[0..3]
						mulps		xmm2, xmm0
						movlps		xmm3, [edi+24]
						movhps		xmm3, [edi+32]							// xmm3 = m[6..9]
						mulps		xmm3, xmm1
						addps		xmm2, xmm3
						// columns 4-5: both rows are processed in one register
						shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm0 = v0, v0, v1, v1
						movlps		xmm4, [edi+16]
						movhps		xmm4, [edi+40]							// xmm4 = m[4], m[5], m[10], m[11]
						mulps		xmm4, xmm0
						movhlps		xmm3, xmm4
						addps		xmm3, xmm4								// lanes 0-1 = row 0 + row 1
						STORE4( 0, xmm2, xmm5 )
						STORE2LO( 16, xmm3, xmm6 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
						mPtr++;
					}
					return;
				}
			}
			break;
		case 3:
			switch( numColumns ) {
				case 6: {		// 3x6 * 3x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm0, [esi+0*4]							// lanes 0-1 = v0, v1
						movss		xmm1, [esi+2*4]							// lane 0 = v2
						// columns 0-3
						movlps		xmm3, [edi+(0*6+0)*4]
						movhps		xmm3, [edi+(0*6+2)*4]					// xmm3 = m[0..3]
						movaps		xmm4, xmm0
						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm4 = v0 x 4
						mulps		xmm3, xmm4
						movlps		xmm5, [edi+(1*6+0)*4]
						movhps		xmm5, [edi+(1*6+2)*4]					// xmm5 = m[6..9]
						movaps		xmm6, xmm0
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v1 x 4
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						movlps		xmm4, [edi+(2*6+0)*4]
						movhps		xmm4, [edi+(2*6+2)*4]					// xmm4 = m[12..15]
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm1 = v2 x 4
						mulps		xmm4, xmm1
						addps		xmm3, xmm4
						STORE4( 0, xmm3, xmm7 )
						// columns 4-5
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm0 = v0, v0, v1, v1
						movlps		xmm3, [edi+(0*6+4)*4]
						movhps		xmm3, [edi+(1*6+4)*4]					// xmm3 = m[4], m[5], m[10], m[11]
						mulps		xmm3, xmm0
						movhlps		xmm4, xmm3
						addps		xmm3, xmm4								// lanes 0-1 = row 0 + row 1
						movlps		xmm5, [edi+(2*6+4)*4]					// lanes 0-1 = m[16], m[17]
						mulps		xmm5, xmm1
						addps		xmm3, xmm5
						STORE2LO( 16, xmm3, xmm7 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
						mPtr++;
					}
					return;
				}
			}
			break;
		case 4:
			switch( numColumns ) {
				case 6: {		// 4x6 * 4x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm0, [esi+0*4]							// lanes 0-1 = v0, v1
						movlps		xmm1, [esi+2*4]							// lanes 0-1 = v2, v3
						// columns 0-3
						movaps		xmm3, xmm0
						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm3 = v0 x 4
						mulps		xmm3, [edi+(0*6+0)*4]					// xmm3 = v0 * m[0..3]
						movlps		xmm5, [edi+(1*6+0)*4]
						movhps		xmm5, [edi+(1*6+2)*4]					// xmm5 = m[6..9]
						movaps		xmm6, xmm0
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v1 x 4
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						movlps		xmm4, [edi+(2*6+0)*4]
						movhps		xmm4, [edi+(2*6+2)*4]					// xmm4 = m[12..15]
						movaps		xmm6, xmm1
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm6 = v2 x 4
						mulps		xmm4, xmm6
						addps		xmm3, xmm4
						movlps		xmm5, [edi+(3*6+0)*4]
						movhps		xmm5, [edi+(3*6+2)*4]					// xmm5 = m[18..21]
						movaps		xmm6, xmm1
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v3 x 4
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						STORE4( 0, xmm3, xmm7 )
						// columns 4-5: two rows per register
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm0 = v0, v0, v1, v1
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm1 = v2, v2, v3, v3
						movlps		xmm3, [edi+(0*6+4)*4]
						movhps		xmm3, [edi+(1*6+4)*4]					// xmm3 = m[4], m[5], m[10], m[11]
						mulps		xmm3, xmm0
						movlps		xmm4, [edi+(2*6+4)*4]
						movhps		xmm4, [edi+(3*6+4)*4]					// xmm4 = m[16], m[17], m[22], m[23]
						mulps		xmm4, xmm1
						addps		xmm3, xmm4
						movhlps		xmm4, xmm3
						addps		xmm3, xmm4								// fold the high pair onto the low pair
						STORE2LO( 16, xmm3, xmm7 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
								*(mPtr+3*numColumns) * vPtr[3];
						mPtr++;
					}
					return;
				}
			}
			break;
		case 5:
			switch( numColumns ) {
				case 6: {		// 5x6 * 5x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm0, [esi+0*4]							// lanes 0-1 = v0, v1
						movlps		xmm1, [esi+2*4]							// lanes 0-1 = v2, v3
						movss		xmm2, [esi+4*4]							// lane 0 = v4
						// columns 0-3
						movaps		xmm3, xmm0
						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm3 = v0 x 4
						mulps		xmm3, [edi+(0*6+0)*4]					// xmm3 = v0 * m[0..3]
						movlps		xmm5, [edi+(1*6+0)*4]
						movhps		xmm5, [edi+(1*6+2)*4]					// xmm5 = m[6..9]
						movaps		xmm6, xmm0
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v1 x 4
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						movaps		xmm6, xmm1
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm6 = v2 x 4
						mulps		xmm6, [edi+(2*6+0)*4]					// xmm6 = v2 * m[12..15]
						addps		xmm3, xmm6
						movlps		xmm5, [edi+(3*6+0)*4]
						movhps		xmm5, [edi+(3*6+2)*4]					// xmm5 = m[18..21]
						movaps		xmm6, xmm1
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v3 x 4
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm2 = v4 x 4
						movaps		xmm4, xmm2
						mulps		xmm4, [edi+(4*6+0)*4]					// xmm4 = v4 * m[24..27]
						addps		xmm3, xmm4
						STORE4( 0, xmm3, xmm7 )
						// columns 4-5
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm0 = v0, v0, v1, v1
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm1 = v2, v2, v3, v3
						movlps		xmm3, [edi+(0*6+4)*4]
						movhps		xmm3, [edi+(1*6+4)*4]					// xmm3 = m[4], m[5], m[10], m[11]
						mulps		xmm3, xmm0
						movlps		xmm4, [edi+(2*6+4)*4]
						movhps		xmm4, [edi+(3*6+4)*4]					// xmm4 = m[16], m[17], m[22], m[23]
						mulps		xmm4, xmm1
						addps		xmm3, xmm4
						movhlps		xmm4, xmm3
						addps		xmm3, xmm4								// fold the high pair onto the low pair
						movlps		xmm5, [edi+(4*6+4)*4]					// lanes 0-1 = m[28], m[29]
						mulps		xmm5, xmm2
						addps		xmm3, xmm5
						STORE2LO( 16, xmm3, xmm7 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
						mPtr++;
					}
					return;
				}
			}
			break;
		case 6:
			switch( numColumns ) {
				case 1: {		// 6x1 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm0, [esi]
						movhps		xmm0, [esi+8]							// xmm0 = v0, v1, v2, v3
						movlps		xmm1, [esi+16]							// lanes 0-1 = v4, v5
						mulps		xmm0, [edi]								// xmm0 = p0, p1, p2, p3 with p[i] = v[i] * m[i]
						mulps		xmm1, [edi+16]							// lanes 0-1 = p4, p5, lanes 2-3 are not used
						shufps		xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )	// xmm1 = p4, p5, p3, p2
						addps		xmm0, xmm1								// xmm0 = p0+p4, p1+p5, p2+p3, p3+p2
						movhlps		xmm2, xmm0								// lane 0 = p2+p3
						addss		xmm2, xmm0
						shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )	// lane 0 = p1+p5
						addss		xmm2, xmm0
						STORE1( 0, xmm2, xmm3 )
					}
					return;
				}
				case 2: {		// 6x2 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// every 16 byte load from the matrix holds two complete rows
						movlps		xmm0, [esi+0*4]
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm0 = v0, v0, v1, v1
						movaps		xmm6, [edi+0*4]							// xmm6 = m[0..3]
						mulps		xmm6, xmm0
						movlps		xmm1, [esi+2*4]
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm1 = v2, v2, v3, v3
						movaps		xmm7, [edi+4*4]							// xmm7 = m[4..7]
						mulps		xmm7, xmm1
						addps		xmm6, xmm7
						movlps		xmm2, [esi+4*4]
						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm2 = v4, v4, v5, v5
						movaps		xmm7, [edi+8*4]							// xmm7 = m[8..11]
						mulps		xmm7, xmm2
						addps		xmm6, xmm7
						movhlps		xmm3, xmm6
						addps		xmm3, xmm6								// lanes 0-1 = even rows + odd rows
						STORE2LO( 0, xmm3, xmm7 )
					}
					return;
				}
				case 3: {		// 6x3 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// each row is expanded to ( column 0, 0, column 1, column 2 ) so the
						// result can be written with a one float store plus a high pair store
						movss		xmm0, [edi+(0*3+2)*4]
						movhps		xmm0, [edi+(0*3+0)*4]					// xmm0 = m[2], 0, m[0], m[1]
						shufps		xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )	// xmm0 = m[0], 0, m[1], m[2]
						movss		xmm6, [esi+0*4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm6 = v0 x 4
						mulps		xmm6, xmm0
						movss		xmm1, [edi+(1*3+0)*4]
						movhps		xmm1, [edi+(1*3+1)*4]					// xmm1 = m[3], 0, m[4], m[5]
						movss		xmm7, [esi+1*4]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm7 = v1 x 4
						mulps		xmm7, xmm1
						addps		xmm6, xmm7
						movss		xmm2, [edi+(2*3+2)*4]
						movhps		xmm2, [edi+(2*3+0)*4]					// xmm2 = m[8], 0, m[6], m[7]
						shufps		xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )	// xmm2 = m[6], 0, m[7], m[8]
						movss		xmm7, [esi+2*4]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm7 = v2 x 4
						mulps		xmm7, xmm2
						addps		xmm6, xmm7
						movss		xmm3, [edi+(3*3+0)*4]
						movhps		xmm3, [edi+(3*3+1)*4]					// xmm3 = m[9], 0, m[10], m[11]
						movss		xmm7, [esi+3*4]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm7 = v3 x 4
						mulps		xmm7, xmm3
						addps		xmm6, xmm7
						movss		xmm4, [edi+(4*3+2)*4]
						movhps		xmm4, [edi+(4*3+0)*4]					// xmm4 = m[14], 0, m[12], m[13]
						shufps		xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )	// xmm4 = m[12], 0, m[13], m[14]
						movss		xmm7, [esi+4*4]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm7 = v4 x 4
						mulps		xmm7, xmm4
						addps		xmm6, xmm7
						movss		xmm5, [edi+(5*3+0)*4]
						movhps		xmm5, [edi+(5*3+1)*4]					// xmm5 = m[15], 0, m[16], m[17]
						movss		xmm7, [esi+5*4]
						shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm7 = v5 x 4
						mulps		xmm7, xmm5
						addps		xmm6, xmm7
						// lane 0 -> dst[0], lanes 2-3 -> dst[1], dst[2]
						STORE1( 0, xmm6, xmm7 )
						STORE2HI( 4, xmm6, xmm7 )
					}
					return;
				}
				case 4: {		// 6x4 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// one full row per step: xmm3 += v[r] * m[4*r..4*r+3]
						movlps		xmm3, [edi+(0*4+0)*4]
						movhps		xmm3, [edi+(0*4+2)*4]
						movss		xmm4, [esi+0*4]
						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm3, xmm4
						movlps		xmm5, [edi+(1*4+0)*4]
						movhps		xmm5, [edi+(1*4+2)*4]
						movss		xmm6, [esi+1*4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						movlps		xmm4, [edi+(2*4+0)*4]
						movhps		xmm4, [edi+(2*4+2)*4]
						movss		xmm6, [esi+2*4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm4, xmm6
						addps		xmm3, xmm4
						movlps		xmm5, [edi+(3*4+0)*4]
						movhps		xmm5, [edi+(3*4+2)*4]
						movss		xmm6, [esi+3*4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						movlps		xmm4, [edi+(4*4+0)*4]
						movhps		xmm4, [edi+(4*4+2)*4]
						movss		xmm6, [esi+4*4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm4, xmm6
						addps		xmm3, xmm4
						movlps		xmm5, [edi+(5*4+0)*4]
						movhps		xmm5, [edi+(5*4+2)*4]
						movss		xmm6, [esi+5*4]
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						STORE4( 0, xmm3, xmm7 )
					}
					return;
				}
				case 5: {		// 6x5 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						// columns 0-3: xmm6 += v[r] * m[5*r..5*r+3]
						// xmm0-xmm5 keep the broadcast v0-v5 for the column 4 pass below
						movlps		xmm6, [edi+(0*5+0)*4]
						movhps		xmm6, [edi+(0*5+2)*4]
						movss		xmm0, [esi+0*4]
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm6, xmm0
						movlps		xmm7, [edi+(1*5+0)*4]
						movhps		xmm7, [edi+(1*5+2)*4]
						movss		xmm1, [esi+1*4]
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm7, xmm1
						addps		xmm6, xmm7
						movlps		xmm7, [edi+(2*5+0)*4]
						movhps		xmm7, [edi+(2*5+2)*4]
						movss		xmm2, [esi+2*4]
						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm7, xmm2
						addps		xmm6, xmm7
						movlps		xmm7, [edi+(3*5+0)*4]
						movhps		xmm7, [edi+(3*5+2)*4]
						movss		xmm3, [esi+3*4]
						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm7, xmm3
						addps		xmm6, xmm7
						movlps		xmm7, [edi+(4*5+0)*4]
						movhps		xmm7, [edi+(4*5+2)*4]
						movss		xmm4, [esi+4*4]
						shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm7, xmm4
						addps		xmm6, xmm7
						movlps		xmm7, [edi+(5*5+0)*4]
						movhps		xmm7, [edi+(5*5+2)*4]
						movss		xmm5, [esi+5*4]
						shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
						mulps		xmm7, xmm5
						addps		xmm6, xmm7
						STORE4( 0, xmm6, xmm7 )
						// column 4: scalar sum of v[r] * m[5*r+4]
						movss		xmm6, [edi+(0*5+4)*4]
						mulss		xmm6, xmm0
						movss		xmm7, [edi+(1*5+4)*4]
						mulss		xmm7, xmm1
						addss		xmm6, xmm7
						movss		xmm7, [edi+(2*5+4)*4]
						mulss		xmm7, xmm2
						addss		xmm6, xmm7
						movss		xmm7, [edi+(3*5+4)*4]
						mulss		xmm7, xmm3
						addss		xmm6, xmm7
						movss		xmm7, [edi+(4*5+4)*4]
						mulss		xmm7, xmm4
						addss		xmm6, xmm7
						movss		xmm7, [edi+(5*5+4)*4]
						mulss		xmm7, xmm5
						addss		xmm6, xmm7
						STORE1( 16, xmm6, xmm7 )
					}
					return;
				}
				case 6: {		// 6x6 * 6x1
					__asm {
						mov			esi, vPtr
						mov			edi, mPtr
						mov			eax, dstPtr
						movlps		xmm0, [esi+0*4]							// lanes 0-1 = v0, v1
						movlps		xmm1, [esi+2*4]							// lanes 0-1 = v2, v3
						movlps		xmm2, [esi+4*4]							// lanes 0-1 = v4, v5
						// columns 0-3: xmm3 += v[r] * m[6*r..6*r+3]
						movaps		xmm3, xmm0
						shufps		xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm3 = v0 x 4
						mulps		xmm3, [edi+(0*6+0)*4]
						movlps		xmm5, [edi+(1*6+0)*4]
						movhps		xmm5, [edi+(1*6+2)*4]
						movaps		xmm6, xmm0
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v1 x 4
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						movaps		xmm6, xmm1
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm6 = v2 x 4
						mulps		xmm6, [edi+(2*6+0)*4]
						addps		xmm3, xmm6
						movaps		xmm6, xmm1
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v3 x 4
						movlps		xmm5, [edi+(3*6+0)*4]
						movhps		xmm5, [edi+(3*6+2)*4]
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						movaps		xmm6, xmm2
						shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	// xmm6 = v4 x 4
						mulps		xmm6, [edi+(4*6+0)*4]
						addps		xmm3, xmm6
						movaps		xmm6, xmm2
						shufps		xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )	// xmm6 = v5 x 4
						movlps		xmm5, [edi+(5*6+0)*4]
						movhps		xmm5, [edi+(5*6+2)*4]
						mulps		xmm5, xmm6
						addps		xmm3, xmm5
						STORE4( 0, xmm3, xmm7 )
						// columns 4-5: two rows per register
						shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm0 = v0, v0, v1, v1
						shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm1 = v2, v2, v3, v3
						shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )	// xmm2 = v4, v4, v5, v5
						movlps		xmm3, [edi+(0*6+4)*4]
						movhps		xmm3, [edi+(1*6+4)*4]					// xmm3 = m[4], m[5], m[10], m[11]
						mulps		xmm3, xmm0
						movlps		xmm4, [edi+(2*6+4)*4]
						movhps		xmm4, [edi+(3*6+4)*4]					// xmm4 = m[16], m[17], m[22], m[23]
						mulps		xmm4, xmm1
						addps		xmm3, xmm4
						movlps		xmm5, [edi+(4*6+4)*4]
						movhps		xmm5, [edi+(5*6+4)*4]					// xmm5 = m[28], m[29], m[34], m[35]
						mulps		xmm5, xmm2
						addps		xmm3, xmm5
						movhlps		xmm4, xmm3
						addps		xmm3, xmm4								// fold the high pair onto the low pair
						STORE2LO( 16, xmm3, xmm7 )
					}
					return;
				}
				default: {
					for ( int i = 0; i < numColumns; i++ ) {
						dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
								*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
						mPtr++;
					}
					return;
				}
			}
			break;
		default: {
			// Generic path for every other row count. The declaration lives in its
			// own scope so that no case label can ever jump past it.
			const int numRows = mat.GetNumRows();
			for ( int i = 0; i < numColumns; i++ ) {
				// A matrix without rows contributes nothing: store the empty sum
				// instead of reading mPtr[0] / vPtr[0] out of bounds.
				float sum = 0.0f;
				if ( numRows > 0 ) {
					mPtr = mat.ToFloatPtr() + i;
					sum = mPtr[0] * vPtr[0];
					for ( int j = 1; j < numRows; j++ ) {
						mPtr += numColumns;
						sum += mPtr[0] * vPtr[j];
					}
				}
				dstPtr[i] STOREC sum;
			}
			break;
		}
	}

#undef STOREC
#undef STORE4
#undef STORE2HI
#undef STORE2LO
#undef STORE1
}
|
|
|
|
/*
============
idSIMD_SSE::MatX_TransposeMultiplyAddVecX

Adds the product of the transposed matrix and the vector to dst:

	dst[i] += mat[0][i] * vec[0] + mat[1][i] * vec[1] + ... + mat[numRows-1][i] * vec[numRows-1]

for every column i of mat. The matrix is read as a row major float array with
a row stride of mat.GetNumColumns().

optimizes the following matrix multiplications:

Nx6 * Nx1
6xN * 6x1

with N in the range [1-6]

All other sizes are handled by plain C loops.

The SSE paths read the matrix with movaps and with mulps memory operands, both
of which require a 16 byte aligned address, so mat.ToFloatPtr() has to be
16 byte aligned. NOTE(review): presumably guaranteed by the idMatX allocator,
confirm there.
The 1x6 and 6x1 paths read 8 floats from a 6 float matrix; NOTE(review): this
assumes the matrix storage is padded accordingly, confirm.
The lane comments in the code assume R_SHUFFLEPS( x, y, z, w ) selects source
lanes x, y, z, w for result lanes 0, 1, 2, 3; the macro is defined outside
this function, confirm against its definition.
============
*/
|
|
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
// Accumulating store helpers for the inline assembly below. Each one loads the
// current contents of dst at [eax+offset] into the scratch register reg2, adds
// reg1 to it and writes the result back; reg2 is clobbered.
// eax is loaded with dstPtr at the top of every __asm block that uses them.
//   STORE1   - one float (movss/addss)
//   STORE2LO - two floats, taken from lanes 0-1 of reg1
//   STORE2HI - two floats, taken from lanes 2-3 of reg1
//   STORE4   - four floats, moved as two 8 byte halves with movlps/movhps
//   STOREC   - the matching compound assignment for the plain C fallback loops
#define STORE1( offset, reg1, reg2 ) \
|
|
__asm movss reg2, [eax+offset] \
|
|
__asm addss reg2, reg1 \
|
|
__asm movss [eax+offset], reg2
|
|
#define STORE2LO( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm addps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2
|
|
#define STORE2HI( offset, reg1, reg2 ) \
|
|
__asm movhps reg2, [eax+offset] \
|
|
__asm addps reg2, reg1 \
|
|
__asm movhps [eax+offset], reg2
|
|
#define STORE4( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm movhps reg2, [eax+offset+8] \
|
|
__asm addps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2 \
|
|
__asm movhps [eax+offset+8], reg2
|
|
#define STOREC +=
|
|
|
|
int numColumns;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
// vec supplies one element per matrix row, dst receives one element per matrix column
assert( vec.GetSize() >= mat.GetNumRows() );
|
|
assert( dst.GetSize() >= mat.GetNumColumns() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numColumns = mat.GetNumColumns();
|
|
// outer switch: number of matrix rows, inner switches: number of matrix columns
switch( mat.GetNumRows() ) {
|
|
case 1:
|
|
switch( numColumns ) {
|
|
case 6: { // 1x6 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// broadcast vec[0], then scale matrix floats 0-3 (xmm0) and 4-7 (xmm1)
movss xmm0, [esi]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm1, xmm0
|
|
mulps xmm0, [edi]
|
|
// full 16 byte read: floats 6 and 7 lie past the 6 float row, only lanes 0-1 are stored
mulps xmm1, [edi+16]
|
|
STORE4( 0, xmm0, xmm2 )
|
|
STORE2LO( 16, xmm1, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic single row case: dst[i] += mat[0][i] * vec[0]
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 2:
|
|
switch( numColumns ) {
|
|
case 6: { // 2x6 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0] broadcast, xmm1 = vec[1] broadcast
movlps xmm0, [esi]
|
|
movaps xmm1, xmm0
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
// columns 0-3: row 0 * vec[0] + row 1 * vec[1] in xmm2
movaps xmm2, [edi]
|
|
mulps xmm2, xmm0
|
|
movlps xmm3, [edi+24]
|
|
movhps xmm3, [edi+32]
|
|
mulps xmm3, xmm1
|
|
addps xmm2, xmm3
|
|
// columns 4-5: both rows are packed into xmm4 and folded with movhlps
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm4, [edi+16]
|
|
movhps xmm4, [edi+40]
|
|
mulps xmm4, xmm0
|
|
movhlps xmm3, xmm4
|
|
addps xmm3, xmm4
|
|
STORE4( 0, xmm2, xmm5 )
|
|
STORE2LO( 16, xmm3, xmm6 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic two row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 3:
|
|
switch( numColumns ) {
|
|
case 6: { // 3x6 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0], vec[1] in the low half, xmm1 = vec[2]
movlps xmm0, [esi+0*4]
|
|
movss xmm1, [esi+2*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movlps xmm3, [edi+(0*6+0)*4]
|
|
movhps xmm3, [edi+(0*6+2)*4]
|
|
movaps xmm4, xmm0
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, xmm4
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(2*6+0)*4]
|
|
movhps xmm4, [edi+(2*6+2)*4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: rows 0 and 1 are packed into xmm3 and folded with movhlps,
// row 2 is added from the low half of xmm5
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(2*6+4)*4]
|
|
mulps xmm5, xmm1
|
|
addps xmm3, xmm5
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic three row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 4:
|
|
switch( numColumns ) {
|
|
case 6: { // 4x6 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0], vec[1] and xmm1 = vec[2], vec[3] in their low halves
movlps xmm0, [esi+0*4]
|
|
movlps xmm1, [esi+2*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, [edi+(0*6+0)*4]
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(2*6+0)*4]
|
|
movhps xmm4, [edi+(2*6+2)*4]
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm6
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(3*6+0)*4]
|
|
movhps xmm5, [edi+(3*6+2)*4]
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: two rows are packed per register and folded with movhlps
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movlps xmm4, [edi+(2*6+4)*4]
|
|
movhps xmm4, [edi+(3*6+4)*4]
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic four row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 5:
|
|
switch( numColumns ) {
|
|
case 6: { // 5x6 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0], vec[1] and xmm1 = vec[2], vec[3] in their low halves, xmm2 = vec[4]
movlps xmm0, [esi+0*4]
|
|
movlps xmm1, [esi+2*4]
|
|
movss xmm2, [esi+4*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, [edi+(0*6+0)*4]
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, [edi+(2*6+0)*4]
|
|
addps xmm3, xmm6
|
|
movlps xmm5, [edi+(3*6+0)*4]
|
|
movhps xmm5, [edi+(3*6+2)*4]
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm4, xmm2
|
|
mulps xmm4, [edi+(4*6+0)*4]
|
|
addps xmm3, xmm4
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: rows 0-3 are packed two per register and folded with movhlps,
// row 4 is added from the low half of xmm5
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movlps xmm4, [edi+(2*6+4)*4]
|
|
movhps xmm4, [edi+(3*6+4)*4]
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(4*6+4)*4]
|
|
mulps xmm5, xmm2
|
|
addps xmm3, xmm5
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic five row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 6:
|
|
switch( numColumns ) {
|
|
case 1: { // 6x1 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// six element dot product: vec[0..3] * mat[0..3] in xmm0,
// vec[4..5] * mat[4..5] in the low half of xmm1
movlps xmm0, [esi]
|
|
movhps xmm0, [esi+8]
|
|
movlps xmm1, [esi+16]
|
|
mulps xmm0, [edi]
|
|
// full 16 byte read: floats 6 and 7 lie past the 6 float matrix, their lanes are discarded by the shufps below
mulps xmm1, [edi+16]
|
|
// horizontal reduction of the six products into lane 0 of xmm2
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
|
|
addps xmm0, xmm1
|
|
movhlps xmm2, xmm0
|
|
addss xmm2, xmm0
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
addss xmm2, xmm0
|
|
STORE1( 0, xmm2, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 2: { // 6x2 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// each movaps loads two 2 float rows; the matching pair of vec elements
// is duplicated so that every row is scaled by its own vec[j]
movlps xmm0, [esi+0*4]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movaps xmm6, [edi+0*4]
|
|
mulps xmm6, xmm0
|
|
movlps xmm1, [esi+2*4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movaps xmm7, [edi+4*4]
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movlps xmm2, [esi+4*4]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movaps xmm7, [edi+8*4]
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
// fold the odd rows (high half) onto the even rows (low half)
movhlps xmm3, xmm6
|
|
addps xmm3, xmm6
|
|
STORE2LO( 0, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 3: { // 6x3 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// every 3 float row is loaded as one scalar plus one 8 byte half so that the
// column sums end up in lanes 0, 2 and 3 of xmm6; lane 1 carries no result
movss xmm0, [edi+(0*3+2)*4]
|
|
movhps xmm0, [edi+(0*3+0)*4]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
|
|
movss xmm6, [esi+0*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, xmm0
|
|
movss xmm1, [edi+(1*3+0)*4]
|
|
movhps xmm1, [edi+(1*3+1)*4]
|
|
movss xmm7, [esi+1*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movss xmm2, [edi+(2*3+2)*4]
|
|
movhps xmm2, [edi+(2*3+0)*4]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
|
|
movss xmm7, [esi+2*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
movss xmm3, [edi+(3*3+0)*4]
|
|
movhps xmm3, [edi+(3*3+1)*4]
|
|
movss xmm7, [esi+3*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm3
|
|
addps xmm6, xmm7
|
|
movss xmm4, [edi+(4*3+2)*4]
|
|
movhps xmm4, [edi+(4*3+0)*4]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
|
|
movss xmm7, [esi+4*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm4
|
|
addps xmm6, xmm7
|
|
movss xmm5, [edi+(5*3+0)*4]
|
|
movhps xmm5, [edi+(5*3+1)*4]
|
|
movss xmm7, [esi+5*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm5
|
|
addps xmm6, xmm7
|
|
// lane 0 -> dst[0], lanes 2-3 -> dst[1], dst[2]
STORE1( 0, xmm6, xmm7 )
|
|
STORE2HI( 4, xmm6, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 4: { // 6x4 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// one 4 float row per register, scaled by the broadcast vec[j] and summed in xmm3
movlps xmm3, [edi+(0*4+0)*4]
|
|
movhps xmm3, [edi+(0*4+2)*4]
|
|
movss xmm4, [esi+0*4]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, xmm4
|
|
movlps xmm5, [edi+(1*4+0)*4]
|
|
movhps xmm5, [edi+(1*4+2)*4]
|
|
movss xmm6, [esi+1*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(2*4+0)*4]
|
|
movhps xmm4, [edi+(2*4+2)*4]
|
|
movss xmm6, [esi+2*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm6
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(3*4+0)*4]
|
|
movhps xmm5, [edi+(3*4+2)*4]
|
|
movss xmm6, [esi+3*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(4*4+0)*4]
|
|
movhps xmm4, [edi+(4*4+2)*4]
|
|
movss xmm6, [esi+4*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm6
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(5*4+0)*4]
|
|
movhps xmm5, [edi+(5*4+2)*4]
|
|
movss xmm6, [esi+5*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
STORE4( 0, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 5: { // 6x5 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// columns 0-3: summed in xmm6; the broadcast vec[0..5] stay in xmm0-xmm5
// and are reused for column 4 below
movlps xmm6, [edi+(0*5+0)*4]
|
|
movhps xmm6, [edi+(0*5+2)*4]
|
|
movss xmm0, [esi+0*4]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, xmm0
|
|
movlps xmm7, [edi+(1*5+0)*4]
|
|
movhps xmm7, [edi+(1*5+2)*4]
|
|
movss xmm1, [esi+1*4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(2*5+0)*4]
|
|
movhps xmm7, [edi+(2*5+2)*4]
|
|
movss xmm2, [esi+2*4]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(3*5+0)*4]
|
|
movhps xmm7, [edi+(3*5+2)*4]
|
|
movss xmm3, [esi+3*4]
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm3
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(4*5+0)*4]
|
|
movhps xmm7, [edi+(4*5+2)*4]
|
|
movss xmm4, [esi+4*4]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm4
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(5*5+0)*4]
|
|
movhps xmm7, [edi+(5*5+2)*4]
|
|
movss xmm5, [esi+5*4]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm5
|
|
addps xmm6, xmm7
|
|
STORE4( 0, xmm6, xmm7 )
|
|
// column 4: scalar sum of mat[j][4] * vec[j]
movss xmm6, [edi+(0*5+4)*4]
|
|
mulss xmm6, xmm0
|
|
movss xmm7, [edi+(1*5+4)*4]
|
|
mulss xmm7, xmm1
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(2*5+4)*4]
|
|
mulss xmm7, xmm2
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(3*5+4)*4]
|
|
mulss xmm7, xmm3
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(4*5+4)*4]
|
|
mulss xmm7, xmm4
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(5*5+4)*4]
|
|
mulss xmm7, xmm5
|
|
addss xmm6, xmm7
|
|
STORE1( 16, xmm6, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0, xmm1, xmm2 hold vec[0..1], vec[2..3], vec[4..5] in their low halves
movlps xmm0, [esi+0*4]
|
|
movlps xmm1, [esi+2*4]
|
|
movlps xmm2, [esi+4*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, [edi+(0*6+0)*4]
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, [edi+(2*6+0)*4]
|
|
addps xmm3, xmm6
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
movlps xmm5, [edi+(3*6+0)*4]
|
|
movhps xmm5, [edi+(3*6+2)*4]
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movaps xmm6, xmm2
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, [edi+(4*6+0)*4]
|
|
addps xmm3, xmm6
|
|
movaps xmm6, xmm2
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
movlps xmm5, [edi+(5*6+0)*4]
|
|
movhps xmm5, [edi+(5*6+2)*4]
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: two rows are packed per register and folded with movhlps
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movlps xmm4, [edi+(2*6+4)*4]
|
|
movhps xmm4, [edi+(3*6+4)*4]
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(4*6+4)*4]
|
|
movhps xmm5, [edi+(5*6+4)*4]
|
|
mulps xmm5, xmm2
|
|
addps xmm3, xmm5
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic six row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
default:
|
|
// any row count other than 1-6: walk down each matrix column with a stride of numColumns
// NOTE(review): mPtr[0] and vPtr[0] are read before numRows is checked, so a matrix
// with zero rows would still be dereferenced here - confirm callers never pass one
int numRows = mat.GetNumRows();
|
|
for ( int i = 0; i < numColumns; i++ ) {
|
|
mPtr = mat.ToFloatPtr() + i;
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( int j = 1; j < numRows; j++ ) {
|
|
mPtr += numColumns;
|
|
sum += mPtr[0] * vPtr[j];
|
|
}
|
|
dstPtr[i] STOREC sum;
|
|
}
|
|
break;
|
|
}
|
|
|
|
// the store helpers are local to this function
#undef STOREC
|
|
#undef STORE4
|
|
#undef STORE2HI
|
|
#undef STORE2LO
|
|
#undef STORE1
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::MatX_TransposeMultiplySubVecX

Subtracts the product of the transposed matrix and the vector from dst:

	dst[i] -= mat[0][i] * vec[0] + mat[1][i] * vec[1] + ... + mat[numRows-1][i] * vec[numRows-1]

for every column i of mat. The matrix is read as a row major float array with
a row stride of mat.GetNumColumns().

optimizes the following matrix multiplications:

Nx6 * Nx1
6xN * 6x1

with N in the range [1-6]

All other sizes are handled by plain C loops.

The SSE paths read the matrix with movaps and with mulps memory operands, both
of which require a 16 byte aligned address, so mat.ToFloatPtr() has to be
16 byte aligned. NOTE(review): presumably guaranteed by the idMatX allocator,
confirm there.
The 1x6 and 6x1 paths read 8 floats from a 6 float matrix; NOTE(review): this
assumes the matrix storage is padded accordingly, confirm.
The lane comments in the code assume R_SHUFFLEPS( x, y, z, w ) selects source
lanes x, y, z, w for result lanes 0, 1, 2, 3; the macro is defined outside
this function, confirm against its definition.
============
*/
|
|
void VPCALL idSIMD_SSE::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
// Subtracting store helpers for the inline assembly below. Each one loads the
// current contents of dst at [eax+offset] into the scratch register reg2,
// subtracts reg1 from it and writes the result back; reg2 is clobbered.
// eax is loaded with dstPtr at the top of every __asm block that uses them.
//   STORE1   - one float (movss/subss)
//   STORE2LO - two floats, taken from lanes 0-1 of reg1
//   STORE2HI - two floats, taken from lanes 2-3 of reg1
//   STORE4   - four floats, moved as two 8 byte halves with movlps/movhps
//   STOREC   - the matching compound assignment for the plain C fallback loops
#define STORE1( offset, reg1, reg2 ) \
|
|
__asm movss reg2, [eax+offset] \
|
|
__asm subss reg2, reg1 \
|
|
__asm movss [eax+offset], reg2
|
|
#define STORE2LO( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm subps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2
|
|
#define STORE2HI( offset, reg1, reg2 ) \
|
|
__asm movhps reg2, [eax+offset] \
|
|
__asm subps reg2, reg1 \
|
|
__asm movhps [eax+offset], reg2
|
|
#define STORE4( offset, reg1, reg2 ) \
|
|
__asm movlps reg2, [eax+offset] \
|
|
__asm movhps reg2, [eax+offset+8] \
|
|
__asm subps reg2, reg1 \
|
|
__asm movlps [eax+offset], reg2 \
|
|
__asm movhps [eax+offset+8], reg2
|
|
#define STOREC -=
|
|
|
|
int numColumns;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
// vec supplies one element per matrix row, dst receives one element per matrix column
assert( vec.GetSize() >= mat.GetNumRows() );
|
|
assert( dst.GetSize() >= mat.GetNumColumns() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numColumns = mat.GetNumColumns();
|
|
// outer switch: number of matrix rows, inner switches: number of matrix columns
switch( mat.GetNumRows() ) {
|
|
case 1:
|
|
switch( numColumns ) {
|
|
case 6: { // 1x6 * 1x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// broadcast vec[0], then scale matrix floats 0-3 (xmm0) and 4-7 (xmm1)
movss xmm0, [esi]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm1, xmm0
|
|
mulps xmm0, [edi]
|
|
// full 16 byte read: floats 6 and 7 lie past the 6 float row, only lanes 0-1 are stored
mulps xmm1, [edi+16]
|
|
STORE4( 0, xmm0, xmm2 )
|
|
STORE2LO( 16, xmm1, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic single row case: dst[i] -= mat[0][i] * vec[0]
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 2:
|
|
switch( numColumns ) {
|
|
case 6: { // 2x6 * 2x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0] broadcast, xmm1 = vec[1] broadcast
movlps xmm0, [esi]
|
|
movaps xmm1, xmm0
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
// columns 0-3: row 0 * vec[0] + row 1 * vec[1] in xmm2
movaps xmm2, [edi]
|
|
mulps xmm2, xmm0
|
|
movlps xmm3, [edi+24]
|
|
movhps xmm3, [edi+32]
|
|
mulps xmm3, xmm1
|
|
addps xmm2, xmm3
|
|
// columns 4-5: both rows are packed into xmm4 and folded with movhlps
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm4, [edi+16]
|
|
movhps xmm4, [edi+40]
|
|
mulps xmm4, xmm0
|
|
movhlps xmm3, xmm4
|
|
addps xmm3, xmm4
|
|
STORE4( 0, xmm2, xmm5 )
|
|
STORE2LO( 16, xmm3, xmm6 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic two row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 3:
|
|
switch( numColumns ) {
|
|
case 6: { // 3x6 * 3x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0], vec[1] in the low half, xmm1 = vec[2]
movlps xmm0, [esi+0*4]
|
|
movss xmm1, [esi+2*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movlps xmm3, [edi+(0*6+0)*4]
|
|
movhps xmm3, [edi+(0*6+2)*4]
|
|
movaps xmm4, xmm0
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, xmm4
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(2*6+0)*4]
|
|
movhps xmm4, [edi+(2*6+2)*4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: rows 0 and 1 are packed into xmm3 and folded with movhlps,
// row 2 is added from the low half of xmm5
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(2*6+4)*4]
|
|
mulps xmm5, xmm1
|
|
addps xmm3, xmm5
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic three row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 4:
|
|
switch( numColumns ) {
|
|
case 6: { // 4x6 * 4x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0], vec[1] and xmm1 = vec[2], vec[3] in their low halves
movlps xmm0, [esi+0*4]
|
|
movlps xmm1, [esi+2*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, [edi+(0*6+0)*4]
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(2*6+0)*4]
|
|
movhps xmm4, [edi+(2*6+2)*4]
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm6
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(3*6+0)*4]
|
|
movhps xmm5, [edi+(3*6+2)*4]
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: two rows are packed per register and folded with movhlps
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movlps xmm4, [edi+(2*6+4)*4]
|
|
movhps xmm4, [edi+(3*6+4)*4]
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic four row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 5:
|
|
switch( numColumns ) {
|
|
case 6: { // 5x6 * 5x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0 = vec[0], vec[1] and xmm1 = vec[2], vec[3] in their low halves, xmm2 = vec[4]
movlps xmm0, [esi+0*4]
|
|
movlps xmm1, [esi+2*4]
|
|
movss xmm2, [esi+4*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, [edi+(0*6+0)*4]
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, [edi+(2*6+0)*4]
|
|
addps xmm3, xmm6
|
|
movlps xmm5, [edi+(3*6+0)*4]
|
|
movhps xmm5, [edi+(3*6+2)*4]
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm4, xmm2
|
|
mulps xmm4, [edi+(4*6+0)*4]
|
|
addps xmm3, xmm4
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: rows 0-3 are packed two per register and folded with movhlps,
// row 4 is added from the low half of xmm5
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movlps xmm4, [edi+(2*6+4)*4]
|
|
movhps xmm4, [edi+(3*6+4)*4]
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(4*6+4)*4]
|
|
mulps xmm5, xmm2
|
|
addps xmm3, xmm5
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic five row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
case 6:
|
|
switch( numColumns ) {
|
|
case 1: { // 6x1 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// six element dot product: vec[0..3] * mat[0..3] in xmm0,
// vec[4..5] * mat[4..5] in the low half of xmm1
movlps xmm0, [esi]
|
|
movhps xmm0, [esi+8]
|
|
movlps xmm1, [esi+16]
|
|
mulps xmm0, [edi]
|
|
// full 16 byte read: floats 6 and 7 lie past the 6 float matrix, their lanes are discarded by the shufps below
mulps xmm1, [edi+16]
|
|
// horizontal reduction of the six products into lane 0 of xmm2
shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 3, 2 )
|
|
addps xmm0, xmm1
|
|
movhlps xmm2, xmm0
|
|
addss xmm2, xmm0
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
addss xmm2, xmm0
|
|
STORE1( 0, xmm2, xmm3 )
|
|
}
|
|
return;
|
|
}
|
|
case 2: { // 6x2 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// each movaps loads two 2 float rows; the matching pair of vec elements
// is duplicated so that every row is scaled by its own vec[j]
movlps xmm0, [esi+0*4]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movaps xmm6, [edi+0*4]
|
|
mulps xmm6, xmm0
|
|
movlps xmm1, [esi+2*4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movaps xmm7, [edi+4*4]
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movlps xmm2, [esi+4*4]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movaps xmm7, [edi+8*4]
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
// fold the odd rows (high half) onto the even rows (low half)
movhlps xmm3, xmm6
|
|
addps xmm3, xmm6
|
|
STORE2LO( 0, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 3: { // 6x3 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// every 3 float row is loaded as one scalar plus one 8 byte half so that the
// column sums end up in lanes 0, 2 and 3 of xmm6; lane 1 carries no result
movss xmm0, [edi+(0*3+2)*4]
|
|
movhps xmm0, [edi+(0*3+0)*4]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 1, 3, 0 )
|
|
movss xmm6, [esi+0*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, xmm0
|
|
movss xmm1, [edi+(1*3+0)*4]
|
|
movhps xmm1, [edi+(1*3+1)*4]
|
|
movss xmm7, [esi+1*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movss xmm2, [edi+(2*3+2)*4]
|
|
movhps xmm2, [edi+(2*3+0)*4]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 1, 3, 0 )
|
|
movss xmm7, [esi+2*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
movss xmm3, [edi+(3*3+0)*4]
|
|
movhps xmm3, [edi+(3*3+1)*4]
|
|
movss xmm7, [esi+3*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm3
|
|
addps xmm6, xmm7
|
|
movss xmm4, [edi+(4*3+2)*4]
|
|
movhps xmm4, [edi+(4*3+0)*4]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 1, 3, 0 )
|
|
movss xmm7, [esi+4*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm4
|
|
addps xmm6, xmm7
|
|
movss xmm5, [edi+(5*3+0)*4]
|
|
movhps xmm5, [edi+(5*3+1)*4]
|
|
movss xmm7, [esi+5*4]
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm5
|
|
addps xmm6, xmm7
|
|
// lane 0 -> dst[0], lanes 2-3 -> dst[1], dst[2]
STORE1( 0, xmm6, xmm7 )
|
|
STORE2HI( 4, xmm6, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 4: { // 6x4 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// one 4 float row per register, scaled by the broadcast vec[j] and summed in xmm3
movlps xmm3, [edi+(0*4+0)*4]
|
|
movhps xmm3, [edi+(0*4+2)*4]
|
|
movss xmm4, [esi+0*4]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, xmm4
|
|
movlps xmm5, [edi+(1*4+0)*4]
|
|
movhps xmm5, [edi+(1*4+2)*4]
|
|
movss xmm6, [esi+1*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(2*4+0)*4]
|
|
movhps xmm4, [edi+(2*4+2)*4]
|
|
movss xmm6, [esi+2*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm6
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(3*4+0)*4]
|
|
movhps xmm5, [edi+(3*4+2)*4]
|
|
movss xmm6, [esi+3*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movlps xmm4, [edi+(4*4+0)*4]
|
|
movhps xmm4, [edi+(4*4+2)*4]
|
|
movss xmm6, [esi+4*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm4, xmm6
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(5*4+0)*4]
|
|
movhps xmm5, [edi+(5*4+2)*4]
|
|
movss xmm6, [esi+5*4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
STORE4( 0, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 5: { // 6x5 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// columns 0-3: summed in xmm6; the broadcast vec[0..5] stay in xmm0-xmm5
// and are reused for column 4 below
movlps xmm6, [edi+(0*5+0)*4]
|
|
movhps xmm6, [edi+(0*5+2)*4]
|
|
movss xmm0, [esi+0*4]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, xmm0
|
|
movlps xmm7, [edi+(1*5+0)*4]
|
|
movhps xmm7, [edi+(1*5+2)*4]
|
|
movss xmm1, [esi+1*4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(2*5+0)*4]
|
|
movhps xmm7, [edi+(2*5+2)*4]
|
|
movss xmm2, [esi+2*4]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(3*5+0)*4]
|
|
movhps xmm7, [edi+(3*5+2)*4]
|
|
movss xmm3, [esi+3*4]
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm3
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(4*5+0)*4]
|
|
movhps xmm7, [edi+(4*5+2)*4]
|
|
movss xmm4, [esi+4*4]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm4
|
|
addps xmm6, xmm7
|
|
movlps xmm7, [edi+(5*5+0)*4]
|
|
movhps xmm7, [edi+(5*5+2)*4]
|
|
movss xmm5, [esi+5*4]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm5
|
|
addps xmm6, xmm7
|
|
STORE4( 0, xmm6, xmm7 )
|
|
// column 4: scalar sum of mat[j][4] * vec[j]
movss xmm6, [edi+(0*5+4)*4]
|
|
mulss xmm6, xmm0
|
|
movss xmm7, [edi+(1*5+4)*4]
|
|
mulss xmm7, xmm1
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(2*5+4)*4]
|
|
mulss xmm7, xmm2
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(3*5+4)*4]
|
|
mulss xmm7, xmm3
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(4*5+4)*4]
|
|
mulss xmm7, xmm4
|
|
addss xmm6, xmm7
|
|
movss xmm7, [edi+(5*5+4)*4]
|
|
mulss xmm7, xmm5
|
|
addss xmm6, xmm7
|
|
STORE1( 16, xmm6, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x6 * 6x1
|
|
__asm {
|
|
mov esi, vPtr
|
|
mov edi, mPtr
|
|
mov eax, dstPtr
|
|
// xmm0, xmm1, xmm2 hold vec[0..1], vec[2..3], vec[4..5] in their low halves
movlps xmm0, [esi+0*4]
|
|
movlps xmm1, [esi+2*4]
|
|
movlps xmm2, [esi+4*4]
|
|
// columns 0-3: sum of row[j][0..3] * vec[j] in xmm3
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm3, [edi+(0*6+0)*4]
|
|
movlps xmm5, [edi+(1*6+0)*4]
|
|
movhps xmm5, [edi+(1*6+2)*4]
|
|
movaps xmm6, xmm0
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, [edi+(2*6+0)*4]
|
|
addps xmm3, xmm6
|
|
movaps xmm6, xmm1
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
movlps xmm5, [edi+(3*6+0)*4]
|
|
movhps xmm5, [edi+(3*6+2)*4]
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
movaps xmm6, xmm2
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, [edi+(4*6+0)*4]
|
|
addps xmm3, xmm6
|
|
movaps xmm6, xmm2
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
movlps xmm5, [edi+(5*6+0)*4]
|
|
movhps xmm5, [edi+(5*6+2)*4]
|
|
mulps xmm5, xmm6
|
|
addps xmm3, xmm5
|
|
STORE4( 0, xmm3, xmm7 )
|
|
// columns 4-5: two rows are packed per register and folded with movhlps
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movlps xmm3, [edi+(0*6+4)*4]
|
|
movhps xmm3, [edi+(1*6+4)*4]
|
|
mulps xmm3, xmm0
|
|
movlps xmm4, [edi+(2*6+4)*4]
|
|
movhps xmm4, [edi+(3*6+4)*4]
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movlps xmm5, [edi+(4*6+4)*4]
|
|
movhps xmm5, [edi+(5*6+4)*4]
|
|
mulps xmm5, xmm2
|
|
addps xmm3, xmm5
|
|
movhlps xmm4, xmm3
|
|
addps xmm3, xmm4
|
|
STORE2LO( 16, xmm3, xmm7 )
|
|
}
|
|
return;
|
|
}
|
|
default: {
|
|
// generic six row case, one dst element per matrix column
for ( int i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] STOREC *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
|
|
mPtr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
// not reached: every inner case above returns
break;
|
|
default:
|
|
// any row count other than 1-6: walk down each matrix column with a stride of numColumns
// NOTE(review): mPtr[0] and vPtr[0] are read before numRows is checked, so a matrix
// with zero rows would still be dereferenced here - confirm callers never pass one
int numRows = mat.GetNumRows();
|
|
for ( int i = 0; i < numColumns; i++ ) {
|
|
mPtr = mat.ToFloatPtr() + i;
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( int j = 1; j < numRows; j++ ) {
|
|
mPtr += numColumns;
|
|
sum += mPtr[0] * vPtr[j];
|
|
}
|
|
dstPtr[i] STOREC sum;
|
|
}
|
|
break;
|
|
}
|
|
|
|
// the store helpers are local to this function
#undef STOREC
|
|
#undef STORE4
|
|
#undef STORE2HI
|
|
#undef STORE2LO
|
|
#undef STORE1
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MatX_MultiplyMatX
|
|
|
|
optimizes the following matrix multiplications:
|
|
|
|
NxN * Nx6
|
|
6xN * Nx6
|
|
Nx6 * 6xN
|
|
6x6 * 6xN
|
|
|
|
with N in the range [1-6].
|
|
|
|
The hot cache clock cycle counts are generally better for the SIMD version than the
|
|
FPU version. At times up to 40% fewer clock cycles on a P3. In practice, however,
|
|
the results are poor probably due to memory access.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
|
|
int i, j, k, l, n;
|
|
float *dstPtr;
|
|
const float *m1Ptr, *m2Ptr;
|
|
double sum;
|
|
|
|
assert( m1.GetNumColumns() == m2.GetNumRows() );
|
|
|
|
dstPtr = dst.ToFloatPtr();
|
|
m1Ptr = m1.ToFloatPtr();
|
|
m2Ptr = m2.ToFloatPtr();
|
|
k = m1.GetNumRows();
|
|
l = m2.GetNumColumns();
|
|
n = m1.GetNumColumns();
|
|
|
|
switch( n ) {
|
|
case 1: {
|
|
if ( !(l^6) ) {
|
|
switch( k ) {
|
|
case 1: { // 1x1 * 1x6, no precision loss compared to FPU version
|
|
__asm {
|
|
mov esi, m2Ptr
|
|
mov edi, m1Ptr
|
|
mov eax, dstPtr
|
|
movss xmm0, [edi]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm1, [esi]
|
|
mulps xmm1, xmm0
|
|
movaps [eax], xmm1
|
|
movlps xmm2, [esi+16]
|
|
mulps xmm2, xmm0
|
|
movlps [eax+16], xmm2
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x1 * 1x6, no precision loss compared to FPU version
|
|
__asm {
|
|
mov esi, m2Ptr
|
|
mov edi, m1Ptr
|
|
mov eax, dstPtr
|
|
xorps xmm1, xmm1
|
|
movaps xmm0, [edi]
|
|
movlps xmm1, [edi+16]
|
|
movlhps xmm1, xmm0
|
|
movhlps xmm2, xmm0
|
|
movlhps xmm2, xmm1
|
|
// row 0 and 1
|
|
movaps xmm3, [esi]
|
|
movaps xmm4, xmm3
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm5, xmm3
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
movaps xmm6, xmm3
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm6, xmm2
|
|
movaps [eax], xmm4
|
|
movaps [eax+16], xmm5
|
|
movaps [eax+32], xmm6
|
|
// row 2 and 3
|
|
movaps xmm4, xmm3
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 )
|
|
movaps xmm5, xmm3
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm3, xmm2
|
|
movaps [eax+48], xmm4
|
|
movaps [eax+64], xmm5
|
|
movaps [eax+80], xmm3
|
|
// row 4 and 5
|
|
movlps xmm3, [esi+16]
|
|
movaps xmm4, xmm3
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm5, xmm3
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm3, xmm2
|
|
movaps [eax+96], xmm4
|
|
movaps [eax+112], xmm5
|
|
movaps [eax+128], xmm3
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
}
|
|
case 2: {
|
|
if ( !(l^6) ) {
|
|
switch( k ) {
|
|
case 2: { // 2x2 * 2x6
|
|
|
|
#define MUL_Nx2_2x6_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movaps xmm0, [esi] \
|
|
__asm movlps xmm1, [esi+16] \
|
|
__asm movhps xmm1, [esi+40] \
|
|
__asm movlps xmm2, [esi+24] \
|
|
__asm movhps xmm2, [esi+32]
|
|
|
|
// Computes two consecutive destination rows of an Nx2 * 2x6 product.
// Reads four m1 floats (two rows of two) at edi+row*16 and writes twelve
// floats (two rows of six) starting at eax+row*48.
// Expects xmm0-xmm2 as loaded by MUL_Nx2_2x6_INIT; xmm3-xmm7 are scratch.
// Columns 0-3 of each row are built in xmm5; columns 4-5 of both rows are
// built together in xmm6/xmm7 and recombined with movlhps/movhlps.
#define MUL_Nx2_2x6_ROW2( row ) \
|
|
__asm movaps xmm3, [edi+row*16] \
|
|
__asm movaps xmm5, xmm0 \
|
|
__asm movaps xmm4, xmm3 \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm5, xmm4 \
|
|
__asm movaps xmm4, xmm3 \
|
|
__asm movaps xmm6, xmm2 \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 1, 1 ) \
|
|
__asm mulps xmm6, xmm4 \
|
|
__asm addps xmm5, xmm6 \
|
|
__asm movaps [eax+row*48], xmm5 \
|
|
__asm movaps xmm4, xmm3 \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 ) \
|
|
__asm movaps xmm7, xmm1 \
|
|
__asm mulps xmm7, xmm4 \
|
|
__asm movaps xmm4, xmm3 \
|
|
__asm movaps xmm5, xmm0 \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 2, 2 ) \
|
|
__asm mulps xmm5, xmm4 \
|
|
__asm movaps xmm4, xmm3 \
|
|
__asm movaps xmm6, xmm2 \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 3, 3, 3, 3 ) \
|
|
__asm mulps xmm6, xmm4 \
|
|
__asm addps xmm5, xmm6 \
|
|
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 ) \
|
|
__asm movaps xmm6, xmm1 \
|
|
__asm mulps xmm6, xmm3 \
|
|
__asm movaps xmm4, xmm7 \
|
|
__asm movlhps xmm7, xmm6 \
|
|
__asm movhlps xmm6, xmm4 \
|
|
__asm addps xmm6, xmm7 \
|
|
__asm movlps [eax+row*48+16], xmm6 \
|
|
__asm movlps [eax+row*48+24], xmm5 \
|
|
__asm movhps [eax+row*48+32], xmm5 \
|
|
__asm movhps [eax+row*48+40], xmm6
|
|
|
|
MUL_Nx2_2x6_INIT
|
|
MUL_Nx2_2x6_ROW2( 0 )
|
|
|
|
return;
|
|
}
|
|
case 6: { // 6x2 * 2x6
|
|
|
|
MUL_Nx2_2x6_INIT
|
|
MUL_Nx2_2x6_ROW2( 0 )
|
|
MUL_Nx2_2x6_ROW2( 1 )
|
|
MUL_Nx2_2x6_ROW2( 2 )
|
|
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 2;
|
|
}
|
|
break;
|
|
}
|
|
case 3: {
|
|
if ( !(l^6) ) {
|
|
switch( k ) {
|
|
case 3: { // 3x3 * 3x6
|
|
__asm {
|
|
mov esi, m2Ptr
|
|
mov edi, m1Ptr
|
|
mov eax, dstPtr
|
|
movaps xmm5, xmmword ptr [esi]
|
|
movlps xmm6, qword ptr [esi+24]
|
|
movhps xmm6, qword ptr [esi+32]
|
|
movaps xmm7, xmmword ptr [esi+48]
|
|
movss xmm0, dword ptr [edi]
|
|
shufps xmm0, xmm0, 0
|
|
mulps xmm0, xmm5
|
|
movss xmm1, dword ptr [edi+4]
|
|
shufps xmm1, xmm1, 0
|
|
mulps xmm1, xmm6
|
|
movss xmm2, dword ptr [edi+8]
|
|
shufps xmm2, xmm2, 0
|
|
mulps xmm2, xmm7
|
|
addps xmm0, xmm1
|
|
addps xmm0, xmm2
|
|
movaps xmmword ptr [eax], xmm0
|
|
movss xmm3, dword ptr [edi+12]
|
|
shufps xmm3, xmm3, 0
|
|
mulps xmm3, xmm5
|
|
movss xmm4, dword ptr [edi+16]
|
|
shufps xmm4, xmm4, 0
|
|
mulps xmm4, xmm6
|
|
movss xmm0, dword ptr [edi+20]
|
|
shufps xmm0, xmm0, 0
|
|
mulps xmm0, xmm7
|
|
addps xmm3, xmm4
|
|
addps xmm0, xmm3
|
|
movlps qword ptr [eax+24], xmm0
|
|
movhps qword ptr [eax+32], xmm0
|
|
movss xmm1, dword ptr [edi+24]
|
|
shufps xmm1, xmm1, 0
|
|
mulps xmm1, xmm5
|
|
movss xmm2, dword ptr [edi+28]
|
|
shufps xmm2, xmm2, 0
|
|
mulps xmm2, xmm6
|
|
movss xmm3, dword ptr [edi+32]
|
|
shufps xmm3, xmm3, 0
|
|
mulps xmm3, xmm7
|
|
addps xmm1, xmm2
|
|
addps xmm1, xmm3
|
|
movaps xmmword ptr [eax+48], xmm1
|
|
movlps xmm5, qword ptr [esi+16]
|
|
movlps xmm6, qword ptr [esi+40]
|
|
movlps xmm7, qword ptr [esi+64]
|
|
shufps xmm5, xmm5, 0x44
|
|
shufps xmm6, xmm6, 0x44
|
|
shufps xmm7, xmm7, 0x44
|
|
movaps xmm3, xmmword ptr [edi]
|
|
movlps xmm4, qword ptr [edi+16]
|
|
movaps xmm0, xmm3
|
|
shufps xmm0, xmm0, 0xF0
|
|
mulps xmm0, xmm5
|
|
movaps xmm1, xmm3
|
|
shufps xmm1, xmm4, 0x05
|
|
mulps xmm1, xmm6
|
|
shufps xmm3, xmm4, 0x5A
|
|
mulps xmm3, xmm7
|
|
addps xmm1, xmm0
|
|
addps xmm1, xmm3
|
|
movlps qword ptr [eax+16], xmm1
|
|
movhps qword ptr [eax+40], xmm1
|
|
movss xmm0, dword ptr [edi+24]
|
|
shufps xmm0, xmm0, 0
|
|
mulps xmm0, xmm5
|
|
movss xmm2, dword ptr [edi+28]
|
|
shufps xmm2, xmm2, 0
|
|
mulps xmm2, xmm6
|
|
movss xmm4, dword ptr [edi+32]
|
|
shufps xmm4, xmm4, 0
|
|
mulps xmm4, xmm7
|
|
addps xmm0, xmm2
|
|
addps xmm0, xmm4
|
|
movlps qword ptr [eax+64], xmm0
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x3 * 3x6
|
|
// Setup for the Nx3 * 3x6 kernels: esi = m2Ptr, edi = m1Ptr, eax = dstPtr.
// Loads columns 0-3 of the three 6-float m2 rows in 8-byte halves:
//   xmm0 = m2[0..3], xmm1 = m2[6..9], xmm2 = m2[12..15]
#define MUL_Nx3_3x6_FIRST4COLUMNS_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movlps xmm0, [esi+ 0*4] \
|
|
__asm movhps xmm0, [esi+ 2*4] \
|
|
__asm movlps xmm1, [esi+ 6*4] \
|
|
__asm movhps xmm1, [esi+ 8*4] \
|
|
__asm movlps xmm2, [esi+12*4] \
|
|
__asm movhps xmm2, [esi+14*4]
|
|
|
|
// Columns 0-3 of one destination row of an Nx3 * 3x6 product.
// Broadcasts the three m1 floats at float index row*3 .. row*3+2, multiplies
// them by xmm0/xmm1/xmm2 (from MUL_Nx3_3x6_FIRST4COLUMNS_INIT), sums in xmm3
// and stores four floats at dst float index row*6 using two 8-byte stores.
// xmm3-xmm5 are scratch.
#define MUL_Nx3_3x6_FIRST4COLUMNS_ROW( row ) \
|
|
__asm movss xmm3, [edi+(row*3+0)*4] \
|
|
__asm shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm3, xmm0 \
|
|
__asm movss xmm4, [edi+(row*3+1)*4] \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm4, xmm1 \
|
|
__asm addps xmm3, xmm4 \
|
|
__asm movss xmm5, [edi+(row*3+2)*4] \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm5, xmm2 \
|
|
__asm addps xmm3, xmm5 \
|
|
__asm movlps [eax+(row*6+0)*4], xmm3 \
|
|
__asm movhps [eax+(row*6+2)*4], xmm3
|
|
|
|
// Columns 4-5 of all six destination rows of a 6x3 * 3x6 product.
// Reloads xmm0/xmm1/xmm2 with the column 4-5 pair of m2 rows 0/1/2
// (m2[4..5], m2[10..11], m2[16..17]), each duplicated into both halves of the
// register (shufps 0x44). This overwrites the values set by
// MUL_Nx3_3x6_FIRST4COLUMNS_INIT, so it must run after the FIRST4COLUMNS rows.
// The 18 m1 floats are consumed two rows at a time; each pass leaves one row's
// pair in the low half and the next row's pair in the high half, stored to dst
// float offsets 4/10, 16/22 and 28/34. xmm3-xmm6 are scratch.
#define MUL_Nx3_3x6_LAST2COLUMNS_ROW6 \
|
|
__asm movlps xmm0, [esi+ 4*4] \
|
|
__asm movlps xmm1, [esi+10*4] \
|
|
__asm movlps xmm2, [esi+16*4] \
|
|
__asm shufps xmm0, xmm0, 0x44 \
|
|
__asm shufps xmm1, xmm1, 0x44 \
|
|
__asm shufps xmm2, xmm2, 0x44 \
|
|
__asm movlps xmm3, [edi+0*4] \
|
|
__asm movhps xmm3, [edi+2*4] \
|
|
__asm movaps xmm4, xmm3 \
|
|
__asm movaps xmm5, xmm3 \
|
|
__asm shufps xmm3, xmm3, 0xF0 \
|
|
__asm mulps xmm3, xmm0 \
|
|
__asm movlps xmm6, [edi+4*4] \
|
|
__asm movhps xmm6, [edi+6*4] \
|
|
__asm shufps xmm4, xmm6, 0x05 \
|
|
__asm mulps xmm4, xmm1 \
|
|
__asm addps xmm3, xmm4 \
|
|
__asm shufps xmm5, xmm6, 0x5A \
|
|
__asm mulps xmm5, xmm2 \
|
|
__asm addps xmm3, xmm5 \
|
|
__asm movlps [eax+4*4], xmm3 \
|
|
__asm movhps [eax+10*4], xmm3 \
|
|
__asm movaps xmm5, xmm6 \
|
|
__asm movlps xmm3, [edi+8*4] \
|
|
__asm movhps xmm3, [edi+10*4] \
|
|
__asm movaps xmm4, xmm3 \
|
|
__asm shufps xmm5, xmm3, 0x5A \
|
|
__asm mulps xmm5, xmm0 \
|
|
__asm shufps xmm6, xmm3, 0xAF \
|
|
__asm mulps xmm6, xmm1 \
|
|
__asm addps xmm5, xmm6 \
|
|
__asm shufps xmm4, xmm4, 0xF0 \
|
|
__asm mulps xmm4, xmm2 \
|
|
__asm addps xmm4, xmm5 \
|
|
__asm movlps [eax+16*4], xmm4 \
|
|
__asm movhps [eax+22*4], xmm4 \
|
|
__asm movlps xmm6, [edi+12*4] \
|
|
__asm movhps xmm6, [edi+14*4] \
|
|
__asm movaps xmm5, xmm6 \
|
|
__asm movaps xmm4, xmm6 \
|
|
__asm shufps xmm6, xmm6, 0xF0 \
|
|
__asm mulps xmm6, xmm0 \
|
|
__asm movlps xmm3, [edi+16*4] \
|
|
__asm shufps xmm5, xmm3, 0x05 \
|
|
__asm mulps xmm5, xmm1 \
|
|
__asm addps xmm5, xmm6 \
|
|
__asm shufps xmm4, xmm3, 0x5A \
|
|
__asm mulps xmm4, xmm2 \
|
|
__asm addps xmm4, xmm5 \
|
|
__asm movlps [eax+28*4], xmm4 \
|
|
__asm movhps [eax+34*4], xmm4
|
|
|
|
MUL_Nx3_3x6_FIRST4COLUMNS_INIT
|
|
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 0 )
|
|
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 1 )
|
|
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 2 )
|
|
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 3 )
|
|
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 4 )
|
|
MUL_Nx3_3x6_FIRST4COLUMNS_ROW( 5 )
|
|
MUL_Nx3_3x6_LAST2COLUMNS_ROW6
|
|
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 3;
|
|
}
|
|
break;
|
|
}
|
|
case 4: {
|
|
if ( !(l^6) ) {
|
|
switch( k ) {
|
|
case 4: { // 4x4 * 4x6
|
|
|
|
// Setup for the Nx4 * 4x6 kernels: esi = m2Ptr, edi = m1Ptr, eax = dstPtr.
// Loads columns 0-3 of the four 6-float m2 rows in 8-byte halves:
//   xmm0 = m2[0..3], xmm1 = m2[6..9], xmm2 = m2[12..15], xmm3 = m2[18..21]
#define MUL_Nx4_4x6_FIRST4COLUMNS_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movlps xmm0, [esi+ 0*4] \
|
|
__asm movhps xmm0, [esi+ 2*4] \
|
|
__asm movlps xmm1, [esi+ 6*4] \
|
|
__asm movhps xmm1, [esi+ 8*4] \
|
|
__asm movlps xmm2, [esi+12*4] \
|
|
__asm movhps xmm2, [esi+14*4] \
|
|
__asm movlps xmm3, [esi+18*4] \
|
|
__asm movhps xmm3, [esi+20*4]
|
|
|
|
// Columns 0-3 of one destination row of an Nx4 * 4x6 product.
// Broadcasts each of the four m1 floats at edi+row*16, multiplies by
// xmm0..xmm3 (from MUL_Nx4_4x6_FIRST4COLUMNS_INIT), accumulates in xmm4 and
// stores four floats at eax+row*24 with two 8-byte stores.
// xmm4-xmm7 are scratch.
#define MUL_Nx4_4x6_FIRST4COLUMNS_ROW( row ) \
|
|
__asm movss xmm4, [edi+row*16+0*4] \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm4, xmm0 \
|
|
__asm movss xmm5, [edi+row*16+1*4] \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm5, xmm1 \
|
|
__asm addps xmm4, xmm5 \
|
|
__asm movss xmm6, [edi+row*16+2*4] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm2 \
|
|
__asm addps xmm4, xmm6 \
|
|
__asm movss xmm7, [edi+row*16+3*4] \
|
|
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm7, xmm3 \
|
|
__asm addps xmm4, xmm7 \
|
|
__asm movlps [eax+row*24+0], xmm4 \
|
|
__asm movhps [eax+row*24+8], xmm4
|
|
|
|
// Prepares xmm0-xmm3 for the column 4-5 pass of the Nx4 * 4x6 kernels
// (esi must already hold m2Ptr). With rN = the (column 4, column 5) pair of
// m2 row N, the registers end up as:
//   xmm0 = r0 r1,  xmm1 = r1 r0,  xmm2 = r2 r3,  xmm3 = r3 r2
// The second shufps of each pair reads the already-shuffled partner, whose low
// half still holds its original pair. Overwrites the FIRST4COLUMNS values.
#define MUL_Nx4_4x6_LAST2COLUMNS_INIT \
|
|
__asm movlps xmm0, [esi+ 4*4] \
|
|
__asm movlps xmm1, [esi+10*4] \
|
|
__asm movlps xmm2, [esi+16*4] \
|
|
__asm movlps xmm3, [esi+22*4] \
|
|
__asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
|
|
__asm shufps xmm1, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 ) \
|
|
__asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
|
|
__asm shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
|
|
// Columns 4-5 of two consecutive destination rows of an Nx4 * 4x6 product.
// Reads two m1 rows (eight floats) at edi+row*32. The low half of the
// accumulator xmm6 belongs to the first row and the high half to the second;
// the second row's terms are taken in swapped order to match the register
// pairing set up by MUL_Nx4_4x6_LAST2COLUMNS_INIT.
// Stores the pairs at eax+row*48+16 and eax+row*48+40. xmm4-xmm7 are scratch.
#define MUL_Nx4_4x6_LAST2COLUMNS_ROW2( row ) \
|
|
__asm movlps xmm7, [edi+row*32+ 0*4] \
|
|
__asm movhps xmm7, [edi+row*32+ 4*4] \
|
|
__asm movaps xmm6, xmm7 \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 3, 3 ) \
|
|
__asm mulps xmm6, xmm0 \
|
|
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 1, 1, 2, 2 ) \
|
|
__asm mulps xmm7, xmm1 \
|
|
__asm addps xmm6, xmm7 \
|
|
__asm movlps xmm4, [edi+row*32+ 2*4] \
|
|
__asm movhps xmm4, [edi+row*32+ 6*4] \
|
|
__asm movaps xmm5, xmm4 \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 3, 3 ) \
|
|
__asm mulps xmm5, xmm2 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 1, 1, 2, 2 ) \
|
|
__asm mulps xmm4, xmm3 \
|
|
__asm addps xmm6, xmm4 \
|
|
__asm movlps [eax+row*48+ 4*4], xmm6 \
|
|
__asm movhps [eax+row*48+10*4], xmm6
|
|
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_INIT
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
|
|
MUL_Nx4_4x6_LAST2COLUMNS_INIT
|
|
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
|
|
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
|
|
|
|
return;
|
|
}
|
|
case 6: { // 6x4 * 4x6
|
|
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_INIT
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 0 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 1 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 2 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 3 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 4 )
|
|
MUL_Nx4_4x6_FIRST4COLUMNS_ROW( 5 )
|
|
MUL_Nx4_4x6_LAST2COLUMNS_INIT
|
|
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 0 )
|
|
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 1 )
|
|
MUL_Nx4_4x6_LAST2COLUMNS_ROW2( 2 )
|
|
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
|
|
m1Ptr[3] * m2Ptr[3*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 4;
|
|
}
|
|
break;
|
|
}
|
|
case 5: {
|
|
if ( !(l^6) ) {
|
|
switch( k ) {
|
|
case 5: { // 5x5 * 5x6
|
|
|
|
// Setup for the Nx5 * 5x6 kernels: esi = m2Ptr, edi = m1Ptr, eax = dstPtr.
// Loads columns 0-3 of the five 6-float m2 rows in 8-byte halves:
//   xmm0 = m2[0..3], xmm1 = m2[6..9], xmm2 = m2[12..15],
//   xmm3 = m2[18..21], xmm4 = m2[24..27]
#define MUL_Nx5_5x6_FIRST4COLUMNS_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movlps xmm0, [esi+ 0*4] \
|
|
__asm movhps xmm0, [esi+ 2*4] \
|
|
__asm movlps xmm1, [esi+ 6*4] \
|
|
__asm movhps xmm1, [esi+ 8*4] \
|
|
__asm movlps xmm2, [esi+12*4] \
|
|
__asm movhps xmm2, [esi+14*4] \
|
|
__asm movlps xmm3, [esi+18*4] \
|
|
__asm movhps xmm3, [esi+20*4] \
|
|
__asm movlps xmm4, [esi+24*4] \
|
|
__asm movhps xmm4, [esi+26*4]
|
|
|
|
// Columns 0-3 of one destination row of an Nx5 * 5x6 product.
// Broadcasts each of the five m1 floats at edi+row*20, multiplies by
// xmm0..xmm4 (from MUL_Nx5_5x6_FIRST4COLUMNS_INIT), accumulates in xmm6 and
// stores four floats at eax+row*24 with two 8-byte stores.
// xmm5 and xmm6 are scratch.
#define MUL_Nx5_5x6_FIRST4COLUMNS_ROW( row ) \
|
|
__asm movss xmm6, [edi+row*20+0*4] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm0 \
|
|
__asm movss xmm5, [edi+row*20+1*4] \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm5, xmm1 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movss xmm5, [edi+row*20+2*4] \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm5, xmm2 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movss xmm5, [edi+row*20+3*4] \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm5, xmm3 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movss xmm5, [edi+row*20+4*4] \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm5, xmm4 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movlps [eax+row*24+0], xmm6 \
|
|
__asm movhps [eax+row*24+8], xmm6
|
|
|
|
// Prepares xmm0-xmm4 for the column 4-5 pass of the Nx5 * 5x6 kernels
// (esi must already hold m2Ptr). With rN = the (column 4, column 5) pair of
// m2 row N, the registers end up as a rotating chain:
//   xmm0 = r0 r1,  xmm1 = r1 r2,  xmm2 = r2 r3,  xmm3 = r3 r4,  xmm4 = r4 r0
// The last shufps reads the already-shuffled xmm0, whose low half is still r0.
// Overwrites the FIRST4COLUMNS values.
#define MUL_Nx5_5x6_LAST2COLUMNS_INIT \
|
|
__asm movlps xmm0, [esi+ 4*4] \
|
|
__asm movlps xmm1, [esi+10*4] \
|
|
__asm movlps xmm2, [esi+16*4] \
|
|
__asm movlps xmm3, [esi+22*4] \
|
|
__asm movlps xmm4, [esi+28*4] \
|
|
__asm shufps xmm0, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 ) \
|
|
__asm shufps xmm1, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 ) \
|
|
__asm shufps xmm2, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 ) \
|
|
__asm shufps xmm3, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 ) \
|
|
__asm shufps xmm4, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
|
|
// Columns 4-5 of two consecutive destination rows of an Nx5 * 5x6 product.
// Reads two m1 rows (ten floats) at edi+row*40. The low half of the
// accumulator xmm6 belongs to the first row and the high half to the second.
// Because each of xmm0..xmm4 pairs m2 row N with row N+1 (see
// MUL_Nx5_5x6_LAST2COLUMNS_INIT), the second row's elements are fetched one
// position later (offsets 6*4 and 8*4, then element 5 via the final movlps)
// so every product lines up with the right m2 row.
// Stores the pairs at eax+row*48+16 and eax+row*48+40. xmm5-xmm7 are scratch.
#define MUL_Nx5_5x6_LAST2COLUMNS_ROW2( row ) \
|
|
__asm movlps xmm7, [edi+row*40+ 0*4] \
|
|
__asm movhps xmm7, [edi+row*40+ 6*4] \
|
|
__asm movaps xmm6, xmm7 \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 2, 2 ) \
|
|
__asm mulps xmm6, xmm0 \
|
|
__asm movaps xmm5, xmm7 \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
|
|
__asm mulps xmm5, xmm1 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movlps xmm7, [edi+row*40+ 2*4] \
|
|
__asm movhps xmm7, [edi+row*40+ 8*4] \
|
|
__asm movaps xmm5, xmm7 \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 2, 2 ) \
|
|
__asm mulps xmm5, xmm2 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movaps xmm5, xmm7 \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 3, 3 ) \
|
|
__asm mulps xmm5, xmm3 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movlps xmm5, [edi+row*40+ 4*4] \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 ) \
|
|
__asm mulps xmm5, xmm4 \
|
|
__asm addps xmm6, xmm5 \
|
|
__asm movlps [eax+row*48+ 4*4], xmm6 \
|
|
__asm movhps [eax+row*48+10*4], xmm6
|
|
|
|
// Columns 4-5 of a single destination row of an Nx5 * 5x6 product (used for
// the odd last row that MUL_Nx5_5x6_LAST2COLUMNS_ROW2 cannot pair up).
// 'row' selects both the m1 row that is read (five floats = 20 bytes per row)
// and the destination row that is written (six floats = 24 bytes per row).
// The m1 offset used to be hard-coded to 20*4, which silently tied the macro
// to row 4; row*20 is identical for that case and correct for any other row.
// Expects esi/edi/eax and xmm0, xmm2, xmm4 as set up by
// MUL_Nx5_5x6_LAST2COLUMNS_INIT (xmm0 = r0 r1, xmm2 = r2 r3, xmm4 = r4 r0).
// The movss load leaves the upper lanes of xmm5 zero, so the "r0" half of
// xmm4 contributes nothing. The two halves of xmm6 are then added together
// and only the low pair is stored. xmm5-xmm7 are scratch.
#define MUL_Nx5_5x6_LAST2COLUMNS_ROW( row )									\
	__asm movlps		xmm6, [edi+row*20+0*4]								\
	__asm unpcklps		xmm6, xmm6											\
	__asm mulps			xmm6, xmm0											\
	__asm movlps		xmm5, [edi+row*20+2*4]								\
	__asm unpcklps		xmm5, xmm5											\
	__asm mulps			xmm5, xmm2											\
	__asm addps			xmm6, xmm5											\
	__asm movss			xmm5, [edi+row*20+4*4]								\
	__asm unpcklps		xmm5, xmm5											\
	__asm mulps			xmm5, xmm4											\
	__asm addps			xmm6, xmm5											\
	__asm movhlps		xmm7, xmm6											\
	__asm addps			xmm6, xmm7											\
	__asm movlps		[eax+row*24+4*4], xmm6
|
|
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_INIT
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
|
|
MUL_Nx5_5x6_LAST2COLUMNS_INIT
|
|
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
|
|
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
|
|
MUL_Nx5_5x6_LAST2COLUMNS_ROW( 4 )
|
|
|
|
return;
|
|
}
|
|
case 6: { // 6x5 * 5x6
|
|
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_INIT
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 0 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 1 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 2 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 3 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 4 )
|
|
MUL_Nx5_5x6_FIRST4COLUMNS_ROW( 5 )
|
|
MUL_Nx5_5x6_LAST2COLUMNS_INIT
|
|
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 0 )
|
|
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 1 )
|
|
MUL_Nx5_5x6_LAST2COLUMNS_ROW2( 2 )
|
|
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
|
|
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 5;
|
|
}
|
|
break;
|
|
}
|
|
case 6: {
|
|
switch( k ) {
|
|
case 1: {
|
|
if ( !(l^1) ) { // 1x6 * 6x1
|
|
dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
|
|
m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 2: {
|
|
if ( !(l^2) ) { // 2x6 * 6x2
|
|
|
|
// Setup for the Nx6 * 6x2 kernels: esi = m2Ptr, edi = m1Ptr, eax = dstPtr.
// Loads all twelve m2 floats with three 16-byte loads. Each register holds
// two consecutive 2-float rows:
//   xmm0 = rows 0-1, xmm1 = rows 2-3, xmm2 = rows 4-5
#define MUL_Nx6_6x2_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movaps xmm0, [esi] \
|
|
__asm movaps xmm1, [esi+16] \
|
|
__asm movaps xmm2, [esi+32]
|
|
|
|
// Computes two consecutive destination rows of an Nx6 * 6x2 product.
// Reads two m1 rows (twelve floats) at edi+row*48 with three 16-byte loads at
// +0, +16 and +32 (written as 24+2*4). The middle load straddles both rows.
// xmm7 accumulates the first row and xmm5 the second. Each holds two partial
// sums per output column, which the final movhlps/movlhps/addps folds into
// (row A col 0, row A col 1, row B col 0, row B col 1).
// Stores four floats at eax+row*16. Expects xmm0-xmm2 from MUL_Nx6_6x2_INIT;
// xmm4-xmm7 are scratch.
#define MUL_Nx6_6x2_ROW2( row ) \
|
|
__asm movaps xmm7, [edi+row*48+0*4] \
|
|
__asm movaps xmm6, xmm7 \
|
|
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 ) \
|
|
__asm mulps xmm7, xmm0 \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 2, 2, 3, 3 ) \
|
|
__asm mulps xmm6, xmm1 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movaps xmm6, [edi+row*48+4*4] \
|
|
__asm movaps xmm5, xmm6 \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
|
|
__asm mulps xmm6, xmm2 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm shufps xmm5, xmm5, R_SHUFFLEPS( 2, 2, 3, 3 ) \
|
|
__asm mulps xmm5, xmm0 \
|
|
__asm movaps xmm6, [edi+row*48+24+2*4] \
|
|
__asm movaps xmm4, xmm6 \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 ) \
|
|
__asm mulps xmm6, xmm1 \
|
|
__asm addps xmm5, xmm6 \
|
|
__asm shufps xmm4, xmm4, R_SHUFFLEPS( 2, 2, 3, 3 ) \
|
|
__asm mulps xmm4, xmm2 \
|
|
__asm addps xmm5, xmm4 \
|
|
__asm movaps xmm4, xmm5 \
|
|
__asm movhlps xmm5, xmm7 \
|
|
__asm movlhps xmm7, xmm4 \
|
|
__asm addps xmm7, xmm5 \
|
|
__asm movaps [eax+row*16], xmm7
|
|
|
|
MUL_Nx6_6x2_INIT
|
|
MUL_Nx6_6x2_ROW2( 0 )
|
|
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 3: {
|
|
if ( !(l^3) ) { // 3x6 * 6x3
|
|
|
|
// Setup for the Nx6 * 6x3 kernels: esi = m2Ptr, edi = m1Ptr, eax = dstPtr.
// Loads the six 3-float m2 rows into xmm0..xmm5. For row r, element 0 goes
// into lane 0 (movss) and elements 1-2 into lanes 2-3 (movhps); lane 1 is
// not used by MUL_Nx6_6x3_ROW, which stores only lane 0 and lanes 2-3.
#define MUL_Nx6_6x3_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movss xmm0, [esi+ 0*4] \
|
|
__asm movhps xmm0, [esi+ 1*4] \
|
|
__asm movss xmm1, [esi+ 3*4] \
|
|
__asm movhps xmm1, [esi+ 4*4] \
|
|
__asm movss xmm2, [esi+ 6*4] \
|
|
__asm movhps xmm2, [esi+ 7*4] \
|
|
__asm movss xmm3, [esi+ 9*4] \
|
|
__asm movhps xmm3, [esi+10*4] \
|
|
__asm movss xmm4, [esi+12*4] \
|
|
__asm movhps xmm4, [esi+13*4] \
|
|
__asm movss xmm5, [esi+15*4] \
|
|
__asm movhps xmm5, [esi+16*4]
|
|
|
|
// One destination row of an Nx6 * 6x3 product.
// Broadcasts each of the six m1 floats at edi+row*24, multiplies by
// xmm0..xmm5 (from MUL_Nx6_6x3_INIT) and accumulates in xmm7.
// Stores three floats at eax+row*12: lane 0 with movss, lanes 2-3 with movhps.
// xmm6 and xmm7 are scratch.
#define MUL_Nx6_6x3_ROW( row ) \
|
|
__asm movss xmm7, [edi+row*24+0] \
|
|
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm7, xmm0 \
|
|
__asm movss xmm6, [edi+row*24+4] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm1 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+8] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm2 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+12] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm3 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+16] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm4 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+20] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm5 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss [eax+row*12+0], xmm7 \
|
|
__asm movhps [eax+row*12+4], xmm7
|
|
|
|
MUL_Nx6_6x3_INIT
|
|
MUL_Nx6_6x3_ROW( 0 )
|
|
MUL_Nx6_6x3_ROW( 1 )
|
|
MUL_Nx6_6x3_ROW( 2 )
|
|
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 4: {
|
|
if ( !(l^4) ) { // 4x6 * 6x4
|
|
|
|
// Setup for the Nx6 * 6x4 kernels: esi = m2Ptr, edi = m1Ptr, eax = dstPtr.
// Loads the six 4-float m2 rows into xmm0..xmm5, one 16-byte load per row.
#define MUL_Nx6_6x4_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movaps xmm0, [esi] \
|
|
__asm movaps xmm1, [esi+16] \
|
|
__asm movaps xmm2, [esi+32] \
|
|
__asm movaps xmm3, [esi+48] \
|
|
__asm movaps xmm4, [esi+64] \
|
|
__asm movaps xmm5, [esi+80]
|
|
|
|
// One destination row of an Nx6 * 6x4 product.
// Broadcasts each of the six m1 floats at edi+row*24, multiplies by
// xmm0..xmm5 (from MUL_Nx6_6x4_INIT), accumulates in xmm7 and stores the four
// results with a single 16-byte store at eax+row*16.
// xmm6 and xmm7 are scratch.
#define MUL_Nx6_6x4_ROW( row ) \
|
|
__asm movss xmm7, [edi+row*24+0] \
|
|
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm7, xmm0 \
|
|
__asm movss xmm6, [edi+row*24+4] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm1 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+8] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm2 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+12] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm3 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+16] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm4 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movss xmm6, [edi+row*24+20] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm5 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm movaps [eax+row*16], xmm7
|
|
|
|
MUL_Nx6_6x4_INIT
|
|
MUL_Nx6_6x4_ROW( 0 )
|
|
MUL_Nx6_6x4_ROW( 1 )
|
|
MUL_Nx6_6x4_ROW( 2 )
|
|
MUL_Nx6_6x4_ROW( 3 )
|
|
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 5: {
|
|
if ( !(l^5) ) { // 5x6 * 6x5
|
|
|
|
// Setup for the Nx6 * 6x5 kernels: esi = m2Ptr, edi = m1Ptr, eax = dstPtr.
// Loads columns 0-3 of the six 5-float m2 rows (row stride 20 bytes) into
// xmm0..xmm5. Rows 0 and 4 start at byte offsets 0 and 80 and use a 16-byte
// movaps; the other rows are loaded in two 8-byte halves.
// Column 4 is not loaded here; MUL_Nx6_6x5_ROW reads it straight from memory.
#define MUL_Nx6_6x5_INIT \
|
|
__asm mov esi, m2Ptr \
|
|
__asm mov edi, m1Ptr \
|
|
__asm mov eax, dstPtr \
|
|
__asm movaps xmm0, [esi] \
|
|
__asm movlps xmm1, [esi+20] \
|
|
__asm movhps xmm1, [esi+28] \
|
|
__asm movlps xmm2, [esi+40] \
|
|
__asm movhps xmm2, [esi+48] \
|
|
__asm movlps xmm3, [esi+60] \
|
|
__asm movhps xmm3, [esi+68] \
|
|
__asm movaps xmm4, [esi+80] \
|
|
__asm movlps xmm5, [esi+100] \
|
|
__asm movhps xmm5, [esi+108]
|
|
|
|
// One destination row of an Nx6 * 6x5 product.
// Columns 0-3 are computed with SSE: each of the six m1 floats at edi+row*24
// is broadcast, multiplied by xmm0..xmm5 (from MUL_Nx6_6x5_INIT) and
// accumulated in xmm7 (xmm6 is scratch).
// Column 4 is computed on the x87 stack, interleaved with the SSE work: the
// sum over i of m1[row*6+i] * m2[4+i*5], written with fstp to dst[row*5+4].
// The SSE result is stored with two 8-byte stores at eax+row*20 and +8.
// The x87 stack is balanced: one value is left by the first fld/fmul, each
// later fld is consumed by faddp, and the final fstp pops the sum.
#define MUL_Nx6_6x5_ROW( row ) \
|
|
__asm movss xmm7, [edi+row*24+0] \
|
|
__asm shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm7, xmm0 \
|
|
__asm fld dword ptr [edi+(row*6+0)*4] \
|
|
__asm fmul dword ptr [esi+(4+0*5)*4] \
|
|
__asm movss xmm6, [edi+row*24+4] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm1 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm fld dword ptr [edi+(row*6+1)*4] \
|
|
__asm fmul dword ptr [esi+(4+1*5)*4] \
|
|
__asm faddp st(1),st \
|
|
__asm movss xmm6, [edi+row*24+8] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm2 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm fld dword ptr [edi+(row*6+2)*4] \
|
|
__asm fmul dword ptr [esi+(4+2*5)*4] \
|
|
__asm faddp st(1),st \
|
|
__asm movss xmm6, [edi+row*24+12] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm3 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm fld dword ptr [edi+(row*6+3)*4] \
|
|
__asm fmul dword ptr [esi+(4+3*5)*4] \
|
|
__asm faddp st(1),st \
|
|
__asm movss xmm6, [edi+row*24+16] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm4 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm fld dword ptr [edi+(row*6+4)*4] \
|
|
__asm fmul dword ptr [esi+(4+4*5)*4] \
|
|
__asm faddp st(1),st \
|
|
__asm movss xmm6, [edi+row*24+20] \
|
|
__asm shufps xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 ) \
|
|
__asm mulps xmm6, xmm5 \
|
|
__asm addps xmm7, xmm6 \
|
|
__asm fld dword ptr [edi+(row*6+5)*4] \
|
|
__asm fmul dword ptr [esi+(4+5*5)*4] \
|
|
__asm faddp st(1),st \
|
|
__asm fstp dword ptr [eax+(row*5+4)*4] \
|
|
__asm movlps [eax+row*20], xmm7 \
|
|
__asm movhps [eax+row*20+8], xmm7
|
|
|
|
MUL_Nx6_6x5_INIT
|
|
MUL_Nx6_6x5_ROW( 0 )
|
|
MUL_Nx6_6x5_ROW( 1 )
|
|
MUL_Nx6_6x5_ROW( 2 )
|
|
MUL_Nx6_6x5_ROW( 3 )
|
|
MUL_Nx6_6x5_ROW( 4 )
|
|
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 6: {
|
|
switch( l ) {
|
|
case 1: { // 6x6 * 6x1
|
|
__asm {
|
|
mov esi, m2Ptr
|
|
mov edi, m1Ptr
|
|
mov eax, dstPtr
|
|
movlps xmm7, qword ptr [esi]
|
|
movlps xmm6, qword ptr [esi+8]
|
|
shufps xmm7, xmm7, 0x44
|
|
shufps xmm6, xmm6, 0x44
|
|
movlps xmm0, qword ptr [edi ]
|
|
movhps xmm0, qword ptr [edi+ 24]
|
|
mulps xmm0, xmm7
|
|
movlps xmm3, qword ptr [edi+ 8]
|
|
movhps xmm3, qword ptr [edi+ 32]
|
|
mulps xmm3, xmm6
|
|
movlps xmm1, qword ptr [edi+ 48]
|
|
movhps xmm1, qword ptr [edi+ 72]
|
|
mulps xmm1, xmm7
|
|
movlps xmm2, qword ptr [edi+ 96]
|
|
movhps xmm2, qword ptr [edi+120]
|
|
mulps xmm2, xmm7
|
|
movlps xmm4, qword ptr [edi+ 56]
|
|
movhps xmm4, qword ptr [edi+ 80]
|
|
movlps xmm5, qword ptr [edi+104]
|
|
movhps xmm5, qword ptr [edi+128]
|
|
mulps xmm4, xmm6
|
|
movlps xmm7, qword ptr [esi+16]
|
|
addps xmm0, xmm3
|
|
shufps xmm7, xmm7, 0x44
|
|
mulps xmm5, xmm6
|
|
addps xmm1, xmm4
|
|
movlps xmm3, qword ptr [edi+ 16]
|
|
movhps xmm3, qword ptr [edi+ 40]
|
|
addps xmm2, xmm5
|
|
movlps xmm4, qword ptr [edi+ 64]
|
|
movhps xmm4, qword ptr [edi+ 88]
|
|
mulps xmm3, xmm7
|
|
movlps xmm5, qword ptr [edi+112]
|
|
movhps xmm5, qword ptr [edi+136]
|
|
addps xmm0, xmm3
|
|
mulps xmm4, xmm7
|
|
mulps xmm5, xmm7
|
|
addps xmm1, xmm4
|
|
addps xmm2, xmm5
|
|
movaps xmm6, xmm0
|
|
shufps xmm0, xmm1, 0x88
|
|
shufps xmm6, xmm1, 0xDD
|
|
movaps xmm7, xmm2
|
|
shufps xmm7, xmm2, 0x88
|
|
shufps xmm2, xmm2, 0xDD
|
|
addps xmm0, xmm6
|
|
addps xmm2, xmm7
|
|
movlps [eax], xmm0
|
|
movhps [eax+8], xmm0
|
|
movlps [eax+16], xmm2
|
|
}
|
|
return;
|
|
}
|
|
case 2: { // 6x6 * 6x2
|
|
|
|
MUL_Nx6_6x2_INIT
|
|
MUL_Nx6_6x2_ROW2( 0 )
|
|
MUL_Nx6_6x2_ROW2( 1 )
|
|
MUL_Nx6_6x2_ROW2( 2 )
|
|
|
|
return;
|
|
}
|
|
case 3: { // 6x6 * 6x3
|
|
|
|
MUL_Nx6_6x3_INIT
|
|
MUL_Nx6_6x3_ROW( 0 )
|
|
MUL_Nx6_6x3_ROW( 1 )
|
|
MUL_Nx6_6x3_ROW( 2 )
|
|
MUL_Nx6_6x3_ROW( 3 )
|
|
MUL_Nx6_6x3_ROW( 4 )
|
|
MUL_Nx6_6x3_ROW( 5 )
|
|
|
|
return;
|
|
}
|
|
case 4: { // 6x6 * 6x4
|
|
|
|
MUL_Nx6_6x4_INIT
|
|
MUL_Nx6_6x4_ROW( 0 )
|
|
MUL_Nx6_6x4_ROW( 1 )
|
|
MUL_Nx6_6x4_ROW( 2 )
|
|
MUL_Nx6_6x4_ROW( 3 )
|
|
MUL_Nx6_6x4_ROW( 4 )
|
|
MUL_Nx6_6x4_ROW( 5 )
|
|
|
|
return;
|
|
}
|
|
case 5: { // 6x6 * 6x5
|
|
|
|
MUL_Nx6_6x5_INIT
|
|
MUL_Nx6_6x5_ROW( 0 )
|
|
MUL_Nx6_6x5_ROW( 1 )
|
|
MUL_Nx6_6x5_ROW( 2 )
|
|
MUL_Nx6_6x5_ROW( 3 )
|
|
MUL_Nx6_6x5_ROW( 4 )
|
|
MUL_Nx6_6x5_ROW( 5 )
|
|
|
|
return;
|
|
}
|
|
case 6: { // 6x6 * 6x6
|
|
__asm {
|
|
mov ecx, dword ptr m2Ptr
|
|
movlps xmm3, qword ptr [ecx+72]
|
|
mov edx, dword ptr m1Ptr
|
|
// Loading first 4 columns (upper 4 rows) of m2Ptr.
|
|
movaps xmm0, xmmword ptr [ecx]
|
|
movlps xmm1, qword ptr [ecx+24]
|
|
movhps xmm1, qword ptr [ecx+32]
|
|
movaps xmm2, xmmword ptr [ecx+48]
|
|
movhps xmm3, qword ptr [ecx+80]
|
|
// Calculating first 4 elements in the first row of the destination matrix.
|
|
movss xmm4, dword ptr [edx]
|
|
movss xmm5, dword ptr [edx+4]
|
|
mov eax, dword ptr dstPtr
|
|
shufps xmm4, xmm4, 0
|
|
movss xmm6, dword ptr [edx+8]
|
|
shufps xmm5, xmm5, 0
|
|
movss xmm7, dword ptr [edx+12]
|
|
mulps xmm4, xmm0
|
|
shufps xmm6, xmm6, 0
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm5, xmm1
|
|
mulps xmm6, xmm2
|
|
addps xmm5, xmm4
|
|
mulps xmm7, xmm3
|
|
addps xmm6, xmm5
|
|
addps xmm7, xmm6
|
|
movaps xmmword ptr [eax], xmm7
|
|
// Calculating first 4 elements in the second row of the destination matrix.
|
|
movss xmm4, dword ptr [edx+24]
|
|
shufps xmm4, xmm4, 0
|
|
mulps xmm4, xmm0
|
|
movss xmm5, dword ptr [edx+28]
|
|
shufps xmm5, xmm5, 0
|
|
mulps xmm5, xmm1
|
|
movss xmm6, dword ptr [edx+32]
|
|
shufps xmm6, xmm6, 0
|
|
movss xmm7, dword ptr [edx+36]
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm6, xmm2
|
|
mulps xmm7, xmm3
|
|
addps xmm7, xmm6
|
|
addps xmm5, xmm4
|
|
addps xmm7, xmm5
|
|
// Calculating first 4 elements in the third row of the destination matrix.
|
|
movss xmm4, dword ptr [edx+48]
|
|
movss xmm5, dword ptr [edx+52]
|
|
movlps qword ptr [eax+24], xmm7 ; save 2nd
|
|
movhps qword ptr [eax+32], xmm7 ; row
|
|
movss xmm6, dword ptr [edx+56]
|
|
movss xmm7, dword ptr [edx+60]
|
|
shufps xmm4, xmm4, 0
|
|
shufps xmm5, xmm5, 0
|
|
shufps xmm6, xmm6, 0
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm6, xmm2
|
|
mulps xmm7, xmm3
|
|
addps xmm5, xmm4
|
|
addps xmm7, xmm6
|
|
addps xmm7, xmm5
|
|
movaps xmmword ptr [eax+48], xmm7
|
|
// Calculating first 4 elements in the fourth row of the destination matrix.
|
|
movss xmm4, dword ptr [edx+72]
|
|
movss xmm5, dword ptr [edx+76]
|
|
movss xmm6, dword ptr [edx+80]
|
|
movss xmm7, dword ptr [edx+84]
|
|
shufps xmm4, xmm4, 0
|
|
shufps xmm5, xmm5, 0
|
|
shufps xmm6, xmm6, 0
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm6, xmm2
|
|
mulps xmm7, xmm3
|
|
addps xmm4, xmm5
|
|
addps xmm6, xmm4
|
|
addps xmm7, xmm6
|
|
movlps qword ptr [eax+72], xmm7
|
|
movhps qword ptr [eax+80], xmm7
|
|
// Calculating first 4 elements in the fifth row of the destination matrix.
|
|
movss xmm4, dword ptr [edx+96]
|
|
movss xmm5, dword ptr [edx+100]
|
|
movss xmm6, dword ptr [edx+104]
|
|
movss xmm7, dword ptr [edx+108]
|
|
shufps xmm4, xmm4, 0
|
|
shufps xmm5, xmm5, 0
|
|
shufps xmm6, xmm6, 0
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm6, xmm2
|
|
mulps xmm7, xmm3
|
|
addps xmm5, xmm4
|
|
addps xmm7, xmm6
|
|
addps xmm7, xmm5
|
|
movaps xmmword ptr [eax+96], xmm7
|
|
// Calculating first 4 elements in the sixth row of the destination matrix.
|
|
movss xmm4, dword ptr [edx+120]
|
|
movss xmm5, dword ptr [edx+124]
|
|
movss xmm6, dword ptr [edx+128]
|
|
movss xmm7, dword ptr [edx+132]
|
|
shufps xmm4, xmm4, 0
|
|
shufps xmm5, xmm5, 0
|
|
shufps xmm6, xmm6, 0
|
|
shufps xmm7, xmm7, 0
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm6, xmm2
|
|
mulps xmm7, xmm3
|
|
addps xmm4, xmm5
|
|
addps xmm6, xmm4
|
|
addps xmm7, xmm6
|
|
movhps qword ptr [eax+128], xmm7
|
|
movlps qword ptr [eax+120], xmm7
|
|
// Loading first 4 columns (lower 2 rows) of m2Ptr.
|
|
movlps xmm0, qword ptr [ecx+96]
|
|
movhps xmm0, qword ptr [ecx+104]
|
|
movlps xmm1, qword ptr [ecx+120]
|
|
movhps xmm1, qword ptr [ecx+128]
|
|
// Calculating first 4 elements in the first row of the destination matrix.
|
|
movss xmm2, dword ptr [edx+16]
|
|
shufps xmm2, xmm2, 0
|
|
movss xmm4, dword ptr [edx+40]
|
|
movss xmm3, dword ptr [edx+20]
|
|
movss xmm5, dword ptr [edx+44]
|
|
movaps xmm6, xmmword ptr [eax]
|
|
movlps xmm7, qword ptr [eax+24]
|
|
shufps xmm3, xmm3, 0
|
|
shufps xmm5, xmm5, 0
|
|
movhps xmm7, qword ptr [eax+32]
|
|
shufps xmm4, xmm4, 0
|
|
mulps xmm5, xmm1
|
|
mulps xmm2, xmm0
|
|
mulps xmm3, xmm1
|
|
mulps xmm4, xmm0
|
|
addps xmm6, xmm2
|
|
addps xmm7, xmm4
|
|
addps xmm7, xmm5
|
|
addps xmm6, xmm3
|
|
movlps qword ptr [eax+24], xmm7
|
|
movaps xmmword ptr [eax], xmm6
|
|
movhps qword ptr [eax+32], xmm7
|
|
// Calculating first 4 elements in the third row of the destination matrix.
|
|
movss xmm2, dword ptr [edx+64]
|
|
movss xmm4, dword ptr [edx+88]
|
|
movss xmm5, dword ptr [edx+92]
|
|
movss xmm3, dword ptr [edx+68]
|
|
movaps xmm6, xmmword ptr [eax+48]
|
|
movlps xmm7, qword ptr [eax+72]
|
|
movhps xmm7, qword ptr [eax+80]
|
|
shufps xmm2, xmm2, 0
|
|
shufps xmm4, xmm4, 0
|
|
shufps xmm5, xmm5, 0
|
|
shufps xmm3, xmm3, 0
|
|
mulps xmm2, xmm0
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
mulps xmm3, xmm1
|
|
addps xmm6, xmm2
|
|
addps xmm6, xmm3
|
|
addps xmm7, xmm4
|
|
addps xmm7, xmm5
|
|
movlps qword ptr [eax+72], xmm7
|
|
movaps xmmword ptr [eax+48], xmm6
|
|
movhps qword ptr [eax+80], xmm7
|
|
// Calculating first 4 elements in the fifth row of the destination matrix.
|
|
movss xmm2, dword ptr [edx+112]
|
|
movss xmm3, dword ptr [edx+116]
|
|
movaps xmm6, xmmword ptr [eax+96]
|
|
shufps xmm2, xmm2, 0
|
|
shufps xmm3, xmm3, 0
|
|
mulps xmm2, xmm0
|
|
mulps xmm3, xmm1
|
|
addps xmm6, xmm2
|
|
addps xmm6, xmm3
|
|
movaps xmmword ptr [eax+96], xmm6
|
|
// Calculating first 4 elements in the sixth row of the destination matrix.
|
|
movss xmm4, dword ptr [edx+136]
|
|
movss xmm5, dword ptr [edx+140]
|
|
movhps xmm7, qword ptr [eax+128]
|
|
movlps xmm7, qword ptr [eax+120]
|
|
shufps xmm4, xmm4, 0
|
|
shufps xmm5, xmm5, 0
|
|
mulps xmm4, xmm0
|
|
mulps xmm5, xmm1
|
|
addps xmm7, xmm4
|
|
addps xmm7, xmm5
|
|
// Calculating last 2 columns of the destination matrix.
|
|
movlps xmm0, qword ptr [ecx+16]
|
|
movhps xmm0, qword ptr [ecx+40]
|
|
movhps qword ptr [eax+128], xmm7
|
|
movlps qword ptr [eax+120], xmm7
|
|
movlps xmm2, qword ptr [ecx+64]
|
|
movhps xmm2, qword ptr [ecx+88]
|
|
movaps xmm3, xmm2
|
|
shufps xmm3, xmm3, 4Eh
|
|
movlps xmm4, qword ptr [ecx+112]
|
|
movhps xmm4, qword ptr [ecx+136]
|
|
movaps xmm5, xmm4
|
|
shufps xmm5, xmm5, 4Eh
|
|
movlps xmm6, qword ptr [edx]
|
|
movhps xmm6, qword ptr [edx+24]
|
|
movaps xmm7, xmm6
|
|
shufps xmm7, xmm7, 0F0h
|
|
mulps xmm7, xmm0
|
|
shufps xmm6, xmm6, 0A5h
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, 4Eh
|
|
mulps xmm1, xmm6
|
|
addps xmm7, xmm1
|
|
movlps xmm6, qword ptr [edx+8]
|
|
movhps xmm6, qword ptr [edx+32]
|
|
movaps xmm1, xmm6
|
|
shufps xmm1, xmm1, 0F0h
|
|
shufps xmm6, xmm6, 0A5h
|
|
mulps xmm1, xmm2
|
|
mulps xmm6, xmm3
|
|
addps xmm7, xmm1
|
|
addps xmm7, xmm6
|
|
movhps xmm6, qword ptr [edx+40]
|
|
movlps xmm6, qword ptr [edx+16]
|
|
movaps xmm1, xmm6
|
|
shufps xmm1, xmm1, 0F0h
|
|
shufps xmm6, xmm6, 0A5h
|
|
mulps xmm1, xmm4
|
|
mulps xmm6, xmm5
|
|
addps xmm7, xmm1
|
|
addps xmm7, xmm6
|
|
movlps qword ptr [eax+16], xmm7
|
|
movhps qword ptr [eax+40], xmm7
|
|
movlps xmm6, qword ptr [edx+48]
|
|
movhps xmm6, qword ptr [edx+72]
|
|
movaps xmm7, xmm6
|
|
shufps xmm7, xmm7, 0F0h
|
|
mulps xmm7, xmm0
|
|
shufps xmm6, xmm6, 0A5h
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, 4Eh
|
|
mulps xmm1, xmm6
|
|
addps xmm7, xmm1
|
|
movhps xmm6, qword ptr [edx+80]
|
|
movlps xmm6, qword ptr [edx+56]
|
|
movaps xmm1, xmm6
|
|
shufps xmm1, xmm1, 0F0h
|
|
shufps xmm6, xmm6, 0A5h
|
|
mulps xmm1, xmm2
|
|
mulps xmm6, xmm3
|
|
addps xmm7, xmm1
|
|
addps xmm7, xmm6
|
|
movlps xmm6, qword ptr [edx+64]
|
|
movhps xmm6, qword ptr [edx+88]
|
|
movaps xmm1, xmm6
|
|
shufps xmm1, xmm1, 0F0h
|
|
shufps xmm6, xmm6, 0A5h
|
|
mulps xmm1, xmm4
|
|
mulps xmm6, xmm5
|
|
addps xmm7, xmm1
|
|
addps xmm7, xmm6
|
|
movlps qword ptr [eax+64], xmm7
|
|
movhps qword ptr [eax+88], xmm7
|
|
movlps xmm6, qword ptr [edx+96]
|
|
movhps xmm6, qword ptr [edx+120]
|
|
movaps xmm7, xmm6
|
|
shufps xmm7, xmm7, 0F0h
|
|
mulps xmm7, xmm0
|
|
shufps xmm6, xmm6, 0A5h
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, 4Eh
|
|
mulps xmm1, xmm6
|
|
addps xmm7, xmm1
|
|
movlps xmm6, qword ptr [edx+104]
|
|
movhps xmm6, qword ptr [edx+128]
|
|
movaps xmm1, xmm6
|
|
shufps xmm1, xmm1, 0F0h
|
|
shufps xmm6, xmm6, 0A5h
|
|
mulps xmm1, xmm2
|
|
mulps xmm6, xmm3
|
|
addps xmm7, xmm1
|
|
addps xmm7, xmm6
|
|
movlps xmm6, qword ptr [edx+112]
|
|
movhps xmm6, qword ptr [edx+136]
|
|
movaps xmm1, xmm6
|
|
shufps xmm1, xmm1, 0F0h
|
|
shufps xmm6, xmm6, 0A5h
|
|
mulps xmm1, xmm4
|
|
mulps xmm6, xmm5
|
|
addps xmm7, xmm1
|
|
addps xmm7, xmm6
|
|
movlps qword ptr [eax+112], xmm7
|
|
movhps qword ptr [eax+136], xmm7
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
|
|
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
for ( i = 0; i < k; i++ ) {
|
|
for ( j = 0; j < l; j++ ) {
|
|
m2Ptr = m2.ToFloatPtr() + j;
|
|
sum = m1Ptr[0] * m2Ptr[0];
|
|
for ( n = 1; n < m1.GetNumColumns(); n++ ) {
|
|
m2Ptr += l;
|
|
sum += m1Ptr[n] * m2Ptr[0];
|
|
}
|
|
*dstPtr++ = sum;
|
|
}
|
|
m1Ptr += m1.GetNumColumns();
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
============
idSIMD_SSE::MatX_TransposeMultiplyMatX

	optimizes the following transpose matrix multiplications:

	Nx6 * NxN
	6xN * 6x6

	with N in the range [1-6].
============
*/
|
|
void VPCALL idSIMD_SSE::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
	// Computes dst = transpose( m1 ) * m2 and writes the result sequentially
	// through dst.ToFloatPtr(): for every column i of m1 and every column j of m2
	// the products m1[n][i] * m2[n][j] are summed over all rows n.
	//
	// m1 and m2 must have the same number of rows (asserted below).
	//
	// The switch dispatches on the shared row count. For a handful of shapes
	// (noted as "rows x columns" next to each test) a hand written SSE path is
	// used; every other shape falls back to a plain C loop.
	//
	// NOTE(review): the SSE paths use movaps / mulps with memory operands on the
	// matrix data, so the buffers are assumed to be 16 byte aligned - confirm that
	// idMatX guarantees this. Some paths also load a full 16 bytes where only part
	// of the result is stored (e.g. [edi+16] in the 1x6 case), so the storage is
	// presumably padded - verify against idMatX.
	//
	// NOTE(review): R_SHUFFLEPS is defined elsewhere in this file; the lane
	// descriptions in the comments below assume its four arguments name the source
	// lane for destination lanes 0, 1, 2 and 3 in that order.
	//
	// The MUL_* helper macros are defined inside the function body and are not
	// #undef'ed afterwards. Their arguments are pasted unparenthesized into address
	// expressions, so they must only be invoked with plain integer literals.
	int i, j, k, l, n;
	float *dstPtr;
	const float *m1Ptr, *m2Ptr;
	double sum;

	assert( m1.GetNumRows() == m2.GetNumRows() );

	m1Ptr = m1.ToFloatPtr();
	m2Ptr = m2.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	k = m1.GetNumColumns();		// number of rows of dst
	l = m2.GetNumColumns();		// number of columns of dst

	switch( m1.GetNumRows() ) {
		case 1:
			if ( !((k^6)|(l^1)) ) {			// 1x6 * 1x1
				__asm {
					mov		esi, m2Ptr
					mov		edi, m1Ptr
					mov		eax, dstPtr
					movss	xmm0, [esi]									// the single element of m2 ...
					shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )		// ... copied to all four lanes
					movaps	xmm1, xmm0
					mulps	xmm0, [edi]									// m1[0..3] * m2[0]
					mulps	xmm1, [edi+16]								// m1[4..7] * m2[0], only lanes 0 and 1 are stored
					movaps	[eax], xmm0
					movlps	[eax+16], xmm1
				}
				return;
			}
			// generic 1 row fallback
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 2:
			if ( !((k^6)|(l^2)) ) {			// 2x6 * 2x2
				// loads both rows of m2; each row is duplicated into the low and high half of its register
				#define MUL_2xN_2x2_INIT								\
					__asm mov		esi, m2Ptr							\
					__asm mov		edi, m1Ptr							\
					__asm mov		eax, dstPtr							\
					__asm movlps	xmm0, [esi]							\
					__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
					__asm movlps	xmm1, [esi+8]						\
					__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )

				// computes two consecutive rows of dst (4 floats) starting at dst row 'row';
				// N is the number of columns of m1
				#define MUL_2xN_2x2_ROW2( N, row )						\
					__asm movlps	xmm6, [edi+(row+0*N)*4]				\
					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
					__asm movlps	xmm7, [edi+(row+1*N)*4]				\
					__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
					__asm mulps		xmm6, xmm0							\
					__asm mulps		xmm7, xmm1							\
					__asm addps		xmm6, xmm7							\
					__asm movaps	[eax+(row*2)*4], xmm6

				MUL_2xN_2x2_INIT
				MUL_2xN_2x2_ROW2( 6, 0 )
				MUL_2xN_2x2_ROW2( 6, 2 )
				MUL_2xN_2x2_ROW2( 6, 4 )

				return;
			}
			// generic 2 row fallback
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 3:
			if ( !((k^6)|(l^3)) ) {			// 3x6 * 3x3

				// loads the three rows of m2 into xmm0-xmm2: element 0 of a row goes to
				// lane 0 (movss), elements 1 and 2 go to lanes 2 and 3 (movhps)
				#define MUL_3xN_3x3_INIT								\
					__asm mov		esi, m2Ptr							\
					__asm mov		edi, m1Ptr							\
					__asm mov		eax, dstPtr							\
					__asm movss		xmm0, [esi+(0*3+0)*4]				\
					__asm movhps	xmm0, [esi+(0*3+1)*4]				\
					__asm movss		xmm1, [esi+(1*3+0)*4]				\
					__asm movhps	xmm1, [esi+(1*3+1)*4]				\
					__asm movss		xmm2, [esi+(2*3+0)*4]				\
					__asm movhps	xmm2, [esi+(2*3+1)*4]

				// rearranges the m2 row registers as expected at the start of MUL_3xN_3x3_ROW4
				#define MUL_3xN_3x3_INIT_ROW4							\
					__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 2, 3, 0 )	\
					__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 0 )	\
					__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 0, 2, 3, 0 )

				// computes four consecutive rows of dst (12 floats) starting at dst row 'row'
				// using three 16 byte stores; the m2 row registers are re-shuffled between
				// the stores and are left in that re-shuffled state
				#define MUL_3xN_3x3_ROW4( N, row )						\
					__asm movlps	xmm3, [edi+(row+0*N+0)*4]			\
					__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 1 )	\
					__asm movlps	xmm4, [edi+(row+1*N+0)*4]			\
					__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 1 )	\
					__asm movlps	xmm5, [edi+(row+2*N+0)*4]			\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 1 )	\
					__asm mulps		xmm3, xmm0							\
					__asm mulps		xmm4, xmm1							\
					__asm mulps		xmm5, xmm2							\
					__asm addps		xmm3, xmm4							\
					__asm addps		xmm3, xmm5							\
					__asm movaps	[eax+(row*3+0)*4], xmm3				\
					__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 )	\
					__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 )	\
					__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 )	\
					__asm movlps	xmm3, [edi+(row+0*N+1)*4]			\
					__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 1, 1 )	\
					__asm movlps	xmm4, [edi+(row+1*N+1)*4]			\
					__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 1, 1 )	\
					__asm movlps	xmm5, [edi+(row+2*N+1)*4]			\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 1, 1 )	\
					__asm mulps		xmm3, xmm0							\
					__asm mulps		xmm4, xmm1							\
					__asm mulps		xmm5, xmm2							\
					__asm addps		xmm3, xmm4							\
					__asm addps		xmm3, xmm5							\
					__asm movaps	[eax+(row*3+4)*4], xmm3				\
					__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 1 )	\
					__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 1 )	\
					__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 1 )	\
					__asm movlps	xmm3, [edi+(row+0*N+2)*4]			\
					__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 1, 1, 1 )	\
					__asm movlps	xmm4, [edi+(row+1*N+2)*4]			\
					__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 1, 1, 1 )	\
					__asm movlps	xmm5, [edi+(row+2*N+2)*4]			\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 1, 1, 1 )	\
					__asm mulps		xmm3, xmm0							\
					__asm mulps		xmm4, xmm1							\
					__asm mulps		xmm5, xmm2							\
					__asm addps		xmm3, xmm4							\
					__asm addps		xmm3, xmm5							\
					__asm movaps	[eax+(row*3+8)*4], xmm3

				// re-shuffle between two MUL_3xN_3x3_ROW4 invocations (not used by the 3x6 case below)
				#define MUL_3xN_3x3_INIT_ROW4_ROW4						\
					__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )	\
					__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )	\
					__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

				// re-shuffle between MUL_3xN_3x3_ROW4 and MUL_3xN_3x3_ROW
				#define MUL_3xN_3x3_INIT_ROW4_ROW						\
					__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 1, 1, 2, 3 )	\
					__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 1, 1, 2, 3 )	\
					__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 1, 1, 2, 3 )

				// computes a single row of dst (3 floats): lane 0 is stored with movss,
				// lanes 2 and 3 are stored behind it with movhps
				#define MUL_3xN_3x3_ROW( N, row )						\
					__asm movss		xmm3, [edi+(row+0*N)*4]				\
					__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm movss		xmm4, [edi+(row+1*N)*4]				\
					__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm movss		xmm5, [edi+(row+2*N)*4]				\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm3, xmm0							\
					__asm mulps		xmm4, xmm1							\
					__asm mulps		xmm5, xmm2							\
					__asm addps		xmm3, xmm4							\
					__asm addps		xmm3, xmm5							\
					__asm movss		[eax+(row*3+0)*4], xmm3				\
					__asm movhps	[eax+(row*3+1)*4], xmm3

				MUL_3xN_3x3_INIT
				MUL_3xN_3x3_INIT_ROW4
				MUL_3xN_3x3_ROW4( 6, 0 )
				MUL_3xN_3x3_INIT_ROW4_ROW
				MUL_3xN_3x3_ROW( 6, 4 )
				MUL_3xN_3x3_ROW( 6, 5 )

				return;
			}
			// generic 3 row fallback
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 4:
			if ( !((k^6)|(l^4)) ) {			// 4x6 * 4x4

				// loads the four rows of m2 into xmm0-xmm3
				#define MUL_4xN_4x4_INIT								\
					__asm mov		esi, m2Ptr							\
					__asm mov		edi, m1Ptr							\
					__asm mov		eax, dstPtr							\
					__asm movaps	xmm0, [esi]							\
					__asm movaps	xmm1, [esi+16]						\
					__asm movaps	xmm2, [esi+32]						\
					__asm movaps	xmm3, [esi+48]

				// computes one row of dst (4 floats): each element of column 'row' of m1
				// is broadcast and multiplied with the matching row of m2
				#define MUL_4xN_4x4_ROW( N, row )						\
					__asm movss		xmm7, [edi+(row+0*N)*4]				\
					__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm7, xmm0							\
					__asm movss		xmm6, [edi+(row+1*N)*4]				\
					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm6, xmm1							\
					__asm addps		xmm7, xmm6							\
					__asm movss		xmm6, [edi+(row+2*N)*4]				\
					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm6, xmm2							\
					__asm addps		xmm7, xmm6							\
					__asm movss		xmm6, [edi+(row+3*N)*4]				\
					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm6, xmm3							\
					__asm addps		xmm7, xmm6							\
					__asm movaps	[eax+row*16], xmm7

				MUL_4xN_4x4_INIT
				MUL_4xN_4x4_ROW( 6, 0 )
				MUL_4xN_4x4_ROW( 6, 1 )
				MUL_4xN_4x4_ROW( 6, 2 )
				MUL_4xN_4x4_ROW( 6, 3 )
				MUL_4xN_4x4_ROW( 6, 4 )
				MUL_4xN_4x4_ROW( 6, 5 )

				return;
			}
			// generic 4 row fallback
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
							m1Ptr[3*k] * m2Ptr[3*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 5:
			if ( !((k^6)|(l^5)) ) {			// 5x6 * 5x5

				// loads the first four columns of each of the five rows of m2 into xmm0-xmm4
				#define MUL_5xN_5x5_INIT								\
					__asm mov		esi, m2Ptr							\
					__asm mov		edi, m1Ptr							\
					__asm mov		eax, dstPtr							\
					__asm movlps	xmm0, [esi+ 0*4]					\
					__asm movhps	xmm0, [esi+ 2*4]					\
					__asm movlps	xmm1, [esi+ 5*4]					\
					__asm movhps	xmm1, [esi+ 7*4]					\
					__asm movlps	xmm2, [esi+10*4]					\
					__asm movhps	xmm2, [esi+12*4]					\
					__asm movlps	xmm3, [esi+15*4]					\
					__asm movhps	xmm3, [esi+17*4]					\
					__asm movlps	xmm4, [esi+20*4]					\
					__asm movhps	xmm4, [esi+22*4]

				// computes one row of dst (5 floats): the first four columns are accumulated
				// in xmm6 with SSE, the fifth column is accumulated on the x87 stack
				// (fld/fmul/faddp) and stored with fstp
				#define MUL_5xN_5x5_ROW( N, row )						\
					__asm movss		xmm6, [edi+(row+0*N)*4]				\
					__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm6, xmm0							\
					__asm fld		dword ptr [edi+(row+0*N)*4]			\
					__asm fmul		dword ptr [esi+ 4*4]				\
					__asm movss		xmm5, [edi+(row+1*N)*4]				\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm5, xmm1							\
					__asm addps		xmm6, xmm5							\
					__asm fld		dword ptr [edi+(row+1*N)*4]			\
					__asm fmul		dword ptr [esi+ 9*4]				\
					__asm faddp		st(1),st							\
					__asm movss		xmm5, [edi+(row+2*N)*4]				\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm5, xmm2							\
					__asm addps		xmm6, xmm5							\
					__asm fld		dword ptr [edi+(row+2*N)*4]			\
					__asm fmul		dword ptr [esi+14*4]				\
					__asm faddp		st(1),st							\
					__asm movss		xmm5, [edi+(row+3*N)*4]				\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm5, xmm3							\
					__asm addps		xmm6, xmm5							\
					__asm fld		dword ptr [edi+(row+3*N)*4]			\
					__asm fmul		dword ptr [esi+19*4]				\
					__asm faddp		st(1),st							\
					__asm movss		xmm5, [edi+(row+4*N)*4]				\
					__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )	\
					__asm mulps		xmm5, xmm4							\
					__asm addps		xmm6, xmm5							\
					__asm fld		dword ptr [edi+(row+4*N)*4]			\
					__asm fmul		dword ptr [esi+24*4]				\
					__asm faddp		st(1),st							\
					__asm fstp		dword ptr [eax+(row*5+4)*4]			\
					__asm movlps	[eax+(row*5+0)*4], xmm6				\
					__asm movhps	[eax+(row*5+2)*4], xmm6

				MUL_5xN_5x5_INIT
				MUL_5xN_5x5_ROW( 6, 0 )
				MUL_5xN_5x5_ROW( 6, 1 )
				MUL_5xN_5x5_ROW( 6, 2 )
				MUL_5xN_5x5_ROW( 6, 3 )
				MUL_5xN_5x5_ROW( 6, 4 )
				MUL_5xN_5x5_ROW( 6, 5 )

				return;
			}
			// generic 5 row fallback
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
							m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		case 6:
			if ( !(l^6) ) {
				// m2 is 6x6; the result is computed in two passes over m1:
				// first columns 0-3 of every dst row, then columns 4-5
				switch( k ) {
					case 1: {		// 6x1 * 6x6
						// loads columns 0-3 of each of the six rows of m2 into xmm0-xmm5
						#define MUL_6xN_6x6_FIRST4COLUMNS_INIT					\
							__asm mov		esi, m2Ptr							\
							__asm mov		edi, m1Ptr							\
							__asm mov		eax, dstPtr							\
							__asm movlps	xmm0, [esi+ 0*4]					\
							__asm movhps	xmm0, [esi+ 2*4]					\
							__asm movlps	xmm1, [esi+ 6*4]					\
							__asm movhps	xmm1, [esi+ 8*4]					\
							__asm movlps	xmm2, [esi+12*4]					\
							__asm movhps	xmm2, [esi+14*4]					\
							__asm movlps	xmm3, [esi+18*4]					\
							__asm movhps	xmm3, [esi+20*4]					\
							__asm movlps	xmm4, [esi+24*4]					\
							__asm movhps	xmm4, [esi+26*4]					\
							__asm movlps	xmm5, [esi+30*4]					\
							__asm movhps	xmm5, [esi+32*4]

						// computes columns 0-3 of dst row 'row'; N is the number of columns of m1
						#define MUL_6xN_6x6_FIRST4COLUMNS_ROW( N, row )			\
							__asm movss		xmm7, [edi+(row+0*N)*4]				\
							__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm7, xmm0							\
							__asm movss		xmm6, [edi+(row+1*N)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm1							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(row+2*N)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm2							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(row+3*N)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm3							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(row+4*N)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm4							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(row+5*N)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm5							\
							__asm addps		xmm7, xmm6							\
							__asm movlps	[eax+(row*6+0)*4], xmm7				\
							__asm movhps	[eax+(row*6+2)*4], xmm7

						// reloads xmm0-xmm5 with columns 4-5 of each row of m2,
						// duplicated into the low and high half of the register
						#define MUL_6xN_6x6_LAST2COLUMNS_INIT					\
							__asm movlps	xmm0, [esi+ 4*4]					\
							__asm movlps	xmm1, [esi+10*4]					\
							__asm shufps	xmm0, xmm0, R_SHUFFLEPS( 0, 1, 0, 1 )	\
							__asm shufps	xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 1 )	\
							__asm movlps	xmm2, [esi+16*4]					\
							__asm movlps	xmm3, [esi+22*4]					\
							__asm shufps	xmm2, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )	\
							__asm shufps	xmm3, xmm3, R_SHUFFLEPS( 0, 1, 0, 1 )	\
							__asm movlps	xmm4, [esi+28*4]					\
							__asm movlps	xmm5, [esi+34*4]					\
							__asm shufps	xmm4, xmm4, R_SHUFFLEPS( 0, 1, 0, 1 )	\
							__asm shufps	xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )

						// computes columns 4-5 of the two dst rows row*2 and row*2+1
						#define MUL_6xN_6x6_LAST2COLUMNS_ROW2( N, row )			\
							__asm movlps	xmm7, [edi+(row*2+0*N)*4]			\
							__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 1, 1 )	\
							__asm mulps		xmm7, xmm0							\
							__asm movlps	xmm6, [edi+(row*2+1*N)*4]			\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
							__asm mulps		xmm6, xmm1							\
							__asm addps		xmm7, xmm6							\
							__asm movlps	xmm6, [edi+(row*2+2*N)*4]			\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
							__asm mulps		xmm6, xmm2							\
							__asm addps		xmm7, xmm6							\
							__asm movlps	xmm6, [edi+(row*2+3*N)*4]			\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
							__asm mulps		xmm6, xmm3							\
							__asm addps		xmm7, xmm6							\
							__asm movlps	xmm6, [edi+(row*2+4*N)*4]			\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
							__asm mulps		xmm6, xmm4							\
							__asm addps		xmm7, xmm6							\
							__asm movlps	xmm6, [edi+(row*2+5*N)*4]			\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 1, 1 )	\
							__asm mulps		xmm6, xmm5							\
							__asm addps		xmm7, xmm6							\
							__asm movlps	[eax+(row*12+ 4)*4], xmm7			\
							__asm movhps	[eax+(row*12+10)*4], xmm7

						// computes columns 4-5 of a single dst row; the m1 elements are always
						// read from the last column of m1 (index N-1), 'row' only selects where
						// the result is stored, so it must be invoked with row == N-1
						#define MUL_6xN_6x6_LAST2COLUMNS_ROW( N, row )			\
							__asm movss		xmm7, [edi+(1*N-1)*4]				\
							__asm shufps	xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm7, xmm0							\
							__asm movss		xmm6, [edi+(2*N-1)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm1							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(3*N-1)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm2							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(4*N-1)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm3							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(5*N-1)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm4							\
							__asm addps		xmm7, xmm6							\
							__asm movss		xmm6, [edi+(6*N-1)*4]				\
							__asm shufps	xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )	\
							__asm mulps		xmm6, xmm5							\
							__asm addps		xmm7, xmm6							\
							__asm movlps	[eax+(row*6+4)*4], xmm7

						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 1, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW( 1, 0 )

						return;
					}
					case 2: {		// 6x2 * 6x6

						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 2, 1 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 2, 0 )

						return;
					}
					case 3: {		// 6x3 * 6x6

						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 3, 2 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 3, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW( 3, 2 )

						return;
					}
					case 4: {		// 6x4 * 6x6

						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 2 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 4, 3 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 4, 1 )

						return;
					}
					case 5: {		// 6x5 * 6x6

						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 2 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 3 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 5, 4 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 5, 1 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW( 5, 4 )

						return;
					}
					case 6: {		// 6x6 * 6x6

						MUL_6xN_6x6_FIRST4COLUMNS_INIT
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 0 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 1 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 2 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 3 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 4 )
						MUL_6xN_6x6_FIRST4COLUMNS_ROW( 6, 5 )
						MUL_6xN_6x6_LAST2COLUMNS_INIT
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 0 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 1 )
						MUL_6xN_6x6_LAST2COLUMNS_ROW2( 6, 2 )

						return;
					}
				}
			}
			// generic 6 row fallback (also reached for 6xK * 6x6 with K outside [1-6])
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
							m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		default:
			// any other row count: straightforward triple loop, accumulating in double precision
			for ( i = 0; i < k; i++ ) {
				for ( j = 0; j < l; j++ ) {
					m1Ptr = m1.ToFloatPtr() + i;
					m2Ptr = m2.ToFloatPtr() + j;
					sum = m1Ptr[0] * m2Ptr[0];
					for ( n = 1; n < m1.GetNumRows(); n++ ) {
						m1Ptr += k;
						m2Ptr += l;
						sum += m1Ptr[0] * m2Ptr[0];
					}
					*dstPtr++ = sum;
				}
			}
			break;
	}
}
|
|
|
|
/*
============
idSIMD_SSE::MatX_LowerTriangularSolve

	solves x in Lx = b for the n * n sub-matrix of L
	if skip > 0 the first skip elements of x are assumed to be valid already
	L has to be a lower triangular matrix with (implicit) ones on the diagonal
	x == b is allowed
============
*/
|
|
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
	// Forward substitution: x[i] = b[i] - sum over j < i of L[i][j] * x[j], for
	// i = skip .. n-1. The diagonal of L is never read. x[0 .. skip-1] are read as
	// already solved values.
	//
	//	L		matrix providing the coefficients; rows are nc = L.GetNumColumns() floats apart
	//	x		receives the solution
	//	b		right hand side
	//	n		number of unknowns
	//	skip	number of leading elements of x that are not recomputed
	//
	// For n < 8 a fully unrolled C switch is used. Otherwise the (up to) first four
	// rows are done in C and the remaining rows by an SSE dot product loop.
	int nc;
	const float *lptr;

	if ( skip >= n ) {
		return;
	}

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		// packs n and skip into one switch value; the cases of one n deliberately fall
		// through so that execution starts at row 'skip' and runs up to row n-1
		#define NSKIP( n, s )	((n<<3)|(s&7))
		switch( NSKIP( n, skip ) ) {
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows
	// (deliberate fall through; a skip of 4 or more matches no case and is left unchanged)
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				skip = 4;
	}

	// presumably the start of row 'skip' of L - idMatX::operator[] is not visible here
	lptr = L[skip];

	// this code assumes n > 4
	//
	// register use inside the loops:
	//	eax = i * 4, byte offset of the row currently being solved
	//	edx = n * 4
	//	esi = &x[i]
	//	edi = lptr + i floats, advanced by one row plus one float per iteration
	//	ebx = b (saved and restored around the block)
	//	ecx = scratch: negative byte offset walking over the elements 0 .. i-1
	//	xmm0 = running dot product of x[0 .. i-1] with the matching elements of L
	__asm {
		push		ebx
		mov			eax, skip				// eax = i
		shl			eax, 2					// eax = i*4
		mov			edx, n					// edx = n
		shl			edx, 2					// edx = n*4
		mov			esi, x					// esi = x
		mov			edi, lptr				// edi = lptr
		add			esi, eax
		add			edi, eax
		mov			ebx, b					// ebx = b

		// check for aligned memory
		// the row stride in bytes, esi and edi must all be multiples of 16 to take the movaps path
		// NOTE(review): esi and edi have already been advanced by 'skip' floats at this
		// point while the movaps loads below start at element 0, so the test only
		// reflects the alignment of those loads when skip is a multiple of 4
		mov			ecx, nc
		shl			ecx, 2
		or			ecx, esi
		or			ecx, edi
		and			ecx, 15
		jnz			loopurow

		// aligned
	looprow:
		mov			ecx, eax
		neg			ecx
		movaps		xmm0, [esi+ecx]			// elements 0-3
		mulps		xmm0, [edi+ecx]
		add			ecx, 12*4
		jg			donedot8
	dot8:									// 8 elements per iteration
		movaps		xmm1, [esi+ecx-(8*4)]
		mulps		xmm1, [edi+ecx-(8*4)]
		addps		xmm0, xmm1
		movaps		xmm3, [esi+ecx-(4*4)]
		mulps		xmm3, [edi+ecx-(4*4)]
		addps		xmm0, xmm3
		add			ecx, 8*4
		jle			dot8
	donedot8:
		sub			ecx, 4*4
		jg			donedot4
		//dot4:								// one more group of 4 elements
		movaps		xmm1, [esi+ecx-(4*4)]
		mulps		xmm1, [edi+ecx-(4*4)]
		addps		xmm0, xmm1
		add			ecx, 4*4
	donedot4:								// add the four partial sums into the low element of xmm0
		movhlps		xmm1, xmm0
		addps		xmm0, xmm1
		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss		xmm0, xmm1
		sub			ecx, 4*4				// dispatch on the 0-3 left over elements
		jz			dot0
		add			ecx, 4
		jz			dot1
		add			ecx, 4
		jz			dot2
		//dot3:
		movss		xmm1, [esi-(3*4)]
		mulss		xmm1, [edi-(3*4)]
		addss		xmm0, xmm1
	dot2:
		movss		xmm3, [esi-(2*4)]
		mulss		xmm3, [edi-(2*4)]
		addss		xmm0, xmm3
	dot1:
		movss		xmm5, [esi-(1*4)]
		mulss		xmm5, [edi-(1*4)]
		addss		xmm0, xmm5
	dot0:
		movss		xmm1, [ebx+eax]			// x[i] = b[i] - dot product
		subss		xmm1, xmm0
		movss		[esi], xmm1
		add			eax, 4
		cmp			eax, edx
		jge			done
		add			esi, 4
		mov			ecx, nc					// advance edi by one row plus one float
		shl			ecx, 2
		add			edi, ecx
		add			edi, 4
		jmp			looprow

		// unaligned
		// same algorithm as above using movups loads into registers
	loopurow:
		mov			ecx, eax
		neg			ecx
		movups		xmm0, [esi+ecx]
		movups		xmm1, [edi+ecx]
		mulps		xmm0, xmm1
		add			ecx, 12*4
		jg			doneudot8
	udot8:
		movups		xmm1, [esi+ecx-(8*4)]
		movups		xmm2, [edi+ecx-(8*4)]
		mulps		xmm1, xmm2
		addps		xmm0, xmm1
		movups		xmm3, [esi+ecx-(4*4)]
		movups		xmm4, [edi+ecx-(4*4)]
		mulps		xmm3, xmm4
		addps		xmm0, xmm3
		add			ecx, 8*4
		jle			udot8
	doneudot8:
		sub			ecx, 4*4
		jg			doneudot4
		//udot4:
		movups		xmm1, [esi+ecx-(4*4)]
		movups		xmm2, [edi+ecx-(4*4)]
		mulps		xmm1, xmm2
		addps		xmm0, xmm1
		add			ecx, 4*4
	doneudot4:
		movhlps		xmm1, xmm0
		addps		xmm0, xmm1
		movaps		xmm1, xmm0
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 0, 0, 0 )
		addss		xmm0, xmm1
		sub			ecx, 4*4
		jz			udot0
		add			ecx, 4
		jz			udot1
		add			ecx, 4
		jz			udot2
		//udot3:
		movss		xmm1, [esi-(3*4)]
		movss		xmm2, [edi-(3*4)]
		mulss		xmm1, xmm2
		addss		xmm0, xmm1
	udot2:
		movss		xmm3, [esi-(2*4)]
		movss		xmm4, [edi-(2*4)]
		mulss		xmm3, xmm4
		addss		xmm0, xmm3
	udot1:
		movss		xmm5, [esi-(1*4)]
		movss		xmm6, [edi-(1*4)]
		mulss		xmm5, xmm6
		addss		xmm0, xmm5
	udot0:
		movss		xmm1, [ebx+eax]
		subss		xmm1, xmm0
		movss		[esi], xmm1
		add			eax, 4
		cmp			eax, edx
		jge			done
		add			esi, 4
		mov			ecx, nc
		shl			ecx, 2
		add			edi, ecx
		add			edi, 4
		jmp			loopurow
	done:
		pop			ebx
	}
}
|
|
|
|
/*
============
idSIMD_SSE::MatX_LowerTriangularSolveTranspose

	solves x in L'x = b for the n * n sub-matrix of L
	L has to be a lower triangular matrix with (implicit) ones on the diagonal
	x == b is allowed
============
*/
|
|
void VPCALL idSIMD_SSE::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
lptr = L.ToFloatPtr();
|
|
nc = L.GetNumColumns();
|
|
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
switch( n ) {
|
|
case 0:
|
|
return;
|
|
case 1:
|
|
x[0] = b[0];
|
|
return;
|
|
case 2:
|
|
x[1] = b[1];
|
|
x[0] = b[0] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 3:
|
|
x[2] = b[2];
|
|
x[1] = b[1] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 4:
|
|
x[3] = b[3];
|
|
x[2] = b[2] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 5:
|
|
x[4] = b[4];
|
|
x[3] = b[3] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 6:
|
|
x[5] = b[5];
|
|
x[4] = b[4] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 7:
|
|
x[6] = b[6];
|
|
x[5] = b[5] - lptr[6*nc+5] * x[6];
|
|
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
#if 1
|
|
|
|
int i, j, m;
|
|
float *xptr;
|
|
double s0;
|
|
|
|
// if the number of columns is not a multiple of 2 we're screwed for alignment.
|
|
// however, if the number of columns is a multiple of 2 but the number of to be
|
|
// processed rows is not a multiple of 2 we can still run 8 byte aligned
|
|
m = n;
|
|
if ( m & 1 ) {
|
|
|
|
m--;
|
|
x[m] = b[m];
|
|
|
|
lptr = L.ToFloatPtr() + m * nc + m - 4;
|
|
xptr = x + m;
|
|
__asm {
|
|
push ebx
|
|
mov eax, m // eax = i
|
|
mov esi, xptr // esi = xptr
|
|
mov edi, lptr // edi = lptr
|
|
mov ebx, b // ebx = b
|
|
mov edx, nc // edx = nc*sizeof(float)
|
|
shl edx, 2
|
|
process4rows_1:
|
|
movlps xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
|
|
movhps xmm0, [ebx+eax*4-8] // load b[i-4], b[i-3]
|
|
xor ecx, ecx
|
|
sub eax, m
|
|
neg eax
|
|
jz done4x4_1
|
|
process4x4_1: // process 4x4 blocks
|
|
movlps xmm2, [edi+0]
|
|
movhps xmm2, [edi+8]
|
|
add edi, edx
|
|
movss xmm1, [esi+4*ecx+0]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm3, [edi+0]
|
|
movhps xmm3, [edi+8]
|
|
add edi, edx
|
|
mulps xmm1, xmm2
|
|
subps xmm0, xmm1
|
|
movss xmm1, [esi+4*ecx+4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm4, [edi+0]
|
|
movhps xmm4, [edi+8]
|
|
add edi, edx
|
|
mulps xmm1, xmm3
|
|
subps xmm0, xmm1
|
|
movss xmm1, [esi+4*ecx+8]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm5, [edi+0]
|
|
movhps xmm5, [edi+8]
|
|
add edi, edx
|
|
mulps xmm1, xmm4
|
|
subps xmm0, xmm1
|
|
movss xmm1, [esi+4*ecx+12]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
add ecx, 4
|
|
cmp ecx, eax
|
|
mulps xmm1, xmm5
|
|
subps xmm0, xmm1
|
|
jl process4x4_1
|
|
done4x4_1: // process left over of the 4 rows
|
|
movlps xmm2, [edi+0]
|
|
movhps xmm2, [edi+8]
|
|
movss xmm1, [esi+4*ecx]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm1, xmm2
|
|
subps xmm0, xmm1
|
|
imul ecx, edx
|
|
sub edi, ecx
|
|
neg eax
|
|
|
|
add eax, m
|
|
sub eax, 4
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
movaps xmm2, xmm0
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
|
|
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
|
|
sub edi, edx
|
|
movss [esi-4], xmm3 // xptr[-1] = s3
|
|
movss xmm4, xmm3
|
|
movss xmm5, xmm3
|
|
mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
|
|
mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
|
|
mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
|
|
subss xmm2, xmm3
|
|
movss [esi-8], xmm2 // xptr[-2] = s2
|
|
movss xmm6, xmm2
|
|
sub edi, edx
|
|
subss xmm0, xmm5
|
|
subss xmm1, xmm4
|
|
mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
|
|
mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
|
|
subss xmm1, xmm2
|
|
movss [esi-12], xmm1 // xptr[-3] = s1
|
|
subss xmm0, xmm6
|
|
sub edi, edx
|
|
cmp eax, 4
|
|
mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
|
|
subss xmm0, xmm1
|
|
movss [esi-16], xmm0 // xptr[-4] = s0
|
|
jl done4rows_1
|
|
sub edi, edx
|
|
sub edi, 16
|
|
sub esi, 16
|
|
jmp process4rows_1
|
|
done4rows_1:
|
|
pop ebx
|
|
}
|
|
|
|
} else {
|
|
|
|
lptr = L.ToFloatPtr() + m * nc + m - 4;
|
|
xptr = x + m;
|
|
__asm {
|
|
push ebx
|
|
mov eax, m // eax = i
|
|
mov esi, xptr // esi = xptr
|
|
mov edi, lptr // edi = lptr
|
|
mov ebx, b // ebx = b
|
|
mov edx, nc // edx = nc*sizeof(float)
|
|
shl edx, 2
|
|
process4rows:
|
|
movlps xmm0, [ebx+eax*4-16] // load b[i-2], b[i-1]
|
|
movhps xmm0, [ebx+eax*4-8] // load b[i-4], b[i-3]
|
|
sub eax, m
|
|
jz done4x4
|
|
neg eax
|
|
xor ecx, ecx
|
|
process4x4: // process 4x4 blocks
|
|
movlps xmm2, [edi+0]
|
|
movhps xmm2, [edi+8]
|
|
add edi, edx
|
|
movss xmm1, [esi+4*ecx+0]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm3, [edi+0]
|
|
movhps xmm3, [edi+8]
|
|
add edi, edx
|
|
mulps xmm1, xmm2
|
|
subps xmm0, xmm1
|
|
movss xmm1, [esi+4*ecx+4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm4, [edi+0]
|
|
movhps xmm4, [edi+8]
|
|
add edi, edx
|
|
mulps xmm1, xmm3
|
|
subps xmm0, xmm1
|
|
movss xmm1, [esi+4*ecx+8]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps xmm5, [edi+0]
|
|
movhps xmm5, [edi+8]
|
|
add edi, edx
|
|
mulps xmm1, xmm4
|
|
subps xmm0, xmm1
|
|
movss xmm1, [esi+4*ecx+12]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
add ecx, 4
|
|
cmp ecx, eax
|
|
mulps xmm1, xmm5
|
|
subps xmm0, xmm1
|
|
jl process4x4
|
|
imul ecx, edx
|
|
sub edi, ecx
|
|
neg eax
|
|
done4x4: // process left over of the 4 rows
|
|
add eax, m
|
|
sub eax, 4
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
movaps xmm2, xmm0
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 2, 2, 2 )
|
|
movaps xmm3, xmm0
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 3, 3, 3, 3 )
|
|
sub edi, edx
|
|
movss [esi-4], xmm3 // xptr[-1] = s3
|
|
movss xmm4, xmm3
|
|
movss xmm5, xmm3
|
|
mulss xmm3, [edi+8] // lptr[-1*nc+2] * s3
|
|
mulss xmm4, [edi+4] // lptr[-1*nc+1] * s3
|
|
mulss xmm5, [edi+0] // lptr[-1*nc+0] * s3
|
|
subss xmm2, xmm3
|
|
movss [esi-8], xmm2 // xptr[-2] = s2
|
|
movss xmm6, xmm2
|
|
sub edi, edx
|
|
subss xmm0, xmm5
|
|
subss xmm1, xmm4
|
|
mulss xmm2, [edi+4] // lptr[-2*nc+1] * s2
|
|
mulss xmm6, [edi+0] // lptr[-2*nc+0] * s2
|
|
subss xmm1, xmm2
|
|
movss [esi-12], xmm1 // xptr[-3] = s1
|
|
subss xmm0, xmm6
|
|
sub edi, edx
|
|
cmp eax, 4
|
|
mulss xmm1, [edi+0] // lptr[-3*nc+0] * s1
|
|
subss xmm0, xmm1
|
|
movss [esi-16], xmm0 // xptr[-4] = s0
|
|
jl done4rows
|
|
sub edi, edx
|
|
sub edi, 16
|
|
sub esi, 16
|
|
jmp process4rows
|
|
done4rows:
|
|
pop ebx
|
|
}
|
|
}
|
|
|
|
// process left over rows
|
|
for ( i = (m&3)-1; i >= 0; i-- ) {
|
|
s0 = b[i];
|
|
lptr = L[0] + i;
|
|
for ( j = i + 1; j < n; j++ ) {
|
|
s0 -= lptr[j*nc] * x[j];
|
|
}
|
|
x[i] = s0;
|
|
}
|
|
|
|
#else
|
|
|
|
int i, j, m;
|
|
double s0, s1, s2, s3, t;
|
|
const float *lptr2;
|
|
float *xptr, *xptr2;
|
|
|
|
m = n;
|
|
if ( m & 1 ) {
|
|
|
|
m--;
|
|
x[m] = b[m];
|
|
|
|
lptr = L.ToFloatPtr() + m * nc + m - 4;
|
|
xptr = x + m;
|
|
// process 4 rows at a time
|
|
for ( i = m; i >= 4; i -= 4 ) {
|
|
s0 = b[i-4];
|
|
s1 = b[i-3];
|
|
s2 = b[i-2];
|
|
s3 = b[i-1];
|
|
// process 4x4 blocks
|
|
xptr2 = xptr; // x + i;
|
|
lptr2 = lptr; // ptr = L[i] + i - 4;
|
|
for ( j = 0; j < m-i; j += 4 ) {
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
}
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
// process left over of the 4 rows
|
|
lptr -= nc;
|
|
s0 -= lptr[0] * s3;
|
|
s1 -= lptr[1] * s3;
|
|
s2 -= lptr[2] * s3;
|
|
lptr -= nc;
|
|
s0 -= lptr[0] * s2;
|
|
s1 -= lptr[1] * s2;
|
|
lptr -= nc;
|
|
s0 -= lptr[0] * s1;
|
|
lptr -= nc;
|
|
// store result
|
|
xptr[-4] = s0;
|
|
xptr[-3] = s1;
|
|
xptr[-2] = s2;
|
|
xptr[-1] = s3;
|
|
// update pointers for next four rows
|
|
lptr -= 4;
|
|
xptr -= 4;
|
|
}
|
|
|
|
} else {
|
|
|
|
lptr = L.ToFloatPtr() + m * nc + m - 4;
|
|
xptr = x + m;
|
|
// process 4 rows at a time
|
|
for ( i = m; i >= 4; i -= 4 ) {
|
|
s0 = b[i-4];
|
|
s1 = b[i-3];
|
|
s2 = b[i-2];
|
|
s3 = b[i-1];
|
|
// process 4x4 blocks
|
|
xptr2 = xptr; // x + i;
|
|
lptr2 = lptr; // ptr = L[i] + i - 4;
|
|
for ( j = 0; j < m-i; j += 4 ) {
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
t = xptr2[0];
|
|
s0 -= lptr2[0] * t;
|
|
s1 -= lptr2[1] * t;
|
|
s2 -= lptr2[2] * t;
|
|
s3 -= lptr2[3] * t;
|
|
lptr2 += nc;
|
|
xptr2++;
|
|
}
|
|
// process left over of the 4 rows
|
|
lptr -= nc;
|
|
s0 -= lptr[0] * s3;
|
|
s1 -= lptr[1] * s3;
|
|
s2 -= lptr[2] * s3;
|
|
lptr -= nc;
|
|
s0 -= lptr[0] * s2;
|
|
s1 -= lptr[1] * s2;
|
|
lptr -= nc;
|
|
s0 -= lptr[0] * s1;
|
|
lptr -= nc;
|
|
// store result
|
|
xptr[-4] = s0;
|
|
xptr[-3] = s1;
|
|
xptr[-2] = s2;
|
|
xptr[-1] = s3;
|
|
// update pointers for next four rows
|
|
lptr -= 4;
|
|
xptr -= 4;
|
|
}
|
|
}
|
|
// process left over rows
|
|
for ( i--; i >= 0; i-- ) {
|
|
s0 = b[i];
|
|
lptr = L[0] + i;
|
|
for ( j = i + 1; j < m; j++ ) {
|
|
s0 -= lptr[j*nc] * x[j];
|
|
}
|
|
x[i] = s0;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MatX_LDLTFactor
|
|
|
|
in-place factorization LDL' of the n * n sub-matrix of mat
|
|
the reciprocal of the diagonal elements are stored in invDiag
|
|
currently assumes the number of columns of mat is a multiple of 4
|
|
============
|
|
*/
|
|
bool VPCALL idSIMD_SSE::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
|
|
#if 1
|
|
|
|
int j, nc;
|
|
float *v, *diag, *invDiagPtr, *mptr;
|
|
double s0, s1, s2, sum, d;
|
|
|
|
v = (float *) _alloca16( n * sizeof( float ) );
|
|
diag = (float *) _alloca16( n * sizeof( float ) );
|
|
invDiagPtr = invDiag.ToFloatPtr();
|
|
|
|
nc = mat.GetNumColumns();
|
|
|
|
assert( ( nc & 3 ) == 0 );
|
|
|
|
if ( n <= 0 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
|
|
sum = mptr[0];
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
diag[0] = sum;
|
|
invDiagPtr[0] = d = 1.0f / sum;
|
|
|
|
if ( n <= 1 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 1; j < n; j++ ) {
|
|
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[1];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
sum = mptr[1] - s0;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[1][1] = sum;
|
|
diag[1] = sum;
|
|
invDiagPtr[1] = d = 1.0f / sum;
|
|
|
|
if ( n <= 2 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 2; j < n; j++ ) {
|
|
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[2];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
sum = mptr[2] - s0 - s1;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[2][2] = sum;
|
|
diag[2] = sum;
|
|
invDiagPtr[2] = d = 1.0f / sum;
|
|
|
|
if ( n <= 3 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 3; j < n; j++ ) {
|
|
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
|
|
}
|
|
|
|
mptr = mat[3];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
sum = mptr[3] - s0 - s1 - s2;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[3][3] = sum;
|
|
diag[3] = sum;
|
|
invDiagPtr[3] = d = 1.0f / sum;
|
|
|
|
if ( n <= 4 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 4; j < n; j++ ) {
|
|
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
|
|
}
|
|
|
|
int ncf = nc * sizeof( float );
|
|
mptr = mat[0];
|
|
|
|
__asm {
|
|
xorps xmm2, xmm2
|
|
xorps xmm3, xmm3
|
|
xorps xmm4, xmm4
|
|
|
|
push ebx
|
|
mov ebx, 4
|
|
|
|
loopRow:
|
|
cmp ebx, n
|
|
jge done
|
|
|
|
mov ecx, ebx // esi = i
|
|
shl ecx, 2 // esi = i * 4
|
|
mov edx, diag // edx = diag
|
|
add edx, ecx // edx = &diag[i]
|
|
mov edi, ebx // edi = i
|
|
imul edi, ncf // edi = i * nc * sizeof( float )
|
|
add edi, mptr // edi = mat[i]
|
|
add edi, ecx // edi = &mat[i][i]
|
|
mov esi, v // ecx = v
|
|
add esi, ecx // ecx = &v[i]
|
|
mov eax, invDiagPtr // eax = invDiagPtr
|
|
add eax, ecx // eax = &invDiagPtr[i]
|
|
neg ecx
|
|
|
|
movaps xmm0, [edx+ecx]
|
|
mulps xmm0, [edi+ecx]
|
|
movaps [esi+ecx], xmm0
|
|
mulps xmm0, [edi+ecx]
|
|
add ecx, 12*4
|
|
jg doneDot8
|
|
dot8:
|
|
movaps xmm1, [edx+ecx-(8*4)]
|
|
mulps xmm1, [edi+ecx-(8*4)]
|
|
movaps [esi+ecx-(8*4)], xmm1
|
|
mulps xmm1, [edi+ecx-(8*4)]
|
|
addps xmm0, xmm1
|
|
movaps xmm2, [edx+ecx-(4*4)]
|
|
mulps xmm2, [edi+ecx-(4*4)]
|
|
movaps [esi+ecx-(4*4)], xmm2
|
|
mulps xmm2, [edi+ecx-(4*4)]
|
|
addps xmm0, xmm2
|
|
add ecx, 8*4
|
|
jle dot8
|
|
doneDot8:
|
|
sub ecx, 4*4
|
|
jg doneDot4
|
|
movaps xmm1, [edx+ecx-(4*4)]
|
|
mulps xmm1, [edi+ecx-(4*4)]
|
|
movaps [esi+ecx-(4*4)], xmm1
|
|
mulps xmm1, [edi+ecx-(4*4)]
|
|
addps xmm0, xmm1
|
|
add ecx, 4*4
|
|
doneDot4:
|
|
sub ecx, 2*4
|
|
jg doneDot2
|
|
movlps xmm3, [edx+ecx-(2*4)]
|
|
movlps xmm4, [edi+ecx-(2*4)]
|
|
mulps xmm3, xmm4
|
|
movlps [esi+ecx-(2*4)], xmm3
|
|
mulps xmm3, xmm4
|
|
addps xmm0, xmm3
|
|
add ecx, 2*4
|
|
doneDot2:
|
|
sub ecx, 1*4
|
|
jg doneDot1
|
|
movss xmm3, [edx+ecx-(1*4)]
|
|
movss xmm4, [edi+ecx-(1*4)]
|
|
mulss xmm3, xmm4
|
|
movss [esi+ecx-(1*4)], xmm3
|
|
mulss xmm3, xmm4
|
|
addss xmm0, xmm3
|
|
doneDot1:
|
|
movhlps xmm2, xmm0
|
|
addps xmm0, xmm2
|
|
movaps xmm2, xmm0
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
addss xmm0, xmm2
|
|
movss xmm1, [edi]
|
|
subss xmm1, xmm0
|
|
movss [edi], xmm1 // mptr[i] = sum;
|
|
movss [edx], xmm1 // diag[i] = sum;
|
|
|
|
// if ( sum == 0.0f ) return false;
|
|
movaps xmm2, xmm1
|
|
cmpeqss xmm2, SIMD_SP_zero
|
|
andps xmm2, SIMD_SP_tiny
|
|
orps xmm1, xmm2
|
|
|
|
rcpss xmm7, xmm1
|
|
mulss xmm1, xmm7
|
|
mulss xmm1, xmm7
|
|
addss xmm7, xmm7
|
|
subss xmm7, xmm1
|
|
movss [eax], xmm7 // invDiagPtr[i] = 1.0f / sum;
|
|
|
|
mov edx, n // edx = n
|
|
sub edx, ebx // edx = n - i
|
|
dec edx // edx = n - i - 1
|
|
jle doneSubRow // if ( i + 1 >= n ) return true;
|
|
|
|
mov eax, ebx // eax = i
|
|
shl eax, 2 // eax = i * 4
|
|
neg eax
|
|
|
|
loopSubRow:
|
|
add edi, ncf
|
|
mov ecx, eax
|
|
movaps xmm0, [esi+ecx]
|
|
mulps xmm0, [edi+ecx]
|
|
add ecx, 12*4
|
|
jg doneSubDot8
|
|
subDot8:
|
|
movaps xmm1, [esi+ecx-(8*4)]
|
|
mulps xmm1, [edi+ecx-(8*4)]
|
|
addps xmm0, xmm1
|
|
movaps xmm2, [esi+ecx-(4*4)]
|
|
mulps xmm2, [edi+ecx-(4*4)]
|
|
addps xmm0, xmm2
|
|
add ecx, 8*4
|
|
jle subDot8
|
|
doneSubDot8:
|
|
sub ecx, 4*4
|
|
jg doneSubDot4
|
|
movaps xmm1, [esi+ecx-(4*4)]
|
|
mulps xmm1, [edi+ecx-(4*4)]
|
|
addps xmm0, xmm1
|
|
add ecx, 4*4
|
|
doneSubDot4:
|
|
sub ecx, 2*4
|
|
jg doneSubDot2
|
|
movlps xmm3, [esi+ecx-(2*4)]
|
|
movlps xmm4, [edi+ecx-(2*4)]
|
|
mulps xmm3, xmm4
|
|
addps xmm0, xmm3
|
|
add ecx, 2*4
|
|
doneSubDot2:
|
|
sub ecx, 1*4
|
|
jg doneSubDot1
|
|
movss xmm3, [esi+ecx-(1*4)]
|
|
movss xmm4, [edi+ecx-(1*4)]
|
|
mulss xmm3, xmm4
|
|
addss xmm0, xmm3
|
|
doneSubDot1:
|
|
movhlps xmm2, xmm0
|
|
addps xmm0, xmm2
|
|
movaps xmm2, xmm0
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 0, 0, 0 )
|
|
addss xmm0, xmm2
|
|
movss xmm1, [edi]
|
|
subss xmm1, xmm0
|
|
mulss xmm1, xmm7
|
|
movss [edi], xmm1
|
|
dec edx
|
|
jg loopSubRow
|
|
doneSubRow:
|
|
inc ebx
|
|
jmp loopRow
|
|
done:
|
|
pop ebx
|
|
}
|
|
|
|
return true;
|
|
|
|
#else
|
|
|
|
int i, j, k, nc;
|
|
float *v, *diag, *mptr;
|
|
double s0, s1, s2, s3, sum, d;
|
|
|
|
v = (float *) _alloca16( n * sizeof( float ) );
|
|
diag = (float *) _alloca16( n * sizeof( float ) );
|
|
|
|
nc = mat.GetNumColumns();
|
|
|
|
if ( n <= 0 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
|
|
sum = mptr[0];
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
diag[0] = sum;
|
|
invDiag[0] = d = 1.0f / sum;
|
|
|
|
if ( n <= 1 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 1; j < n; j++ ) {
|
|
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[1];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
sum = mptr[1] - s0;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[1][1] = sum;
|
|
diag[1] = sum;
|
|
invDiag[1] = d = 1.0f / sum;
|
|
|
|
if ( n <= 2 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 2; j < n; j++ ) {
|
|
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[2];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
sum = mptr[2] - s0 - s1;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[2][2] = sum;
|
|
diag[2] = sum;
|
|
invDiag[2] = d = 1.0f / sum;
|
|
|
|
if ( n <= 3 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 3; j < n; j++ ) {
|
|
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
|
|
}
|
|
|
|
mptr = mat[3];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
sum = mptr[3] - s0 - s1 - s2;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[3][3] = sum;
|
|
diag[3] = sum;
|
|
invDiag[3] = d = 1.0f / sum;
|
|
|
|
if ( n <= 4 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 4; j < n; j++ ) {
|
|
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
|
|
}
|
|
|
|
for ( i = 4; i < n; i++ ) {
|
|
|
|
mptr = mat[i];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
|
|
for ( k = 4; k < i-3; k += 4 ) {
|
|
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
|
|
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
|
|
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
|
|
}
|
|
switch( i - k ) {
|
|
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
|
|
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
|
|
}
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
sum = mptr[i] - sum;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[i][i] = sum;
|
|
diag[i] = sum;
|
|
invDiag[i] = d = 1.0f / sum;
|
|
|
|
if ( i + 1 >= n ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[i+1];
|
|
for ( j = i+1; j < n; j++ ) {
|
|
s0 = mptr[0] * v[0];
|
|
s1 = mptr[1] * v[1];
|
|
s2 = mptr[2] * v[2];
|
|
s3 = mptr[3] * v[3];
|
|
for ( k = 4; k < i-7; k += 8 ) {
|
|
s0 += mptr[k+0] * v[k+0];
|
|
s1 += mptr[k+1] * v[k+1];
|
|
s2 += mptr[k+2] * v[k+2];
|
|
s3 += mptr[k+3] * v[k+3];
|
|
s0 += mptr[k+4] * v[k+4];
|
|
s1 += mptr[k+5] * v[k+5];
|
|
s2 += mptr[k+6] * v[k+6];
|
|
s3 += mptr[k+7] * v[k+7];
|
|
}
|
|
switch( i - k ) {
|
|
case 7: s0 += mptr[k+6] * v[k+6];
|
|
case 6: s1 += mptr[k+5] * v[k+5];
|
|
case 5: s2 += mptr[k+4] * v[k+4];
|
|
case 4: s3 += mptr[k+3] * v[k+3];
|
|
case 3: s0 += mptr[k+2] * v[k+2];
|
|
case 2: s1 += mptr[k+1] * v[k+1];
|
|
case 1: s2 += mptr[k+0] * v[k+0];
|
|
}
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
mptr[i] = ( mptr[i] - sum ) * d;
|
|
mptr += nc;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::BlendJoints
|
|
  Blends the joints selected by index[0..numJoints-1] towards blendJoints.
  lerp <= 0 leaves joints untouched, lerp >= 1 copies the selected blendJoints.
  Otherwise the translation is interpolated linearly and the quaternion is
  combined as scale0 * joint + scale1 * blend, where both scales come from
  sines of the angle between the two quaternions.
  Four joints are blended per iteration with SSE; the rest are blended one by one.
============
|
|
*/
|
|
// when defined, the rsqrtps / rcpps estimates in the asm below get an extra refinement step
#define REFINE_BLENDJOINTS_RECIPROCAL
|
|
|
|
void VPCALL idSIMD_SSE::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
|
|
int i;
|
|
|
|
// trivial cases: nothing to blend, or take the blend pose as is
if ( lerp <= 0.0f ) {
|
|
return;
|
|
} else if ( lerp >= 1.0f ) {
|
|
for ( i = 0; i < numJoints; i++ ) {
|
|
int j = index[i];
|
|
joints[j] = blendJoints[j];
|
|
}
|
|
return;
|
|
}
|
|
|
|
// blend four indexed joints per iteration
for ( i = 0; i <= numJoints - 4; i += 4 ) {
|
|
// one 16 byte aligned array per component, lane j belongs to joint index[i+j]
ALIGN16( float jointVert0[4] );
|
|
ALIGN16( float jointVert1[4] );
|
|
ALIGN16( float jointVert2[4] );
|
|
ALIGN16( float blendVert0[4] );
|
|
ALIGN16( float blendVert1[4] );
|
|
ALIGN16( float blendVert2[4] );
|
|
ALIGN16( float jointQuat0[4] );
|
|
ALIGN16( float jointQuat1[4] );
|
|
ALIGN16( float jointQuat2[4] );
|
|
ALIGN16( float jointQuat3[4] );
|
|
ALIGN16( float blendQuat0[4] );
|
|
ALIGN16( float blendQuat1[4] );
|
|
ALIGN16( float blendQuat2[4] );
|
|
ALIGN16( float blendQuat3[4] );
|
|
|
|
// gather the translation and quaternion components of the four joints
for ( int j = 0; j < 4; j++ ) {
|
|
int n = index[i+j];
|
|
|
|
jointVert0[j] = joints[n].t[0];
|
|
jointVert1[j] = joints[n].t[1];
|
|
jointVert2[j] = joints[n].t[2];
|
|
|
|
blendVert0[j] = blendJoints[n].t[0];
|
|
blendVert1[j] = blendJoints[n].t[1];
|
|
blendVert2[j] = blendJoints[n].t[2];
|
|
|
|
jointQuat0[j] = joints[n].q[0];
|
|
jointQuat1[j] = joints[n].q[1];
|
|
jointQuat2[j] = joints[n].q[2];
|
|
jointQuat3[j] = joints[n].q[3];
|
|
|
|
blendQuat0[j] = blendJoints[n].q[0];
|
|
blendQuat1[j] = blendJoints[n].q[1];
|
|
blendQuat2[j] = blendJoints[n].q[2];
|
|
blendQuat3[j] = blendJoints[n].q[3];
|
|
}
|
|
|
|
#if 1
|
|
__asm {
|
|
// lerp translation
|
|
// xmm7 = lerp in all four lanes; each component becomes joint + lerp * ( blend - joint )
movss xmm7, lerp
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movaps xmm0, blendVert0
|
|
subps xmm0, jointVert0
|
|
mulps xmm0, xmm7
|
|
addps xmm0, jointVert0
|
|
movaps jointVert0, xmm0
|
|
movaps xmm1, blendVert1
|
|
subps xmm1, jointVert1
|
|
mulps xmm1, xmm7
|
|
addps xmm1, jointVert1
|
|
movaps jointVert1, xmm1
|
|
movaps xmm2, blendVert2
|
|
subps xmm2, jointVert2
|
|
mulps xmm2, xmm7
|
|
addps xmm2, jointVert2
|
|
movaps jointVert2, xmm2
|
|
|
|
// lerp quaternions
|
|
// four-component dot product of joint and blend quaternion, one joint per lane
movaps xmm0, jointQuat0
|
|
mulps xmm0, blendQuat0
|
|
movaps xmm1, jointQuat1
|
|
mulps xmm1, blendQuat1
|
|
addps xmm0, xmm1
|
|
movaps xmm2, jointQuat2
|
|
mulps xmm2, blendQuat2
|
|
addps xmm0, xmm2
|
|
movaps xmm3, jointQuat3
|
|
mulps xmm3, blendQuat3
|
|
addps xmm0, xmm3 // xmm0 = cosom
|
|
|
|
// remember the sign of cosom in xmm1, strip it from xmm0, and square cosom in xmm2
movaps xmm1, xmm0
|
|
movaps xmm2, xmm0
|
|
andps xmm1, SIMD_SP_signBitMask // xmm1 = signBit
|
|
xorps xmm0, xmm1
|
|
mulps xmm2, xmm2
|
|
|
|
xorps xmm4, xmm4
|
|
movaps xmm3, SIMD_SP_one
|
|
subps xmm3, xmm2 // xmm3 = scale0
|
|
cmpeqps xmm4, xmm3
|
|
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
|
|
andps xmm3, SIMD_SP_absMask // make sure the values are positive
|
|
orps xmm3, xmm4
|
|
|
|
#ifdef REFINE_BLENDJOINTS_RECIPROCAL
|
|
// rsqrtps estimate r of 1 / sqrt( scale0 ), refined as ( scale0 * r * r - rsqrt_c0 ) * ( r * rsqrt_c1 )
// NOTE(review): the SIMD_SP_rsqrt_c0 / SIMD_SP_rsqrt_c1 values are defined outside this block - presumably a Newton-Raphson step, confirm
movaps xmm2, xmm3
|
|
rsqrtps xmm4, xmm2
|
|
mulps xmm2, xmm4
|
|
mulps xmm2, xmm4
|
|
subps xmm2, SIMD_SP_rsqrt_c0
|
|
mulps xmm4, SIMD_SP_rsqrt_c1
|
|
mulps xmm2, xmm4
|
|
#else
|
|
rsqrtps xmm2, xmm3 // xmm2 = sinom
|
|
#endif
|
|
mulps xmm3, xmm2 // xmm3 = sqrt( scale0 )
|
|
|
|
// omega0 = atan2( xmm3, xmm0 )
|
|
// xmm0 = smaller and xmm3 = larger of the two operands, xmm4 = mask of lanes where the old xmm0 is the smaller one
movaps xmm4, xmm0
|
|
minps xmm0, xmm3
|
|
maxps xmm3, xmm4
|
|
cmpeqps xmm4, xmm0
|
|
|
|
#ifdef REFINE_BLENDJOINTS_RECIPROCAL
|
|
// rcpps estimate r of the reciprocal, refined as 2 * r - xmm3 * r * r
rcpps xmm5, xmm3
|
|
mulps xmm3, xmm5
|
|
mulps xmm3, xmm5
|
|
addps xmm5, xmm5
|
|
subps xmm5, xmm3 // xmm5 = 1 / y or 1 / x
|
|
mulps xmm0, xmm5 // xmm0 = x / y or y / x
|
|
#else
|
|
rcpps xmm3, xmm3 // xmm3 = 1 / y or 1 / x
|
|
mulps xmm0, xmm3 // xmm0 = x / y or y / x
|
|
#endif
|
|
// in the masked lanes the ratio is negated and HALF_PI is added to the polynomial result below
movaps xmm3, xmm4
|
|
andps xmm3, SIMD_SP_signBitMask
|
|
xorps xmm0, xmm3 // xmm0 = -x / y or y / x
|
|
andps xmm4, SIMD_SP_halfPI // xmm4 = HALF_PI or 0.0f
|
|
movaps xmm3, xmm0
|
|
mulps xmm3, xmm3 // xmm3 = s
|
|
// polynomial in s with coefficients SIMD_SP_atan_c0 .. c7 and a final 1, evaluated in nested form and multiplied by xmm0
movaps xmm5, SIMD_SP_atan_c0
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_atan_c1
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_atan_c2
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_atan_c3
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_atan_c4
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_atan_c5
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_atan_c6
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_atan_c7
|
|
mulps xmm5, xmm3
|
|
addps xmm5, SIMD_SP_one
|
|
mulps xmm5, xmm0
|
|
addps xmm5, xmm4 // xmm5 = omega0
|
|
|
|
movaps xmm6, xmm7 // xmm6 = lerp
|
|
mulps xmm6, xmm5 // xmm6 = omega1
|
|
subps xmm5, xmm6 // xmm5 = omega0 - omega1
|
|
|
|
// scale0 = sin( xmm5 ) * xmm2
|
|
// scale1 = sin( xmm6 ) * xmm2
|
|
// both sines use the same polynomial ( SIMD_SP_sin_c0 .. c4 and a final 1 ) and are evaluated interleaved:
// xmm3 / xmm4 belong to xmm5, xmm7 / xmm0 belong to xmm6 ( the lerp copy in xmm7 is no longer needed )
movaps xmm3, xmm5
|
|
movaps xmm7, xmm6
|
|
mulps xmm3, xmm3
|
|
mulps xmm7, xmm7
|
|
movaps xmm4, SIMD_SP_sin_c0
|
|
movaps xmm0, SIMD_SP_sin_c0
|
|
mulps xmm4, xmm3
|
|
mulps xmm0, xmm7
|
|
addps xmm4, SIMD_SP_sin_c1
|
|
addps xmm0, SIMD_SP_sin_c1
|
|
mulps xmm4, xmm3
|
|
mulps xmm0, xmm7
|
|
addps xmm4, SIMD_SP_sin_c2
|
|
addps xmm0, SIMD_SP_sin_c2
|
|
mulps xmm4, xmm3
|
|
mulps xmm0, xmm7
|
|
addps xmm4, SIMD_SP_sin_c3
|
|
addps xmm0, SIMD_SP_sin_c3
|
|
mulps xmm4, xmm3
|
|
mulps xmm0, xmm7
|
|
addps xmm4, SIMD_SP_sin_c4
|
|
addps xmm0, SIMD_SP_sin_c4
|
|
mulps xmm4, xmm3
|
|
mulps xmm0, xmm7
|
|
addps xmm4, SIMD_SP_one
|
|
addps xmm0, SIMD_SP_one
|
|
mulps xmm5, xmm4
|
|
mulps xmm6, xmm0
|
|
mulps xmm5, xmm2 // xmm5 = scale0
|
|
mulps xmm6, xmm2 // xmm6 = scale1
|
|
|
|
// give scale1 the sign that was stripped from cosom
xorps xmm6, xmm1
|
|
|
|
// jointQuatN = scale0 * jointQuatN + scale1 * blendQuatN for N = 0 .. 3
movaps xmm0, jointQuat0
|
|
mulps xmm0, xmm5
|
|
movaps xmm1, blendQuat0
|
|
mulps xmm1, xmm6
|
|
addps xmm0, xmm1
|
|
movaps jointQuat0, xmm0
|
|
|
|
movaps xmm1, jointQuat1
|
|
mulps xmm1, xmm5
|
|
movaps xmm2, blendQuat1
|
|
mulps xmm2, xmm6
|
|
addps xmm1, xmm2
|
|
movaps jointQuat1, xmm1
|
|
|
|
movaps xmm2, jointQuat2
|
|
mulps xmm2, xmm5
|
|
movaps xmm3, blendQuat2
|
|
mulps xmm3, xmm6
|
|
addps xmm2, xmm3
|
|
movaps jointQuat2, xmm2
|
|
|
|
movaps xmm3, jointQuat3
|
|
mulps xmm3, xmm5
|
|
movaps xmm4, blendQuat3
|
|
mulps xmm4, xmm6
|
|
addps xmm3, xmm4
|
|
movaps jointQuat3, xmm3
|
|
}
|
|
|
|
#else
|
|
|
|
// C++ reference version of the asm block above, one statement per lane
jointVert0[0] += lerp * ( blendVert0[0] - jointVert0[0] );
|
|
jointVert0[1] += lerp * ( blendVert0[1] - jointVert0[1] );
|
|
jointVert0[2] += lerp * ( blendVert0[2] - jointVert0[2] );
|
|
jointVert0[3] += lerp * ( blendVert0[3] - jointVert0[3] );
|
|
|
|
jointVert1[0] += lerp * ( blendVert1[0] - jointVert1[0] );
|
|
jointVert1[1] += lerp * ( blendVert1[1] - jointVert1[1] );
|
|
jointVert1[2] += lerp * ( blendVert1[2] - jointVert1[2] );
|
|
jointVert1[3] += lerp * ( blendVert1[3] - jointVert1[3] );
|
|
|
|
jointVert2[0] += lerp * ( blendVert2[0] - jointVert2[0] );
|
|
jointVert2[1] += lerp * ( blendVert2[1] - jointVert2[1] );
|
|
jointVert2[2] += lerp * ( blendVert2[2] - jointVert2[2] );
|
|
jointVert2[3] += lerp * ( blendVert2[3] - jointVert2[3] );
|
|
|
|
ALIGN16( float cosom[4] );
|
|
ALIGN16( float sinom[4] );
|
|
ALIGN16( float omega0[4] );
|
|
ALIGN16( float omega1[4] );
|
|
ALIGN16( float scale0[4] );
|
|
ALIGN16( float scale1[4] );
|
|
ALIGN16( unsigned int signBit[4] );
|
|
|
|
cosom[0] = jointQuat0[0] * blendQuat0[0];
|
|
cosom[1] = jointQuat0[1] * blendQuat0[1];
|
|
cosom[2] = jointQuat0[2] * blendQuat0[2];
|
|
cosom[3] = jointQuat0[3] * blendQuat0[3];
|
|
|
|
cosom[0] += jointQuat1[0] * blendQuat1[0];
|
|
cosom[1] += jointQuat1[1] * blendQuat1[1];
|
|
cosom[2] += jointQuat1[2] * blendQuat1[2];
|
|
cosom[3] += jointQuat1[3] * blendQuat1[3];
|
|
|
|
cosom[0] += jointQuat2[0] * blendQuat2[0];
|
|
cosom[1] += jointQuat2[1] * blendQuat2[1];
|
|
cosom[2] += jointQuat2[2] * blendQuat2[2];
|
|
cosom[3] += jointQuat2[3] * blendQuat2[3];
|
|
|
|
cosom[0] += jointQuat3[0] * blendQuat3[0];
|
|
cosom[1] += jointQuat3[1] * blendQuat3[1];
|
|
cosom[2] += jointQuat3[2] * blendQuat3[2];
|
|
cosom[3] += jointQuat3[3] * blendQuat3[3];
|
|
|
|
// extract the sign bit of each cosom through its integer bit pattern
signBit[0] = (*(unsigned int *)&cosom[0]) & ( 1 << 31 );
|
|
signBit[1] = (*(unsigned int *)&cosom[1]) & ( 1 << 31 );
|
|
signBit[2] = (*(unsigned int *)&cosom[2]) & ( 1 << 31 );
|
|
signBit[3] = (*(unsigned int *)&cosom[3]) & ( 1 << 31 );
|
|
|
|
// clear the sign bit so cosom is non-negative from here on
(*(unsigned int *)&cosom[0]) ^= signBit[0];
|
|
(*(unsigned int *)&cosom[1]) ^= signBit[1];
|
|
(*(unsigned int *)&cosom[2]) ^= signBit[2];
|
|
(*(unsigned int *)&cosom[3]) ^= signBit[3];
|
|
|
|
scale0[0] = 1.0f - cosom[0] * cosom[0];
|
|
scale0[1] = 1.0f - cosom[1] * cosom[1];
|
|
scale0[2] = 1.0f - cosom[2] * cosom[2];
|
|
scale0[3] = 1.0f - cosom[3] * cosom[3];
|
|
|
|
// keep scale0 strictly positive before taking the reciprocal square root
scale0[0] = ( scale0[0] <= 0.0f ) ? SIMD_SP_tiny[0] : scale0[0];
|
|
scale0[1] = ( scale0[1] <= 0.0f ) ? SIMD_SP_tiny[1] : scale0[1];
|
|
scale0[2] = ( scale0[2] <= 0.0f ) ? SIMD_SP_tiny[2] : scale0[2];
|
|
scale0[3] = ( scale0[3] <= 0.0f ) ? SIMD_SP_tiny[3] : scale0[3];
|
|
|
|
sinom[0] = idMath::RSqrt( scale0[0] );
|
|
sinom[1] = idMath::RSqrt( scale0[1] );
|
|
sinom[2] = idMath::RSqrt( scale0[2] );
|
|
sinom[3] = idMath::RSqrt( scale0[3] );
|
|
|
|
scale0[0] *= sinom[0];
|
|
scale0[1] *= sinom[1];
|
|
scale0[2] *= sinom[2];
|
|
scale0[3] *= sinom[3];
|
|
|
|
omega0[0] = SSE_ATanPositive( scale0[0], cosom[0] );
|
|
omega0[1] = SSE_ATanPositive( scale0[1], cosom[1] );
|
|
omega0[2] = SSE_ATanPositive( scale0[2], cosom[2] );
|
|
omega0[3] = SSE_ATanPositive( scale0[3], cosom[3] );
|
|
|
|
omega1[0] = lerp * omega0[0];
|
|
omega1[1] = lerp * omega0[1];
|
|
omega1[2] = lerp * omega0[2];
|
|
omega1[3] = lerp * omega0[3];
|
|
|
|
omega0[0] -= omega1[0];
|
|
omega0[1] -= omega1[1];
|
|
omega0[2] -= omega1[2];
|
|
omega0[3] -= omega1[3];
|
|
|
|
scale0[0] = SSE_SinZeroHalfPI( omega0[0] ) * sinom[0];
|
|
scale0[1] = SSE_SinZeroHalfPI( omega0[1] ) * sinom[1];
|
|
scale0[2] = SSE_SinZeroHalfPI( omega0[2] ) * sinom[2];
|
|
scale0[3] = SSE_SinZeroHalfPI( omega0[3] ) * sinom[3];
|
|
|
|
scale1[0] = SSE_SinZeroHalfPI( omega1[0] ) * sinom[0];
|
|
scale1[1] = SSE_SinZeroHalfPI( omega1[1] ) * sinom[1];
|
|
scale1[2] = SSE_SinZeroHalfPI( omega1[2] ) * sinom[2];
|
|
scale1[3] = SSE_SinZeroHalfPI( omega1[3] ) * sinom[3];
|
|
|
|
// give scale1 the sign that was stripped from cosom
(*(unsigned int *)&scale1[0]) ^= signBit[0];
|
|
(*(unsigned int *)&scale1[1]) ^= signBit[1];
|
|
(*(unsigned int *)&scale1[2]) ^= signBit[2];
|
|
(*(unsigned int *)&scale1[3]) ^= signBit[3];
|
|
|
|
jointQuat0[0] = scale0[0] * jointQuat0[0] + scale1[0] * blendQuat0[0];
|
|
jointQuat0[1] = scale0[1] * jointQuat0[1] + scale1[1] * blendQuat0[1];
|
|
jointQuat0[2] = scale0[2] * jointQuat0[2] + scale1[2] * blendQuat0[2];
|
|
jointQuat0[3] = scale0[3] * jointQuat0[3] + scale1[3] * blendQuat0[3];
|
|
|
|
jointQuat1[0] = scale0[0] * jointQuat1[0] + scale1[0] * blendQuat1[0];
|
|
jointQuat1[1] = scale0[1] * jointQuat1[1] + scale1[1] * blendQuat1[1];
|
|
jointQuat1[2] = scale0[2] * jointQuat1[2] + scale1[2] * blendQuat1[2];
|
|
jointQuat1[3] = scale0[3] * jointQuat1[3] + scale1[3] * blendQuat1[3];
|
|
|
|
jointQuat2[0] = scale0[0] * jointQuat2[0] + scale1[0] * blendQuat2[0];
|
|
jointQuat2[1] = scale0[1] * jointQuat2[1] + scale1[1] * blendQuat2[1];
|
|
jointQuat2[2] = scale0[2] * jointQuat2[2] + scale1[2] * blendQuat2[2];
|
|
jointQuat2[3] = scale0[3] * jointQuat2[3] + scale1[3] * blendQuat2[3];
|
|
|
|
jointQuat3[0] = scale0[0] * jointQuat3[0] + scale1[0] * blendQuat3[0];
|
|
jointQuat3[1] = scale0[1] * jointQuat3[1] + scale1[1] * blendQuat3[1];
|
|
jointQuat3[2] = scale0[2] * jointQuat3[2] + scale1[2] * blendQuat3[2];
|
|
jointQuat3[3] = scale0[3] * jointQuat3[3] + scale1[3] * blendQuat3[3];
|
|
|
|
#endif
|
|
|
|
// scatter the blended lanes back to the four indexed joints
for ( int j = 0; j < 4; j++ ) {
|
|
int n = index[i+j];
|
|
|
|
joints[n].t[0] = jointVert0[j];
|
|
joints[n].t[1] = jointVert1[j];
|
|
joints[n].t[2] = jointVert2[j];
|
|
|
|
joints[n].q[0] = jointQuat0[j];
|
|
joints[n].q[1] = jointQuat1[j];
|
|
joints[n].q[2] = jointQuat2[j];
|
|
joints[n].q[3] = jointQuat3[j];
|
|
}
|
|
}
|
|
|
|
// blend the joints left over after the groups of four, one at a time
for ( ; i < numJoints; i++ ) {
|
|
int n = index[i];
|
|
|
|
idVec3 &jointVert = joints[n].t;
|
|
const idVec3 &blendVert = blendJoints[n].t;
|
|
|
|
jointVert[0] += lerp * ( blendVert[0] - jointVert[0] );
|
|
jointVert[1] += lerp * ( blendVert[1] - jointVert[1] );
|
|
jointVert[2] += lerp * ( blendVert[2] - jointVert[2] );
|
|
|
|
idQuat &jointQuat = joints[n].q;
|
|
const idQuat &blendQuat = blendJoints[n].q;
|
|
|
|
float cosom;
|
|
float sinom;
|
|
float omega;
|
|
float scale0;
|
|
float scale1;
|
|
unsigned int signBit;
|
|
|
|
cosom = jointQuat.x * blendQuat.x + jointQuat.y * blendQuat.y + jointQuat.z * blendQuat.z + jointQuat.w * blendQuat.w;
|
|
|
|
// remember the sign bit of cosom ...
signBit = (*(unsigned int *)&cosom) & ( 1 << 31 );
|
|
|
|
// ... and clear it so cosom is non-negative
(*(unsigned int *)&cosom) ^= signBit;
|
|
|
|
scale0 = 1.0f - cosom * cosom;
|
|
scale0 = ( scale0 <= 0.0f ) ? SIMD_SP_tiny[0] : scale0;
|
|
sinom = idMath::InvSqrt( scale0 );
|
|
omega = idMath::ATan16( scale0 * sinom, cosom );
|
|
scale0 = idMath::Sin16( ( 1.0f - lerp ) * omega ) * sinom;
|
|
scale1 = idMath::Sin16( lerp * omega ) * sinom;
|
|
|
|
// give scale1 the sign that was stripped from cosom
(*(unsigned int *)&scale1) ^= signBit;
|
|
|
|
jointQuat.x = scale0 * jointQuat.x + scale1 * blendQuat.x;
|
|
jointQuat.y = scale0 * jointQuat.y + scale1 * blendQuat.y;
|
|
jointQuat.z = scale0 * jointQuat.z + scale1 * blendQuat.z;
|
|
jointQuat.w = scale0 * jointQuat.w + scale1 * blendQuat.w;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::ConvertJointQuatsToJointMats
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
    // Expands numJoints quaternion + translation pairs into joint matrices.
    // Each matrix is addressed as rows of four floats ( mat[row*4+column] );
    // column 3 of every row receives one translation component.
    // This routine is plain C++; it contains no SSE instructions.

    // The raw float indexing below depends on these sizes and on the translation 't'
    // being stored immediately behind the quaternion 'q' ( so src[4..6] is 't' ).
    assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
    assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

    for ( int joint = 0; joint < numJoints; joint++ ) {

        const float *src = jointQuats[joint].q.ToFloatPtr();
        float *mat = jointMats[joint].ToFloatPtr();

        const float qx = src[0];
        const float qy = src[1];
        const float qz = src[2];
        const float qw = src[3];

        // translation goes into the fourth column
        mat[0*4+3] = src[4];
        mat[1*4+3] = src[5];
        mat[2*4+3] = src[6];

        // doubled components, shared by all products below
        const float twoX = qx + qx;
        const float twoY = qy + qy;
        const float twoZ = qz + qz;

        // diagonal
        const float xx = qx * twoX;
        const float yy = qy * twoY;
        const float zz = qz * twoZ;

        mat[0*4+0] = 1.0f - yy - zz;
        mat[1*4+1] = 1.0f - xx - zz;
        mat[2*4+2] = 1.0f - xx - yy;

        // y/z pair
        const float yz = qy * twoZ;
        const float wx = qw * twoX;

        mat[2*4+1] = yz - wx;
        mat[1*4+2] = yz + wx;

        // x/y pair
        const float xy = qx * twoY;
        const float wz = qw * twoZ;

        mat[1*4+0] = xy - wz;
        mat[0*4+1] = xy + wz;

        // x/z pair
        const float xz = qx * twoZ;
        const float wy = qw * twoY;

        mat[0*4+2] = xz - wy;
        mat[2*4+0] = xz + wy;
    }
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::ConvertJointMatsToJointQuats
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
    // Converts numJoints joint matrices ( rows of four floats, m[row*4+column] ) into
    // joint quats: four quaternion floats followed by the three translation floats.
    //
    // For every matrix one of four cases is selected from the diagonal:
    //   case 0: m00 + m11 + m22 >= 0
    //   case 1: otherwise, m00 >= m11 and m00 >= m22
    //   case 2: otherwise, m11 >= m22
    //   case 3: everything else
    // The case decides the destination slots k0..k3 inside the quaternion and the
    // signs s0, s1, s2 ( see the reference implementation in the '#elif 0' branch ).
    // The SSE code first handles blocks of four joints in parallel and then the
    // remaining ( numJoints & 3 ) joints one at a time.

    // the float offsets used below rely on these sizes and on 't' directly following 'q'
    assert( sizeof( idJointQuat ) == JOINTQUAT_SIZE );
    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );
    assert( (int)(&((idJointQuat *)0)->t) == (int)(&((idJointQuat *)0)->q) + (int)sizeof( ((idJointQuat *)0)->q ) );

#if 1

    // per joint four bytes k0, k1, k2, k3: the quaternion slot each result is stored to
    ALIGN16( byte shuffle[16] );

    __asm {
        mov         eax, numJoints
        mov         esi, jointMats
        mov         edi, jointQuats
        and         eax, ~3                                 // number of joints handled four at a time
        jz          quadsDone
        imul        eax, JOINTMAT_SIZE
        add         esi, eax                                // esi = end of the four-at-a-time range
        neg         eax                                     // eax = negative byte offset, counts up to zero

    convertQuad:
        // Gather the diagonals of four matrices: after this sequence lane n of
        // xmm5 / xmm6 / xmm7 holds m00 / m11 / m22 of joint n.
        movss       xmm5, [esi+eax+3*JOINTMAT_SIZE+0*16+0*4]
        movss       xmm6, [esi+eax+3*JOINTMAT_SIZE+1*16+1*4]
        movss       xmm7, [esi+eax+3*JOINTMAT_SIZE+2*16+2*4]

        shufps      xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps      xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps      xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

        movss       xmm0, [esi+eax+2*JOINTMAT_SIZE+0*16+0*4]
        movss       xmm1, [esi+eax+2*JOINTMAT_SIZE+1*16+1*4]
        movss       xmm2, [esi+eax+2*JOINTMAT_SIZE+2*16+2*4]

        movss       xmm5, xmm0
        movss       xmm6, xmm1
        movss       xmm7, xmm2

        shufps      xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps      xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps      xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

        movss       xmm0, [esi+eax+1*JOINTMAT_SIZE+0*16+0*4]
        movss       xmm1, [esi+eax+1*JOINTMAT_SIZE+1*16+1*4]
        movss       xmm2, [esi+eax+1*JOINTMAT_SIZE+2*16+2*4]

        movss       xmm5, xmm0
        movss       xmm6, xmm1
        movss       xmm7, xmm2

        shufps      xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps      xmm6, xmm6, R_SHUFFLEPS( 3, 0, 1, 2 )
        shufps      xmm7, xmm7, R_SHUFFLEPS( 3, 0, 1, 2 )

        movss       xmm0, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
        movss       xmm1, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
        movss       xmm2, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

        movss       xmm5, xmm0
        movss       xmm6, xmm1
        movss       xmm7, xmm2

        // ---- case selection ( all four joints at once ) ----

        movaps      xmm0, xmm5
        addps       xmm0, xmm6
        addps       xmm0, xmm7
        cmpnltps    xmm0, SIMD_SP_zero                      // xmm0 = mask: m00 + m11 + m22 >= 0.0f ( case 0 )

        movaps      xmm1, xmm5
        movaps      xmm2, xmm5
        cmpnltps    xmm1, xmm6
        cmpnltps    xmm2, xmm7
        andps       xmm2, xmm1                              // xmm2 = mask: m00 >= m11 && m00 >= m22

        movaps      xmm4, xmm6
        cmpnltps    xmm4, xmm7                              // xmm4 = mask: m11 >= m22

        movaps      xmm1, xmm0
        andnps      xmm1, xmm2                              // xmm1 = case 1 ( not case 0, m00 largest )
        orps        xmm2, xmm0                              // xmm2 = case 0 or case 1
        movaps      xmm3, xmm2
        andnps      xmm2, xmm4                              // xmm2 = case 2
        orps        xmm3, xmm2                              // xmm3 = case 0, 1 or 2
        xorps       xmm3, SIMD_SP_not                       // xmm3 = case 3

        // pick the destination slot bytes that belong to the selected case
        andps       xmm0, SIMD_DW_mat2quatShuffle0
        movaps      xmm4, xmm1
        andps       xmm4, SIMD_DW_mat2quatShuffle1
        orps        xmm0, xmm4
        movaps      xmm4, xmm2
        andps       xmm4, SIMD_DW_mat2quatShuffle2
        orps        xmm0, xmm4
        movaps      xmm4, xmm3
        andps       xmm4, SIMD_DW_mat2quatShuffle3
        orps        xmm4, xmm0

        movaps      shuffle, xmm4                           // four bytes k0..k3 for each of the four joints

        // sign masks: a set sign bit means the corresponding s value is -1
        movaps      xmm0, xmm2
        orps        xmm0, xmm3                              // xmm0 = case 2 | case 3 -> s0
        orps        xmm2, xmm1                              // xmm2 = case 1 | case 2 -> s2
        orps        xmm1, xmm3                              // xmm1 = case 1 | case 3 -> s1

        andps       xmm0, SIMD_SP_signBitMask
        andps       xmm1, SIMD_SP_signBitMask
        andps       xmm2, SIMD_SP_signBitMask

        xorps       xmm5, xmm0                              // s0 * m00
        xorps       xmm6, xmm1                              // s1 * m11
        xorps       xmm7, xmm2                              // s2 * m22
        addps       xmm5, xmm6
        addps       xmm7, SIMD_SP_one
        addps       xmm5, xmm7                              // xmm5 = t

        // s from a refined reciprocal square root estimate of t; the reference code
        // computes s = InvSqrt( t ) * 0.5f ( the two constants are defined outside this function )
        movaps      xmm7, xmm5                              // xmm7 = t
        rsqrtps     xmm6, xmm5
        mulps       xmm5, xmm6
        mulps       xmm5, xmm6
        subps       xmm5, SIMD_SP_rsqrt_c0
        mulps       xmm6, SIMD_SP_mat2quat_rsqrt_c1
        mulps       xmm6, xmm5                              // xmm6 = s

        mulps       xmm7, xmm6                              // xmm7 = s * t
        xorps       xmm6, SIMD_SP_signBitMask               // xmm6 = -s

        // ---- store; lane 0 of xmm0, xmm1, xmm2, xmm6, xmm7 belongs to the joint being written ----

        add         edi, 4*JOINTQUAT_SIZE

        // joint 0 of this block
        movzx       ecx, byte ptr shuffle[0*4+0]            // ecx = k0
        movss       [edi+ecx*4-4*JOINTQUAT_SIZE], xmm7      // q[k0] = s * t;

        movzx       edx, byte ptr shuffle[0*4+1]            // edx = k1
        movss       xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
        xorps       xmm4, xmm2
        subss       xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-4*JOINTQUAT_SIZE], xmm4      // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx       ecx, byte ptr shuffle[0*4+2]            // ecx = k2
        movss       xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
        xorps       xmm3, xmm1
        subss       xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
        mulss       xmm3, xmm6
        movss       [edi+ecx*4-4*JOINTQUAT_SIZE], xmm3      // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx       edx, byte ptr shuffle[0*4+3]            // edx = k3
        movss       xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
        xorps       xmm4, xmm0
        subss       xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-4*JOINTQUAT_SIZE], xmm4      // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov         ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
        mov         [edi-4*JOINTQUAT_SIZE+16], ecx          // q[4] = m[0 * 4 + 3];
        mov         edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
        mov         [edi-4*JOINTQUAT_SIZE+20], edx          // q[5] = m[1 * 4 + 3];
        mov         ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
        mov         [edi-4*JOINTQUAT_SIZE+24], ecx          // q[6] = m[2 * 4 + 3];

        // rotate the next joint into lane 0
        shufps      xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

        // joint 1 of this block
        movzx       ecx, byte ptr shuffle[1*4+0]            // ecx = k0
        movss       [edi+ecx*4-3*JOINTQUAT_SIZE], xmm7      // q[k0] = s * t;

        movzx       edx, byte ptr shuffle[1*4+1]            // edx = k1
        movss       xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+0*4]
        xorps       xmm4, xmm2
        subss       xmm4, [esi+eax+1*JOINTMAT_SIZE+0*16+1*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-3*JOINTQUAT_SIZE], xmm4      // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx       ecx, byte ptr shuffle[1*4+2]            // ecx = k2
        movss       xmm3, [esi+eax+1*JOINTMAT_SIZE+0*16+2*4]
        xorps       xmm3, xmm1
        subss       xmm3, [esi+eax+1*JOINTMAT_SIZE+2*16+0*4]
        mulss       xmm3, xmm6
        movss       [edi+ecx*4-3*JOINTQUAT_SIZE], xmm3      // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx       edx, byte ptr shuffle[1*4+3]            // edx = k3
        movss       xmm4, [esi+eax+1*JOINTMAT_SIZE+2*16+1*4]
        xorps       xmm4, xmm0
        subss       xmm4, [esi+eax+1*JOINTMAT_SIZE+1*16+2*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-3*JOINTQUAT_SIZE], xmm4      // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov         ecx, [esi+eax+1*JOINTMAT_SIZE+0*16+3*4]
        mov         [edi-3*JOINTQUAT_SIZE+16], ecx          // q[4] = m[0 * 4 + 3];
        mov         edx, [esi+eax+1*JOINTMAT_SIZE+1*16+3*4]
        mov         [edi-3*JOINTQUAT_SIZE+20], edx          // q[5] = m[1 * 4 + 3];
        mov         ecx, [esi+eax+1*JOINTMAT_SIZE+2*16+3*4]
        mov         [edi-3*JOINTQUAT_SIZE+24], ecx          // q[6] = m[2 * 4 + 3];

        shufps      xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

        // joint 2 of this block
        movzx       ecx, byte ptr shuffle[2*4+0]            // ecx = k0
        movss       [edi+ecx*4-2*JOINTQUAT_SIZE], xmm7      // q[k0] = s * t;

        movzx       edx, byte ptr shuffle[2*4+1]            // edx = k1
        movss       xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+0*4]
        xorps       xmm4, xmm2
        subss       xmm4, [esi+eax+2*JOINTMAT_SIZE+0*16+1*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-2*JOINTQUAT_SIZE], xmm4      // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx       ecx, byte ptr shuffle[2*4+2]            // ecx = k2
        movss       xmm3, [esi+eax+2*JOINTMAT_SIZE+0*16+2*4]
        xorps       xmm3, xmm1
        subss       xmm3, [esi+eax+2*JOINTMAT_SIZE+2*16+0*4]
        mulss       xmm3, xmm6
        movss       [edi+ecx*4-2*JOINTQUAT_SIZE], xmm3      // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx       edx, byte ptr shuffle[2*4+3]            // edx = k3
        movss       xmm4, [esi+eax+2*JOINTMAT_SIZE+2*16+1*4]
        xorps       xmm4, xmm0
        subss       xmm4, [esi+eax+2*JOINTMAT_SIZE+1*16+2*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-2*JOINTQUAT_SIZE], xmm4      // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov         ecx, [esi+eax+2*JOINTMAT_SIZE+0*16+3*4]
        mov         [edi-2*JOINTQUAT_SIZE+16], ecx          // q[4] = m[0 * 4 + 3];
        mov         edx, [esi+eax+2*JOINTMAT_SIZE+1*16+3*4]
        mov         [edi-2*JOINTQUAT_SIZE+20], edx          // q[5] = m[1 * 4 + 3];
        mov         ecx, [esi+eax+2*JOINTMAT_SIZE+2*16+3*4]
        mov         [edi-2*JOINTQUAT_SIZE+24], ecx          // q[6] = m[2 * 4 + 3];

        shufps      xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
        shufps      xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

        // joint 3 of this block
        movzx       ecx, byte ptr shuffle[3*4+0]            // ecx = k0
        movss       [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7      // q[k0] = s * t;

        movzx       edx, byte ptr shuffle[3*4+1]            // edx = k1
        movss       xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+0*4]
        xorps       xmm4, xmm2
        subss       xmm4, [esi+eax+3*JOINTMAT_SIZE+0*16+1*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-1*JOINTQUAT_SIZE], xmm4      // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx       ecx, byte ptr shuffle[3*4+2]            // ecx = k2
        movss       xmm3, [esi+eax+3*JOINTMAT_SIZE+0*16+2*4]
        xorps       xmm3, xmm1
        subss       xmm3, [esi+eax+3*JOINTMAT_SIZE+2*16+0*4]
        mulss       xmm3, xmm6
        movss       [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3      // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx       edx, byte ptr shuffle[3*4+3]            // edx = k3
        movss       xmm4, [esi+eax+3*JOINTMAT_SIZE+2*16+1*4]
        xorps       xmm4, xmm0
        subss       xmm4, [esi+eax+3*JOINTMAT_SIZE+1*16+2*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-1*JOINTQUAT_SIZE], xmm4      // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov         ecx, [esi+eax+3*JOINTMAT_SIZE+0*16+3*4]
        mov         [edi-1*JOINTQUAT_SIZE+16], ecx          // q[4] = m[0 * 4 + 3];
        mov         edx, [esi+eax+3*JOINTMAT_SIZE+1*16+3*4]
        mov         [edi-1*JOINTQUAT_SIZE+20], edx          // q[5] = m[1 * 4 + 3];
        mov         ecx, [esi+eax+3*JOINTMAT_SIZE+2*16+3*4]
        mov         [edi-1*JOINTQUAT_SIZE+24], ecx          // q[6] = m[2 * 4 + 3];

        add         eax, 4*JOINTMAT_SIZE
        jl          convertQuad

    quadsDone:
        // remaining ( numJoints & 3 ) joints, same algorithm with scalar instructions;
        // esi and edi already point behind the joints converted above
        mov         eax, numJoints
        and         eax, 3
        jz          allDone
        imul        eax, JOINTMAT_SIZE
        add         esi, eax
        neg         eax

    convertSingle:
        movss       xmm5, [esi+eax+0*JOINTMAT_SIZE+0*16+0*4]
        movss       xmm6, [esi+eax+0*JOINTMAT_SIZE+1*16+1*4]
        movss       xmm7, [esi+eax+0*JOINTMAT_SIZE+2*16+2*4]

        // ---- case selection ( only lane 0 is meaningful ) ----

        movaps      xmm0, xmm5
        addss       xmm0, xmm6
        addss       xmm0, xmm7
        cmpnltss    xmm0, SIMD_SP_zero                      // xmm0 = mask: m00 + m11 + m22 >= 0.0f ( case 0 )

        movaps      xmm1, xmm5
        movaps      xmm2, xmm5
        cmpnltss    xmm1, xmm6
        cmpnltss    xmm2, xmm7
        andps       xmm2, xmm1                              // xmm2 = mask: m00 >= m11 && m00 >= m22

        movaps      xmm4, xmm6
        cmpnltss    xmm4, xmm7                              // xmm4 = mask: m11 >= m22

        movaps      xmm1, xmm0
        andnps      xmm1, xmm2                              // xmm1 = case 1
        orps        xmm2, xmm0                              // xmm2 = case 0 or case 1
        movaps      xmm3, xmm2
        andnps      xmm2, xmm4                              // xmm2 = case 2
        orps        xmm3, xmm2                              // xmm3 = case 0, 1 or 2
        xorps       xmm3, SIMD_SP_not                       // xmm3 = case 3

        andps       xmm0, SIMD_DW_mat2quatShuffle0
        movaps      xmm4, xmm1
        andps       xmm4, SIMD_DW_mat2quatShuffle1
        orps        xmm0, xmm4
        movaps      xmm4, xmm2
        andps       xmm4, SIMD_DW_mat2quatShuffle2
        orps        xmm0, xmm4
        movaps      xmm4, xmm3
        andps       xmm4, SIMD_DW_mat2quatShuffle3
        orps        xmm4, xmm0

        movss       shuffle, xmm4                           // bytes k0..k3 of this joint

        movaps      xmm0, xmm2
        orps        xmm0, xmm3                              // xmm0 = case 2 | case 3 -> s0
        orps        xmm2, xmm1                              // xmm2 = case 1 | case 2 -> s2
        orps        xmm1, xmm3                              // xmm1 = case 1 | case 3 -> s1

        andps       xmm0, SIMD_SP_signBitMask
        andps       xmm1, SIMD_SP_signBitMask
        andps       xmm2, SIMD_SP_signBitMask

        xorps       xmm5, xmm0                              // s0 * m00
        xorps       xmm6, xmm1                              // s1 * m11
        xorps       xmm7, xmm2                              // s2 * m22
        addss       xmm5, xmm6
        addss       xmm7, SIMD_SP_one
        addss       xmm5, xmm7                              // xmm5 = t

        movss       xmm7, xmm5                              // xmm7 = t
        rsqrtss     xmm6, xmm5
        mulss       xmm5, xmm6
        mulss       xmm5, xmm6
        subss       xmm5, SIMD_SP_rsqrt_c0
        mulss       xmm6, SIMD_SP_mat2quat_rsqrt_c1
        mulss       xmm6, xmm5                              // xmm6 = s

        mulss       xmm7, xmm6                              // xmm7 = s * t
        xorps       xmm6, SIMD_SP_signBitMask               // xmm6 = -s

        // ---- store ----

        movzx       ecx, byte ptr shuffle[0]                // ecx = k0
        add         edi, JOINTQUAT_SIZE
        movss       [edi+ecx*4-1*JOINTQUAT_SIZE], xmm7      // q[k0] = s * t;

        movzx       edx, byte ptr shuffle[1]                // edx = k1
        movss       xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+0*4]
        xorps       xmm4, xmm2
        subss       xmm4, [esi+eax+0*JOINTMAT_SIZE+0*16+1*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-1*JOINTQUAT_SIZE], xmm4      // q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;

        movzx       ecx, byte ptr shuffle[2]                // ecx = k2
        movss       xmm3, [esi+eax+0*JOINTMAT_SIZE+0*16+2*4]
        xorps       xmm3, xmm1
        subss       xmm3, [esi+eax+0*JOINTMAT_SIZE+2*16+0*4]
        mulss       xmm3, xmm6
        movss       [edi+ecx*4-1*JOINTQUAT_SIZE], xmm3      // q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;

        movzx       edx, byte ptr shuffle[3]                // edx = k3
        movss       xmm4, [esi+eax+0*JOINTMAT_SIZE+2*16+1*4]
        xorps       xmm4, xmm0
        subss       xmm4, [esi+eax+0*JOINTMAT_SIZE+1*16+2*4]
        mulss       xmm4, xmm6
        movss       [edi+edx*4-1*JOINTQUAT_SIZE], xmm4      // q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        mov         ecx, [esi+eax+0*JOINTMAT_SIZE+0*16+3*4]
        mov         [edi-1*JOINTQUAT_SIZE+16], ecx          // q[4] = m[0 * 4 + 3];
        mov         edx, [esi+eax+0*JOINTMAT_SIZE+1*16+3*4]
        mov         [edi-1*JOINTQUAT_SIZE+20], edx          // q[5] = m[1 * 4 + 3];
        mov         ecx, [esi+eax+0*JOINTMAT_SIZE+2*16+3*4]
        mov         [edi-1*JOINTQUAT_SIZE+24], ecx          // q[6] = m[2 * 4 + 3];

        add         eax, JOINTMAT_SIZE
        jl          convertSingle

    allDone:
    }

#elif 0

    // reference implementation with explicit slot indices and signs ( disabled )
    for ( int i = 0; i < numJoints; i++ ) {
        float s0, s1, s2;
        int k0, k1, k2, k3;

        float *q = jointQuats[i].q.ToFloatPtr();
        const float *m = jointMats[i].ToFloatPtr();

        if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {

            k0 = 3;
            k1 = 2;
            k2 = 1;
            k3 = 0;
            s0 = 1.0f;
            s1 = 1.0f;
            s2 = 1.0f;

        } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {

            k0 = 0;
            k1 = 1;
            k2 = 2;
            k3 = 3;
            s0 = 1.0f;
            s1 = -1.0f;
            s2 = -1.0f;

        } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {

            k0 = 1;
            k1 = 0;
            k2 = 3;
            k3 = 2;
            s0 = -1.0f;
            s1 = 1.0f;
            s2 = -1.0f;

        } else {

            k0 = 2;
            k1 = 3;
            k2 = 0;
            k3 = 1;
            s0 = -1.0f;
            s1 = -1.0f;
            s2 = 1.0f;

        }

        float t = s0 * m[0 * 4 + 0] + s1 * m[1 * 4 + 1] + s2 * m[2 * 4 + 2] + 1.0f;
        float s = idMath::InvSqrt( t ) * 0.5f;

        q[k0] = s * t;
        q[k1] = ( m[0 * 4 + 1] - s2 * m[1 * 4 + 0] ) * s;
        q[k2] = ( m[2 * 4 + 0] - s1 * m[0 * 4 + 2] ) * s;
        q[k3] = ( m[1 * 4 + 2] - s0 * m[2 * 4 + 1] ) * s;

        q[4] = m[0 * 4 + 3];
        q[5] = m[1 * 4 + 3];
        q[6] = m[2 * 4 + 3];
    }

#elif 1

    // reference implementation with the four cases written out ( not compiled while the first branch is enabled )
    for ( int i = 0; i < numJoints; i++ ) {

        float *q = jointQuats[i].q.ToFloatPtr();
        const float *m = jointMats[i].ToFloatPtr();

        if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) {

            float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[3] = s * t;
            q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
            q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
            q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

        } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) {

            float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[0] = s * t;
            q[1] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
            q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
            q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;

        } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) {

            float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[1] = s * t;
            q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s;
            q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
            q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;

        } else {

            float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f;
            float s = idMath::InvSqrt( t ) * 0.5f;

            q[2] = s * t;
            q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
            q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s;
            q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s;

        }

        q[4] = m[0 * 4 + 3];
        q[5] = m[1 * 4 + 3];
        q[6] = m[2 * 4 + 3];
    }

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::TransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1

    // For i = firstJoint .. lastJoint ( inclusive, ascending ) combines jointMats[i] with
    // jointMats[parents[i]] in place; the reference form is 'jointMats[i] *= jointMats[parents[i]]'.
    // Nothing is done when lastJoint < firstJoint.
    // The matrix rows are read and written with movaps, so jointMats has to be 16 byte aligned.

    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

    __asm {

        mov         ecx, firstJoint
        mov         eax, lastJoint
        sub         eax, ecx                                // eax = lastJoint - firstJoint
        jl          jointsDone
        imul        ecx, 4
        mov         edi, parents
        add         edi, ecx                                // edi = &parents[firstJoint]
        imul        ecx, 12                                 // ecx = firstJoint * 48, byte offset of the first matrix
        mov         esi, jointMats
        imul        eax, 4
        add         edi, eax                                // edi = &parents[lastJoint]
        neg         eax                                     // eax = negative byte offset into parents, counts up to zero

    nextJoint:

        movaps      xmm0, [esi+ecx+ 0]                      // xmm0 = m0, m1, m2, t0 ( child row 0 )
        mov         edx, [edi+eax]                          // edx = parent index
        movaps      xmm1, [esi+ecx+16]                      // xmm1 = m3, m4, m5, t1 ( child row 1 )
        imul        edx, JOINTMAT_SIZE                      // edx = byte offset of the parent matrix
        movaps      xmm2, [esi+ecx+32]                      // xmm2 = m6, m7, m8, t2 ( child row 2 )

        // new row 0 = p0 * row0 + p1 * row1 + p2 * row2 + ( 0, 0, 0, p3 ) with p = parent row 0
        movss       xmm4, [esi+edx+ 0]
        shufps      xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm4, xmm0

        movss       xmm5, [esi+edx+ 4]
        shufps      xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm5, xmm1
        addps       xmm4, xmm5
        movss       xmm6, [esi+edx+ 8]
        shufps      xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm6, xmm2
        addps       xmm4, xmm6

        movss       xmm5, [esi+edx+16]                      // already start on row 1
        shufps      xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm5, xmm0

        movss       xmm7, [esi+edx+12]                      // parent translation, moved into the last lane
        shufps      xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        addps       xmm4, xmm7

        movaps      [esi+ecx+ 0], xmm4

        // new row 1, built from parent row 1
        movss       xmm6, [esi+edx+20]
        shufps      xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm6, xmm1
        addps       xmm5, xmm6
        movss       xmm7, [esi+edx+24]
        shufps      xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm7, xmm2
        addps       xmm5, xmm7

        movss       xmm6, [esi+edx+32]                      // already start on row 2
        shufps      xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm6, xmm0

        movss       xmm3, [esi+edx+28]
        shufps      xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
        addps       xmm5, xmm3

        movaps      [esi+ecx+16], xmm5

        // new row 2, built from parent row 2
        movss       xmm7, [esi+edx+36]
        shufps      xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm7, xmm1
        addps       xmm6, xmm7
        movss       xmm3, [esi+edx+40]
        shufps      xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm3, xmm2
        addps       xmm6, xmm3

        movss       xmm7, [esi+edx+44]
        shufps      xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        addps       xmm6, xmm7

        movaps      [esi+ecx+32], xmm6

        add         ecx, JOINTMAT_SIZE
        add         eax, 4
        jle         nextJoint                               // offset zero is lastJoint, which is still processed
    jointsDone:
    }

#else

    // reference implementation
    int i;

    for( i = firstJoint; i <= lastJoint; i++ ) {
        assert( parents[i] < i );
        jointMats[i] *= jointMats[parents[i]];
    }

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::UntransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
#if 1

    // For i = lastJoint down to firstJoint ( inclusive ) removes the parent matrix from
    // jointMats[i] in place; the reference form is 'jointMats[i] /= jointMats[parents[i]]'.
    // Nothing is done when lastJoint < firstJoint.
    // The matrix rows are read and written with movaps, so jointMats has to be 16 byte aligned.

    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

    __asm {

        mov         edx, firstJoint
        mov         eax, lastJoint
        mov         ecx, eax
        sub         eax, edx                                // eax = lastJoint - firstJoint
        jl          jointsDone
        mov         esi, jointMats
        imul        ecx, JOINTMAT_SIZE                      // ecx = byte offset of the last matrix
        imul        edx, 4
        mov         edi, parents
        add         edi, edx                                // edi = &parents[firstJoint]
        imul        eax, 4                                  // eax = byte offset into parents, counts down to zero

    nextJoint:

        movaps      xmm0, [esi+ecx+ 0]                      // xmm0 = m0, m1, m2, t0 ( child row 0 )
        mov         edx, [edi+eax]                          // edx = parent index
        movaps      xmm1, [esi+ecx+16]                      // xmm1 = m3, m4, m5, t1 ( child row 1 )
        imul        edx, JOINTMAT_SIZE                      // edx = byte offset of the parent matrix
        movaps      xmm2, [esi+ecx+32]                      // xmm2 = m6, m7, m8, t2 ( child row 2 )

        // subtract the parent translation from the last lane of each child row
        movss       xmm6, [esi+edx+12]
        shufps      xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
        subps       xmm0, xmm6
        movss       xmm7, [esi+edx+28]
        shufps      xmm7, xmm7, R_SHUFFLEPS( 1, 2, 3, 0 )
        subps       xmm1, xmm7
        movss       xmm3, [esi+edx+44]
        shufps      xmm3, xmm3, R_SHUFFLEPS( 1, 2, 3, 0 )
        subps       xmm2, xmm3

        // new row 0: weights are the first column of the parent ( i.e. the transposed 3x3 part )
        movss       xmm4, [esi+edx+ 0]
        shufps      xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm4, xmm0
        movss       xmm5, [esi+edx+16]
        shufps      xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm5, xmm1
        addps       xmm4, xmm5
        movss       xmm6, [esi+edx+32]
        shufps      xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm6, xmm2
        addps       xmm4, xmm6

        movaps      [esi+ecx+ 0], xmm4

        // new row 1: weights are the second column of the parent
        movss       xmm5, [esi+edx+ 4]
        shufps      xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm5, xmm0
        movss       xmm6, [esi+edx+20]
        shufps      xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm6, xmm1
        addps       xmm5, xmm6
        movss       xmm7, [esi+edx+36]
        shufps      xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm7, xmm2
        addps       xmm5, xmm7

        movaps      [esi+ecx+16], xmm5

        // new row 2: weights are the third column of the parent
        movss       xmm6, [esi+edx+ 8]
        shufps      xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm6, xmm0
        movss       xmm7, [esi+edx+24]
        shufps      xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm7, xmm1
        addps       xmm6, xmm7
        movss       xmm3, [esi+edx+40]
        shufps      xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
        mulps       xmm3, xmm2
        addps       xmm6, xmm3

        movaps      [esi+ecx+32], xmm6

        sub         ecx, JOINTMAT_SIZE
        sub         eax, 4
        jge         nextJoint                               // offset zero is firstJoint, which is still processed
    jointsDone:
    }

#else

    // reference implementation
    int i;

    for( i = lastJoint; i >= firstJoint; i-- ) {
        assert( parents[i] < i );
        jointMats[i] /= jointMats[parents[i]];
    }

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::TransformVerts
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, const int numWeights ) {
#if 1

    // Computes verts[i].xyz for numVerts vertices as the sum of ( joint matrix * weight vector )
    // over the weights that belong to the vertex.
    //
    // 'index' holds two ints per weight: the byte offset of the joint matrix inside 'joints'
    // and a flag that is non-zero on the last weight of a vertex. 'weights' holds one idVec4
    // per weight and is read with movaps, as are the joint matrix rows, so both have to be
    // 16 byte aligned. numWeights is not used by either implementation.
    //
    // Nothing is done for numVerts <= 0, which matches the reference loop below.

    assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
    assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
    assert( sizeof( idVec4 ) == JOINTWEIGHT_SIZE );
    assert( sizeof( idJointMat ) == JOINTMAT_SIZE );

    __asm
    {
        mov         eax, numVerts
        test        eax, eax
        jle         done                                    // zero or negative count: nothing to transform
        imul        eax, DRAWVERT_SIZE

        mov         ecx, verts
        mov         edx, index
        mov         esi, weights
        mov         edi, joints

        add         ecx, eax                                // ecx = end of the vertex array
        neg         eax                                     // eax = negative byte offset, counts up to zero

    loopVert:
        // first weight of the vertex
        mov         ebx, [edx]                              // ebx = byte offset of the joint matrix
        movaps      xmm2, [esi]                             // weight vector
        add         edx, 8
        movaps      xmm0, xmm2
        add         esi, JOINTWEIGHT_SIZE
        movaps      xmm1, xmm2

        mulps       xmm0, [edi+ebx+ 0]                      // xmm0 = weight * ( m0, m1, m2, t0 )
        mulps       xmm1, [edi+ebx+16]                      // xmm1 = weight * ( m3, m4, m5, t1 )
        mulps       xmm2, [edi+ebx+32]                      // xmm2 = weight * ( m6, m7, m8, t2 )

        cmp         dword ptr [edx-4], 0                    // non-zero flag: this was the only weight

        jne         doneWeight

    loopWeight:
        // accumulate the remaining weights of the vertex
        mov         ebx, [edx]
        movaps      xmm5, [esi]
        add         edx, 8
        movaps      xmm3, xmm5
        add         esi, JOINTWEIGHT_SIZE
        movaps      xmm4, xmm5

        mulps       xmm3, [edi+ebx+ 0]                      // xmm3 = weight * ( m0, m1, m2, t0 )
        mulps       xmm4, [edi+ebx+16]                      // xmm4 = weight * ( m3, m4, m5, t1 )
        mulps       xmm5, [edi+ebx+32]                      // xmm5 = weight * ( m6, m7, m8, t2 )

        cmp         dword ptr [edx-4], 0                    // the SSE adds below leave these flags untouched

        addps       xmm0, xmm3
        addps       xmm1, xmm4
        addps       xmm2, xmm5

        je          loopWeight

    doneWeight:
        add         eax, DRAWVERT_SIZE                      // sets the flags tested by 'jl loopVert' at the bottom

        // horizontal sums of the three accumulated rows give x, y and z
        movaps      xmm6, xmm0                              // xmm6 = m0, m1, m2, t0
        unpcklps    xmm6, xmm1                              // xmm6 = m0, m3, m1, m4
        unpckhps    xmm0, xmm1                              // xmm0 = m2, m5, t0, t1
        addps       xmm6, xmm0                              // xmm6 = m0+m2, m3+m5, m1+t0, m4+t1

        movaps      xmm7, xmm2                              // xmm7 = m6, m7, m8, t2
        movlhps     xmm2, xmm6                              // xmm2 = m6, m7, m0+m2, m3+m5
        movhlps     xmm6, xmm7                              // xmm6 = m8, t2, m1+t0, m4+t1
        addps       xmm6, xmm2                              // xmm6 = m6+m8, m7+t2, m0+m1+m2+t0, m3+m4+m5+t1

        movhps      [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm6     // xyz[0], xyz[1]

        movaps      xmm5, xmm6                              // xmm5 = m6+m8, m7+t2
        shufps      xmm5, xmm5, R_SHUFFLEPS( 1, 0, 2, 3 )   // xmm5 = m7+t2, m6+m8
        addss       xmm5, xmm6                              // xmm5 = m6+m8+m7+t2

        movss       [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm5     // xyz[2]

        jl          loopVert
    done:
    }

#else

    // reference implementation
    int i, j;
    const byte *jointsPtr = (byte *)joints;

    for( j = i = 0; i < numVerts; i++ ) {
        idVec3 v;

        v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
        while( index[j*2+1] == 0 ) {
            j++;
            v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
        }
        j++;

        verts[i].xyz = v;
    }

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::TracePointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
#if 1

    // Classifies numVerts vertex positions against four planes. With d being the plane
    // distance of a vertex, the byte written to cullBits[i] has
    //   bit n     ( n = 0..3 ) set when -d < radius for plane n
    //   bit n + 4 ( n = 0..3 ) set when  d < radius for plane n
    // totalOr receives the bitwise OR of all bytes written, and 0 when numVerts <= 0.

    assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
    assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

    __asm {
        push        ebx
        xor         edx, edx                                // dl accumulates totalOr; cleared before the early out so an
                                                            // empty vertex list stores 0 instead of a stale register value
        mov         eax, numVerts
        test        eax, eax
        jle         done                                    // zero or negative count: only totalOr is written

        // load the four planes and transpose them: one register per plane component
        mov         edi, planes
        movlps      xmm1, [edi]                             // xmm1 =  0,  1,  X,  X
        movhps      xmm1, [edi+16]                          // xmm1 =  0,  1,  4,  5
        movlps      xmm3, [edi+8]                           // xmm3 =  2,  3,  X,  X
        movhps      xmm3, [edi+24]                          // xmm3 =  2,  3,  6,  7
        movlps      xmm4, [edi+32]                          // xmm4 =  8,  9,  X,  X
        movhps      xmm4, [edi+48]                          // xmm4 =  8,  9, 12, 13
        movlps      xmm5, [edi+40]                          // xmm5 = 10, 11,  X,  X
        movhps      xmm5, [edi+56]                          // xmm5 = 10, 11, 14, 15
        movaps      xmm0, xmm1                              // xmm0 =  0,  1,  4,  5
        shufps      xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )   // xmm0 =  0,  4,  8, 12
        shufps      xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 )   // xmm1 =  1,  5,  9, 13
        movaps      xmm2, xmm3                              // xmm2 =  2,  3,  6,  7
        shufps      xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )   // xmm2 =  2,  6, 10, 14
        shufps      xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 )   // xmm3 =  3,  7, 11, 15
        movss       xmm7, radius
        shufps      xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )   // xmm7 = radius in all four lanes

        mov         esi, verts
        mov         edi, cullBits
        imul        eax, DRAWVERT_SIZE
        add         esi, eax                                // esi = end of the vertex array
        neg         eax                                     // eax = negative byte offset, counts up to zero

    loopVert:
        // xmm4 = distance of the vertex to each of the four planes
        movss       xmm4, [esi+eax+DRAWVERT_XYZ_OFFSET+0]
        shufps      xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
        movss       xmm5, [esi+eax+DRAWVERT_XYZ_OFFSET+4]
        mulps       xmm4, xmm0
        shufps      xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
        movss       xmm6, [esi+eax+DRAWVERT_XYZ_OFFSET+8]
        mulps       xmm5, xmm1
        shufps      xmm6, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
        addps       xmm4, xmm5
        mulps       xmm6, xmm2
        addps       xmm4, xmm3
        addps       xmm4, xmm6
        movaps      xmm5, xmm4
        xorps       xmm5, SIMD_SP_signBitMask               // xmm5 = -d
        cmpltps     xmm4, xmm7                              //  d < radius
        movmskps    ecx, xmm4
        cmpltps     xmm5, xmm7                              // -d < radius
        movmskps    ebx, xmm5
        shl         cx, 4                                   // d < radius results go to bits 4..7
        or          cl, bl                                  // -d < radius results go to bits 0..3
        inc         edi
        or          dl, cl
        add         eax, DRAWVERT_SIZE                      // sets the flags for 'jl'; the mov below does not change them
        mov         byte ptr [edi-1], cl
        jl          loopVert

    done:
        mov         esi, totalOr                            // totalOr is a reference: this loads its address
        mov         byte ptr [esi], dl
        pop         ebx
    }

#else

    // reference implementation
    int i;
    byte tOr;

    tOr = 0;

    for ( i = 0; i < numVerts; i++ ) {
        byte bits;
        float d0, d1, d2, d3, t;
        const idVec3 &v = verts[i].xyz;

        d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
        d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
        d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
        d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];

        t = d0 + radius;
        bits  = FLOATSIGNBITSET( t ) << 0;
        t = d1 + radius;
        bits |= FLOATSIGNBITSET( t ) << 1;
        t = d2 + radius;
        bits |= FLOATSIGNBITSET( t ) << 2;
        t = d3 + radius;
        bits |= FLOATSIGNBITSET( t ) << 3;

        t = d0 - radius;
        bits |= FLOATSIGNBITSET( t ) << 4;
        t = d1 - radius;
        bits |= FLOATSIGNBITSET( t ) << 5;
        t = d2 - radius;
        bits |= FLOATSIGNBITSET( t ) << 6;
        t = d3 - radius;
        bits |= FLOATSIGNBITSET( t ) << 7;

        bits ^= 0x0F;       // flip lower four bits

        tOr |= bits;
        cullBits[i] = bits;
    }

    totalOr = tOr;

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::DecalPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
#if 1
|
|
|
|
ALIGN16( float p0[4] );
|
|
ALIGN16( float p1[4] );
|
|
ALIGN16( float p2[4] );
|
|
ALIGN16( float p3[4] );
|
|
ALIGN16( float p4[4] );
|
|
ALIGN16( float p5[4] );
|
|
ALIGN16( float p6[4] );
|
|
ALIGN16( float p7[4] );
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
|
|
|
|
__asm {
|
|
mov ecx, planes
|
|
movlps xmm1, [ecx] // xmm1 = 0, 1, X, X
|
|
movhps xmm1, [ecx+16] // xmm1 = 0, 1, 4, 5
|
|
movlps xmm3, [ecx+8] // xmm3 = 2, 3, X, X
|
|
movhps xmm3, [ecx+24] // xmm3 = 2, 3, 6, 7
|
|
movlps xmm4, [ecx+32] // xmm4 = 8, 9, X, X
|
|
movhps xmm4, [ecx+48] // xmm4 = 8, 9, 12, 13
|
|
movlps xmm5, [ecx+40] // xmm5 = 10, 11, X, X
|
|
movhps xmm5, [ecx+56] // xmm5 = 10, 11, 14, 15
|
|
movaps xmm0, xmm1 // xmm0 = 0, 1, 4, 5
|
|
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm0 = 0, 4, 8, 12
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm1 = 1, 5, 9, 13
|
|
movaps xmm2, xmm3 // xmm2 = 2, 3, 6, 7
|
|
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm2 = 2, 6, 10, 14
|
|
shufps xmm3, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm3 = 3, 7, 11, 15
|
|
|
|
movaps p0, xmm0
|
|
movaps p1, xmm1
|
|
movaps p2, xmm2
|
|
movaps p3, xmm3
|
|
|
|
movlps xmm4, [ecx+64] // xmm4 = p40, p41, X, X
|
|
movhps xmm4, [ecx+80] // xmm4 = p40, p41, p50, p51
|
|
movaps xmm5, xmm4 // xmm5 = p40, p41, p50, p51
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm4 = p40, p50, p40, p50
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm5 = p41, p51, p41, p51
|
|
movlps xmm6, [ecx+72] // xmm6 = p42, p43, X, X
|
|
movhps xmm6, [ecx+88] // xmm6 = p42, p43, p52, p53
|
|
movaps xmm7, xmm6 // xmm7 = p42, p43, p52, p53
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 ) // xmm6 = p42, p52, p42, p52
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 1, 3, 1, 3 ) // xmm7 = p43, p53, p43, p53
|
|
|
|
movaps p4, xmm4
|
|
movaps p5, xmm5
|
|
movaps p6, xmm6
|
|
movaps p7, xmm7
|
|
|
|
mov esi, verts
|
|
mov edi, cullBits
|
|
mov eax, numVerts
|
|
and eax, ~1
|
|
jz done2
|
|
imul eax, DRAWVERT_SIZE
|
|
add esi, eax
|
|
neg eax
|
|
|
|
loopVert2:
|
|
movaps xmm6, p0
|
|
movss xmm0, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, xmm0
|
|
movaps xmm7, p1
|
|
movss xmm1, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movaps xmm7, p2
|
|
movss xmm2, [esi+eax+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
addps xmm6, p3
|
|
|
|
cmpnltps xmm6, SIMD_SP_zero
|
|
movmskps ecx, xmm6
|
|
|
|
movaps xmm6, p0
|
|
movss xmm3, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, xmm3
|
|
movaps xmm7, p1
|
|
movss xmm4, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm4
|
|
addps xmm6, xmm7
|
|
movaps xmm7, p2
|
|
movss xmm5, [esi+eax+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm5
|
|
addps xmm6, xmm7
|
|
addps xmm6, p3
|
|
|
|
cmpnltps xmm6, SIMD_SP_zero
|
|
movmskps edx, xmm6
|
|
mov ch, dl
|
|
|
|
shufps xmm0, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm0, p4
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm1, p5
|
|
addps xmm0, xmm1
|
|
shufps xmm2, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm2, p6
|
|
addps xmm0, xmm2
|
|
addps xmm0, p7
|
|
|
|
cmpnltps xmm0, SIMD_SP_zero
|
|
movmskps edx, xmm0
|
|
|
|
add edi, 2
|
|
|
|
mov dh, dl
|
|
shl dl, 4
|
|
shl dh, 2
|
|
and edx, (3<<4)|(3<<12)
|
|
or ecx, edx
|
|
|
|
add eax, 2*DRAWVERT_SIZE
|
|
mov word ptr [edi-2], cx
|
|
jl loopVert2
|
|
|
|
done2:
|
|
|
|
mov eax, numVerts
|
|
and eax, 1
|
|
jz done
|
|
|
|
movaps xmm6, p0
|
|
movss xmm0, [esi+DRAWVERT_XYZ_OFFSET+0]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm6, xmm0
|
|
movaps xmm7, p1
|
|
movss xmm1, [esi+DRAWVERT_XYZ_OFFSET+4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm1
|
|
addps xmm6, xmm7
|
|
movaps xmm7, p2
|
|
movss xmm2, [esi+DRAWVERT_XYZ_OFFSET+8]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm7, xmm2
|
|
addps xmm6, xmm7
|
|
addps xmm6, p3
|
|
|
|
cmpnltps xmm6, SIMD_SP_zero
|
|
movmskps ecx, xmm6
|
|
|
|
mulps xmm0, p4
|
|
mulps xmm1, p5
|
|
addps xmm0, xmm1
|
|
mulps xmm2, p6
|
|
addps xmm0, xmm2
|
|
addps xmm0, p7
|
|
|
|
cmpnltps xmm0, SIMD_SP_zero
|
|
movmskps edx, xmm0
|
|
|
|
and edx, 3
|
|
shl edx, 4
|
|
or ecx, edx
|
|
|
|
mov byte ptr [edi], cl
|
|
|
|
done:
|
|
}
|
|
|
|
|
|
#else
|
|
|
|
int i;
|
|
|
|
for ( i = 0; i < numVerts; i += 2 ) {
|
|
unsigned short bits0, bits1;
|
|
float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
|
|
const idVec3 &v0 = verts[i+0].xyz;
|
|
const idVec3 &v1 = verts[i+1].xyz;
|
|
|
|
d0 = planes[0][0] * v0[0] + planes[0][1] * v0[1] + planes[0][2] * v0[2] + planes[0][3];
|
|
d1 = planes[1][0] * v0[0] + planes[1][1] * v0[1] + planes[1][2] * v0[2] + planes[1][3];
|
|
d2 = planes[2][0] * v0[0] + planes[2][1] * v0[1] + planes[2][2] * v0[2] + planes[2][3];
|
|
d3 = planes[3][0] * v0[0] + planes[3][1] * v0[1] + planes[3][2] * v0[2] + planes[3][3];
|
|
|
|
d4 = planes[4][0] * v0[0] + planes[4][1] * v0[1] + planes[4][2] * v0[2] + planes[4][3];
|
|
d5 = planes[5][0] * v0[0] + planes[5][1] * v0[1] + planes[5][2] * v0[2] + planes[5][3];
|
|
d10 = planes[4][0] * v1[0] + planes[4][1] * v1[1] + planes[4][2] * v1[2] + planes[4][3];
|
|
d11 = planes[5][0] * v1[0] + planes[5][1] * v1[1] + planes[5][2] * v1[2] + planes[5][3];
|
|
|
|
d6 = planes[0][0] * v1[0] + planes[0][1] * v1[1] + planes[0][2] * v1[2] + planes[0][3];
|
|
d7 = planes[1][0] * v1[0] + planes[1][1] * v1[1] + planes[1][2] * v1[2] + planes[1][3];
|
|
d8 = planes[2][0] * v1[0] + planes[2][1] * v1[1] + planes[2][2] * v1[2] + planes[2][3];
|
|
d9 = planes[3][0] * v1[0] + planes[3][1] * v1[1] + planes[3][2] * v1[2] + planes[3][3];
|
|
|
|
bits0 = FLOATSIGNBITSET( d0 ) << (0+0);
|
|
bits0 |= FLOATSIGNBITSET( d1 ) << (0+1);
|
|
bits0 |= FLOATSIGNBITSET( d2 ) << (0+2);
|
|
bits0 |= FLOATSIGNBITSET( d3 ) << (0+3);
|
|
bits0 |= FLOATSIGNBITSET( d4 ) << (0+4);
|
|
bits0 |= FLOATSIGNBITSET( d5 ) << (0+5);
|
|
|
|
bits1 = FLOATSIGNBITSET( d6 ) << (8+0);
|
|
bits1 |= FLOATSIGNBITSET( d7 ) << (8+1);
|
|
bits1 |= FLOATSIGNBITSET( d8 ) << (8+2);
|
|
bits1 |= FLOATSIGNBITSET( d9 ) << (8+3);
|
|
bits1 |= FLOATSIGNBITSET( d10 ) << (8+4);
|
|
bits1 |= FLOATSIGNBITSET( d11 ) << (8+5);
|
|
|
|
*(unsigned short *)(cullBits + i) = ( bits0 | bits1 ) ^ 0x3F3F;
|
|
}
|
|
|
|
if ( numVerts & 1 ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, d4, d5;
|
|
const idVec3 &v = verts[numVerts - 1].xyz;
|
|
|
|
d0 = planes[0][0] * v[0] + planes[0][1] * v[1] + planes[0][2] * v[2] + planes[0][3];
|
|
d1 = planes[1][0] * v[0] + planes[1][1] * v[1] + planes[1][2] * v[2] + planes[1][3];
|
|
d2 = planes[2][0] * v[0] + planes[2][1] * v[1] + planes[2][2] * v[2] + planes[2][3];
|
|
d3 = planes[3][0] * v[0] + planes[3][1] * v[1] + planes[3][2] * v[2] + planes[3][3];
|
|
|
|
d4 = planes[4][0] * v[0] + planes[4][1] * v[1] + planes[4][2] * v[2] + planes[4][3];
|
|
d5 = planes[5][0] * v[0] + planes[5][1] * v[1] + planes[5][2] * v[2] + planes[5][3];
|
|
|
|
bits = FLOATSIGNBITSET( d0 ) << 0;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 1;
|
|
bits |= FLOATSIGNBITSET( d2 ) << 2;
|
|
bits |= FLOATSIGNBITSET( d3 ) << 3;
|
|
|
|
bits |= FLOATSIGNBITSET( d4 ) << 4;
|
|
bits |= FLOATSIGNBITSET( d5 ) << 5;
|
|
|
|
cullBits[numVerts - 1] = bits ^ 0x3F; // flip lower 6 bits
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::OverlayPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// For every vertex v and the two planes planes[0] / planes[1]:
	//   d0 = planes[0][0..2] . v.xyz + planes[0][3]
	//   d1 = planes[1][0..2] . v.xyz + planes[1][3]
	// texCoords[v] receives ( d0, d1 ) and cullBits[v] receives a 4 bit mask:
	//   bit 0: sign bit of d0          bit 1: sign bit of d1
	//   bit 2: sign bit of 1.0f - d0   bit 3: sign bit of 1.0f - d1
	// Vertices are processed two at a time; an odd last vertex is handled separately.
	// The paired loop stores with movaps, so texCoords has to be 16 byte aligned.
#if 1

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	__asm {
		mov			eax, numVerts
		mov			edx, verts
		mov			esi, texCoords
		mov			edi, cullBits

		// interleave the two planes component wise (planes are 16 bytes apart):
		// xmm4..xmm7 = ( planes[0][k], planes[1][k], planes[0][k], planes[1][k] ) for k = 0..3
		mov			ecx, planes
		movss		xmm4, [ecx+ 0]
		movss		xmm5, [ecx+16]
		shufps		xmm4, xmm5, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps		xmm4, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 )
		movss		xmm5, [ecx+ 4]
		movss		xmm6, [ecx+20]
		shufps		xmm5, xmm6, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps		xmm5, xmm5, R_SHUFFLEPS( 0, 2, 0, 2 )
		movss		xmm6, [ecx+ 8]
		movss		xmm7, [ecx+24]
		shufps		xmm6, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps		xmm6, xmm6, R_SHUFFLEPS( 0, 2, 0, 2 )
		movss		xmm7, [ecx+12]
		movss		xmm0, [ecx+28]
		shufps		xmm7, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		shufps		xmm7, xmm7, R_SHUFFLEPS( 0, 2, 0, 2 )

		// eax = number of vertices handled in pairs; skip the paired loop if there are none
		and			eax, ~1
		jz			done2
		// edi points past the paired cull bytes, eax counts up from -pairedCount to zero
		add			edi, eax
		neg			eax

	loopVert2:
		// xmm0 = ( v0.x, v0.x, v1.x, v1.x ) * plane x components
		movss		xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm1, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm0, xmm4
		// xmm1 = ( v0.y, v0.y, v1.y, v1.y ) * plane y components
		movss		xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm2, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps		xmm1, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm1, xmm5
		// xmm2 = ( v0.z, v0.z, v1.z, v1.z ) * plane z components
		movss		xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		movss		xmm3, [edx+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		shufps		xmm2, xmm3, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm2, xmm6
		// xmm0 = ( d0, d1, d2, d3 ): both plane distances for both vertices
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		addps		xmm0, xmm7
		// the four distances are the two texture coordinate pairs
		movaps		[esi], xmm0
		movaps		xmm1, xmm0
		// xmm2 = 1.0f - distances
		movaps		xmm2, SIMD_SP_one
		subps		xmm2, xmm0
		// xmm0 = ( d0, d1, 1-d0, 1-d1 ), xmm1 = ( d2, d3, 1-d2, 1-d3 )
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		shufps		xmm1, xmm2, R_SHUFFLEPS( 2, 3, 2, 3 )
		add			edx, 2*DRAWVERT_SIZE
		// the four sign bits form the cull mask of the first vertex
		movmskps	ecx, xmm0
		mov			byte ptr [edi+eax+0], cl
		add			esi, 4*4
		// and of the second vertex
		movmskps	ecx, xmm1
		mov			byte ptr [edi+eax+1], cl
		add			eax, 2
		jl			loopVert2

	done2:
		// handle the odd last vertex, if any; edx, esi and edi already point at its slots
		mov			eax, numVerts
		and			eax, 1
		jz			done

		movss		xmm0, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
		shufps		xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm0, xmm4
		movss		xmm1, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
		shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm1, xmm5
		movss		xmm2, [edx+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
		shufps		xmm2, xmm2, R_SHUFFLEPS( 0, 0, 0, 0 )
		mulps		xmm2, xmm6
		addps		xmm0, xmm1
		addps		xmm0, xmm2
		addps		xmm0, xmm7
		// only the low two floats ( d0, d1 ) are stored for the single vertex
		movlps		[esi], xmm0
		movaps		xmm1, xmm0
		movaps		xmm2, SIMD_SP_one
		subps		xmm2, xmm0
		shufps		xmm0, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
		movmskps	ecx, xmm0
		mov			byte ptr [edi], cl

	done:
	}

#else

	// scalar reference implementation of the assembly above

	const idPlane &p0 = planes[0];
	const idPlane &p1 = planes[1];

	for ( int i = 0; i < numVerts - 1; i += 2 ) {
		unsigned short bits;
		float d0, d1, d2, d3;

		const idVec3 &v0 = verts[i+0].xyz;
		const idVec3 &v1 = verts[i+1].xyz;

		d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];
		d2 = p0[0] * v1[0] + p0[1] * v1[1] + p0[2] * v1[2] + p0[3];
		d3 = p1[0] * v1[0] + p1[1] * v1[1] + p1[2] * v1[2] + p1[3];

		texCoords[i+0][0] = d0;
		texCoords[i+0][1] = d1;
		texCoords[i+1][0] = d2;
		texCoords[i+1][1] = d3;

		// low byte holds the bits of vertex i, high byte the bits of vertex i+1
		bits  = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;
		bits |= FLOATSIGNBITSET( d2 ) << 8;
		bits |= FLOATSIGNBITSET( d3 ) << 9;

		d0 = 1.0f - d0;
		d1 = 1.0f - d1;
		d2 = 1.0f - d2;
		d3 = 1.0f - d3;

		bits |= FLOATSIGNBITSET( d0 ) << 2;
		bits |= FLOATSIGNBITSET( d1 ) << 3;
		bits |= FLOATSIGNBITSET( d2 ) << 10;
		bits |= FLOATSIGNBITSET( d3 ) << 11;

		*(unsigned short *)(cullBits + i) = bits;
	}

	if ( numVerts & 1 ) {
		byte bits;
		float d0, d1;

		// the loop index is no longer in scope here, so address the last vertex explicitly
		const int last = numVerts - 1;
		const idVec3 &v0 = verts[last].xyz;

		d0 = p0[0] * v0[0] + p0[1] * v0[1] + p0[2] * v0[2] + p0[3];
		d1 = p1[0] * v0[0] + p1[1] * v0[1] + p1[2] * v0[2] + p1[3];

		texCoords[last][0] = d0;
		texCoords[last][1] = d1;

		bits  = FLOATSIGNBITSET( d0 ) << 0;
		bits |= FLOATSIGNBITSET( d1 ) << 1;

		d0 = 1.0f - d0;
		d1 = 1.0f - d1;

		bits |= FLOATSIGNBITSET( d0 ) << 2;
		bits |= FLOATSIGNBITSET( d1 ) << 3;

		cullBits[last] = bits;
	}

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::DeriveTriPlanes
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	// Writes one plane per triangle (three consecutive indexes) to planes[].
	// With a, b, c the three vertices of a triangle:
	//   normal = ( (c-a).y*(b-a).z - (c-a).z*(b-a).y,
	//              (c-a).z*(b-a).x - (c-a).x*(b-a).z,
	//              (c-a).x*(b-a).y - (c-a).y*(b-a).x ) scaled by the reciprocal square root of its squared length
	//   fourth plane component = -( normal . a )
	// Triangles are processed four at a time as long as at least 12 indexes remain,
	// the remaining triangles are processed one at a time.
	// numVerts is not referenced by this function.
#if 1

	assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
	assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );

	__asm {
		// eax = size of the index list in bytes
		mov			eax, numIndexes
		shl			eax, 2
		mov			esi, verts
		mov			edi, indexes
		mov			edx, planes

		// edi points past the last index, eax is the negative byte offset of the next triangle
		add			edi, eax
		neg			eax

		// bias eax by four triangles (4*12 bytes) and skip the 4-wide loop if fewer
		// than four triangles are available; exactly four (eax == 0) are still
		// processed 4-wide, matching the loop condition at the bottom (jle)
		add			eax, 4*12
		jg			done4

	loopPlane4:
		// triangle 0 of the batch: ebx = vertex b, ecx = vertex a
		mov			ebx, [edi+eax-4*12+4]
		imul		ebx, DRAWVERT_SIZE
		mov			ecx, [edi+eax-4*12+0]
		imul		ecx, DRAWVERT_SIZE

		// xmm0, xmm1, xmm2 = b - a
		movss		xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]

		movss		xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]

		movss		xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		// ebx = vertex c
		mov			ebx, [edi+eax-4*12+8]
		imul		ebx, DRAWVERT_SIZE

		// rotate the new values up one element to free element 0 for the next triangle
		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )

		// xmm3, xmm4, xmm5 = c - a
		movss		xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]

		movss		xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]

		movss		xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		// triangle 1 of the batch
		mov			ebx, [edi+eax-3*12+4]
		imul		ebx, DRAWVERT_SIZE
		mov			ecx, [edi+eax-3*12+0]
		imul		ecx, DRAWVERT_SIZE

		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )

		// b - a, computed in xmm6/xmm7 and inserted into element 0 of xmm0, xmm1, xmm2
		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm0, xmm6

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm1, xmm7

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss		xmm2, xmm6

		mov			ebx, [edi+eax-3*12+8]
		imul		ebx, DRAWVERT_SIZE

		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )

		// c - a, inserted into element 0 of xmm3, xmm4, xmm5
		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm3, xmm7

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm4, xmm6

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss		xmm5, xmm7

		// triangle 2 of the batch
		mov			ebx, [edi+eax-2*12+4]
		imul		ebx, DRAWVERT_SIZE
		mov			ecx, [edi+eax-2*12+0]
		imul		ecx, DRAWVERT_SIZE

		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm0, xmm6

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm1, xmm7

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss		xmm2, xmm6

		mov			ebx, [edi+eax-2*12+8]
		imul		ebx, DRAWVERT_SIZE

		shufps		xmm0, xmm0, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm3, xmm7

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm4, xmm6

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss		xmm5, xmm7

		// triangle 3 of the batch; it stays in element 0, so no rotate follows its loads
		mov			ebx, [edi+eax-1*12+4]
		imul		ebx, DRAWVERT_SIZE
		mov			ecx, [edi+eax-1*12+0]
		imul		ecx, DRAWVERT_SIZE

		shufps		xmm3, xmm3, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm4, xmm4, R_SHUFFLEPS( 3, 0, 1, 2 )
		shufps		xmm5, xmm5, R_SHUFFLEPS( 3, 0, 1, 2 )

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm0, xmm6

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm1, xmm7

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss		xmm2, xmm6

		mov			ebx, [edi+eax-1*12+8]
		imul		ebx, DRAWVERT_SIZE

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		movss		xmm3, xmm7

		movss		xmm6, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm6, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		movss		xmm4, xmm6

		movss		xmm7, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm7, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]
		movss		xmm5, xmm7

		// here xmm0-xmm2 = (b-a).xyz and xmm3-xmm5 = (c-a).xyz,
		// element 0 = triangle 3 ... element 3 = triangle 0

		// xmm6 = (c-a).y * (b-a).z - (c-a).z * (b-a).y
		movaps		xmm6, xmm4
		mulps		xmm6, xmm2
		movaps		xmm7, xmm5
		mulps		xmm7, xmm1
		subps		xmm6, xmm7

		// xmm5 = (c-a).z * (b-a).x - (c-a).x * (b-a).z
		mulps		xmm5, xmm0
		mulps		xmm2, xmm3
		subps		xmm5, xmm2

		// xmm3 = (c-a).x * (b-a).y - (c-a).y * (b-a).x
		mulps		xmm3, xmm1
		mulps		xmm4, xmm0
		subps		xmm3, xmm4

		// keep the unnormalized normal in xmm0, xmm1, xmm2 and square the components
		movaps		xmm0, xmm6
		mulps		xmm6, xmm6
		movaps		xmm1, xmm5
		mulps		xmm5, xmm5
		movaps		xmm2, xmm3
		mulps		xmm3, xmm3

		// xmm3 = approximate reciprocal square root of the squared length
		addps		xmm3, xmm5
		addps		xmm3, xmm6
		rsqrtps		xmm3, xmm3

		// advance the output pointer past the four planes of this batch
		add			edx, 4*16
		mov			ecx, [edi+eax-1*12+0]
		imul		ecx, DRAWVERT_SIZE

		// normalize
		mulps		xmm0, xmm3
		mulps		xmm1, xmm3
		mulps		xmm2, xmm3

		// plane of triangle 3 (element 0): store the normal
		movss		[edx-1*16+0], xmm0
		movss		[edx-1*16+4], xmm1
		movss		[edx-1*16+8], xmm2

		// fourth component = -( normal . a )
		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		xorps		xmm0, SIMD_SP_singleSignBitMask
		subss		xmm0, xmm1
		subss		xmm0, xmm2
		movss		[edx-1*16+12], xmm0

		// plane of triangle 2: rotate its normal down into element 0
		mov			ecx, [edi+eax-2*12+0]
		imul		ecx, DRAWVERT_SIZE

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[edx-2*16+0], xmm0
		movss		[edx-2*16+4], xmm1
		movss		[edx-2*16+8], xmm2

		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		xorps		xmm0, SIMD_SP_singleSignBitMask
		subss		xmm0, xmm1
		subss		xmm0, xmm2
		movss		[edx-2*16+12], xmm0

		// plane of triangle 1
		mov			ecx, [edi+eax-3*12+0]
		imul		ecx, DRAWVERT_SIZE

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[edx-3*16+0], xmm0
		movss		[edx-3*16+4], xmm1
		movss		[edx-3*16+8], xmm2

		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		xorps		xmm0, SIMD_SP_singleSignBitMask
		subss		xmm0, xmm1
		subss		xmm0, xmm2
		movss		[edx-3*16+12], xmm0

		// plane of triangle 0
		mov			ecx, [edi+eax-4*12+0]
		imul		ecx, DRAWVERT_SIZE

		shufps		xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
		shufps		xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )

		movss		[edx-4*16+0], xmm0
		movss		[edx-4*16+4], xmm1
		movss		[edx-4*16+8], xmm2

		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		xorps		xmm0, SIMD_SP_singleSignBitMask
		subss		xmm0, xmm1
		subss		xmm0, xmm2
		movss		[edx-4*16+12], xmm0

		// next batch while at least four triangles remain
		add			eax, 4*12
		jle			loopPlane4

	done4:

		// remove the four triangle bias; eax is again the negative byte offset of the
		// next unprocessed triangle, zero if nothing is left
		sub			eax, 4*12
		jge			done

	loopPlane1:
		// ebx = vertex b, ecx = vertex a
		mov			ebx, [edi+eax+4]
		imul		ebx, DRAWVERT_SIZE
		mov			ecx, [edi+eax+0]
		imul		ecx, DRAWVERT_SIZE

		// xmm0, xmm1, xmm2 = b - a
		movss		xmm0, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]

		movss		xmm1, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]

		movss		xmm2, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		// ebx = vertex c
		mov			ebx, [edi+eax+8]
		imul		ebx, DRAWVERT_SIZE

		// xmm3, xmm4, xmm5 = c - a
		movss		xmm3, [esi+ebx+DRAWVERT_XYZ_OFFSET+0]
		subss		xmm3, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]

		movss		xmm4, [esi+ebx+DRAWVERT_XYZ_OFFSET+4]
		subss		xmm4, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]

		movss		xmm5, [esi+ebx+DRAWVERT_XYZ_OFFSET+8]
		subss		xmm5, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		// same cross product as in the 4-wide loop, on element 0 only
		movss		xmm6, xmm4
		mulss		xmm6, xmm2
		movss		xmm7, xmm5
		mulss		xmm7, xmm1
		subss		xmm6, xmm7

		mulss		xmm5, xmm0
		mulss		xmm2, xmm3
		subss		xmm5, xmm2

		mulss		xmm3, xmm1
		mulss		xmm4, xmm0
		subss		xmm3, xmm4

		movss		xmm0, xmm6
		mulss		xmm6, xmm6
		movss		xmm1, xmm5
		mulss		xmm5, xmm5
		movss		xmm2, xmm3
		mulss		xmm3, xmm3

		// xmm3 = approximate reciprocal square root of the squared length
		addss		xmm3, xmm5
		addss		xmm3, xmm6
		rsqrtss		xmm3, xmm3

		add			edx, 1*16

		// normalize and store the normal
		mulss		xmm0, xmm3
		mulss		xmm1, xmm3
		mulss		xmm2, xmm3

		movss		[edx-1*16+0], xmm0
		movss		[edx-1*16+4], xmm1
		movss		[edx-1*16+8], xmm2

		// fourth component = -( normal . a )
		mulss		xmm0, [esi+ecx+DRAWVERT_XYZ_OFFSET+0]
		mulss		xmm1, [esi+ecx+DRAWVERT_XYZ_OFFSET+4]
		mulss		xmm2, [esi+ecx+DRAWVERT_XYZ_OFFSET+8]

		xorps		xmm0, SIMD_SP_singleSignBitMask
		subss		xmm0, xmm1
		subss		xmm0, xmm2
		movss		[edx-1*16+12], xmm0

		add			eax, 1*12
		jl			loopPlane1

	done:
	}

#else

	// scalar reference implementation of the assembly above

	int i, j;

	// four triangles per iteration; lane j of every array belongs to triangle j of the batch
	for ( i = 0; i <= numIndexes - 12; i += 12 ) {
		ALIGN16( float d0[4] );
		ALIGN16( float d1[4] );
		ALIGN16( float d2[4] );
		ALIGN16( float d3[4] );
		ALIGN16( float d4[4] );
		ALIGN16( float d5[4] );
		ALIGN16( float n0[4] );
		ALIGN16( float n1[4] );
		ALIGN16( float n2[4] );

		for ( j = 0; j < 4; j++ ) {
			const idDrawVert *a, *b, *c;

			a = verts + indexes[i + j * 3 + 0];
			b = verts + indexes[i + j * 3 + 1];
			c = verts + indexes[i + j * 3 + 2];

			d0[j] = b->xyz[0] - a->xyz[0];
			d1[j] = b->xyz[1] - a->xyz[1];
			d2[j] = b->xyz[2] - a->xyz[2];

			d3[j] = c->xyz[0] - a->xyz[0];
			d4[j] = c->xyz[1] - a->xyz[1];
			d5[j] = c->xyz[2] - a->xyz[2];
		}

		ALIGN16( float tmp[4] );

		for ( j = 0; j < 4; j++ ) {
			// cross product of the two edges
			n0[j] = d4[j] * d2[j];
			n0[j] -= d5[j] * d1[j];

			n1[j] = d5[j] * d0[j];
			n1[j] -= d3[j] * d2[j];

			n2[j] = d3[j] * d1[j];
			n2[j] -= d4[j] * d0[j];

			// scale by the reciprocal square root of the squared length
			tmp[j] = n0[j] * n0[j];
			tmp[j] += n1[j] * n1[j];
			tmp[j] += n2[j] * n2[j];

			tmp[j] = idMath::RSqrt( tmp[j] );

			n0[j] *= tmp[j];
			n1[j] *= tmp[j];
			n2[j] *= tmp[j];
		}

		for ( j = 0; j < 4; j++ ) {
			const idDrawVert *a;

			a = verts + indexes[i + j * 3];

			planes->Normal()[0] = n0[j];
			planes->Normal()[1] = n1[j];
			planes->Normal()[2] = n2[j];
			planes->FitThroughPoint( a->xyz );
			planes++;
		}
	}

	// remaining triangles, one per iteration
	for ( ; i < numIndexes; i += 3 ) {
		const idDrawVert *a, *b, *c;
		float d0, d1, d2, d3, d4, d5;
		float n0, n1, n2;

		a = verts + indexes[i + 0];
		b = verts + indexes[i + 1];
		c = verts + indexes[i + 2];

		d0 = b->xyz[0] - a->xyz[0];
		d1 = b->xyz[1] - a->xyz[1];
		d2 = b->xyz[2] - a->xyz[2];

		d3 = c->xyz[0] - a->xyz[0];
		d4 = c->xyz[1] - a->xyz[1];
		d5 = c->xyz[2] - a->xyz[2];

		float tmp;

		n0 = d4 * d2 - d5 * d1;
		n1 = d5 * d0 - d3 * d2;
		n2 = d3 * d1 - d4 * d0;

		tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );

		n0 *= tmp;
		n1 *= tmp;
		n2 *= tmp;

		planes->Normal()[0] = n0;
		planes->Normal()[1] = n1;
		planes->Normal()[2] = n2;
		planes->FitThroughPoint( a->xyz );
		planes++;
	}

#endif
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::DeriveTangents
|
|
============
|
|
*/
|
|
// When defined, DeriveTangents follows each rsqrtps with an extra refinement
// sequence that uses SIMD_SP_rsqrt_c0 and SIMD_SP_rsqrt_c1 (currently disabled).
//#define REFINE_TANGENT_SQUAREROOT
|
|
// When defined, DeriveTangents replaces squared lengths that are exactly zero
// with SIMD_SP_tiny and clears the sign bit before taking the reciprocal square root.
#define FIX_DEGENERATE_TANGENT
|
|
|
|
void VPCALL idSIMD_SSE::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
int i;
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
|
|
|
|
assert( planes != NULL );
|
|
assert( verts != NULL );
|
|
assert( numVerts >= 0 );
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
__asm {
|
|
movaps xmm6, SIMD_SP_rsqrt_c0
|
|
movaps xmm7, SIMD_SP_rsqrt_c1
|
|
}
|
|
#endif
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
for ( i = 0; i <= numIndexes - 12; i += 12 ) {
|
|
idDrawVert *a, *b, *c;
|
|
ALIGN16( unsigned int signBit[4] );
|
|
ALIGN16( float d0[4] );
|
|
ALIGN16( float d1[4] );
|
|
ALIGN16( float d2[4] );
|
|
ALIGN16( float d3[4] );
|
|
ALIGN16( float d4[4] );
|
|
ALIGN16( float d5[4] );
|
|
ALIGN16( float d6[4] );
|
|
ALIGN16( float d7[4] );
|
|
ALIGN16( float d8[4] );
|
|
ALIGN16( float d9[4] );
|
|
ALIGN16( float n0[4] );
|
|
ALIGN16( float n1[4] );
|
|
ALIGN16( float n2[4] );
|
|
ALIGN16( float t0[4] );
|
|
ALIGN16( float t1[4] );
|
|
ALIGN16( float t2[4] );
|
|
ALIGN16( float t3[4] );
|
|
ALIGN16( float t4[4] );
|
|
ALIGN16( float t5[4] );
|
|
|
|
for ( int j = 0; j < 4; j++ ) {
|
|
|
|
a = verts + indexes[i + j * 3 + 0];
|
|
b = verts + indexes[i + j * 3 + 1];
|
|
c = verts + indexes[i + j * 3 + 2];
|
|
|
|
d0[j] = b->xyz[0] - a->xyz[0];
|
|
d1[j] = b->xyz[1] - a->xyz[1];
|
|
d2[j] = b->xyz[2] - a->xyz[2];
|
|
d3[j] = b->st[0] - a->st[0];
|
|
d4[j] = b->st[1] - a->st[1];
|
|
|
|
d5[j] = c->xyz[0] - a->xyz[0];
|
|
d6[j] = c->xyz[1] - a->xyz[1];
|
|
d7[j] = c->xyz[2] - a->xyz[2];
|
|
d8[j] = c->st[0] - a->st[0];
|
|
d9[j] = c->st[1] - a->st[1];
|
|
}
|
|
|
|
#if 1
|
|
|
|
__asm {
|
|
// normal
|
|
movaps xmm0, d6
|
|
mulps xmm0, d2
|
|
movaps xmm1, d7
|
|
mulps xmm1, d1
|
|
subps xmm0, xmm1
|
|
|
|
movaps xmm1, d7
|
|
mulps xmm1, d0
|
|
movaps xmm2, d5
|
|
mulps xmm2, d2
|
|
subps xmm1, xmm2
|
|
|
|
movaps xmm2, d5
|
|
mulps xmm2, d1
|
|
movaps xmm3, d6
|
|
mulps xmm3, d0
|
|
subps xmm2, xmm3
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, xmm3
|
|
mulps xmm4, xmm4
|
|
mulps xmm5, xmm5
|
|
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
#ifdef FIX_DEGENERATE_TANGENT
|
|
xorps xmm4, xmm4
|
|
cmpeqps xmm4, xmm3
|
|
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
|
|
andps xmm3, SIMD_SP_absMask // make sure the values are positive
|
|
orps xmm3, xmm4
|
|
#endif
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtps xmm4, xmm3
|
|
mulps xmm3, xmm4
|
|
mulps xmm3, xmm4
|
|
subps xmm3, xmm6
|
|
mulps xmm4, xmm7
|
|
mulps xmm3, xmm4
|
|
#else
|
|
rsqrtps xmm3, xmm3
|
|
#endif
|
|
mulps xmm0, xmm3
|
|
movaps n0, xmm0
|
|
mulps xmm1, xmm3
|
|
movaps n1, xmm1
|
|
mulps xmm2, xmm3
|
|
movaps n2, xmm2
|
|
|
|
// area sign bit
|
|
movaps xmm0, d3
|
|
mulps xmm0, d9
|
|
movaps xmm1, d4
|
|
mulps xmm1, d8
|
|
subps xmm0, xmm1
|
|
andps xmm0, SIMD_SP_signBitMask
|
|
movaps signBit, xmm0
|
|
|
|
// first tangent
|
|
movaps xmm0, d0
|
|
mulps xmm0, d9
|
|
movaps xmm1, d4
|
|
mulps xmm1, d5
|
|
subps xmm0, xmm1
|
|
|
|
movaps xmm1, d1
|
|
mulps xmm1, d9
|
|
movaps xmm2, d4
|
|
mulps xmm2, d6
|
|
subps xmm1, xmm2
|
|
|
|
movaps xmm2, d2
|
|
mulps xmm2, d9
|
|
movaps xmm3, d4
|
|
mulps xmm3, d7
|
|
subps xmm2, xmm3
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, xmm3
|
|
mulps xmm4, xmm4
|
|
mulps xmm5, xmm5
|
|
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
#ifdef FIX_DEGENERATE_TANGENT
|
|
xorps xmm4, xmm4
|
|
cmpeqps xmm4, xmm3
|
|
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
|
|
andps xmm3, SIMD_SP_absMask // make sure the values are positive
|
|
orps xmm3, xmm4
|
|
#endif
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtps xmm4, xmm3
|
|
mulps xmm3, xmm4
|
|
mulps xmm3, xmm4
|
|
subps xmm3, xmm6
|
|
mulps xmm4, xmm7
|
|
mulps xmm3, xmm4
|
|
#else
|
|
rsqrtps xmm3, xmm3
|
|
#endif
|
|
xorps xmm3, signBit
|
|
|
|
mulps xmm0, xmm3
|
|
movaps t0, xmm0
|
|
mulps xmm1, xmm3
|
|
movaps t1, xmm1
|
|
mulps xmm2, xmm3
|
|
movaps t2, xmm2
|
|
|
|
// second tangent
|
|
movaps xmm0, d3
|
|
mulps xmm0, d5
|
|
movaps xmm1, d0
|
|
mulps xmm1, d8
|
|
subps xmm0, xmm1
|
|
|
|
movaps xmm1, d3
|
|
mulps xmm1, d6
|
|
movaps xmm2, d1
|
|
mulps xmm2, d8
|
|
subps xmm1, xmm2
|
|
|
|
movaps xmm2, d3
|
|
mulps xmm2, d7
|
|
movaps xmm3, d2
|
|
mulps xmm3, d8
|
|
subps xmm2, xmm3
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, xmm3
|
|
mulps xmm4, xmm4
|
|
mulps xmm5, xmm5
|
|
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
#ifdef FIX_DEGENERATE_TANGENT
|
|
xorps xmm4, xmm4
|
|
cmpeqps xmm4, xmm3
|
|
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
|
|
andps xmm3, SIMD_SP_absMask // make sure the values are positive
|
|
orps xmm3, xmm4
|
|
#endif
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtps xmm4, xmm3
|
|
mulps xmm3, xmm4
|
|
mulps xmm3, xmm4
|
|
subps xmm3, xmm6
|
|
mulps xmm4, xmm7
|
|
mulps xmm3, xmm4
|
|
#else
|
|
rsqrtps xmm3, xmm3
|
|
#endif
|
|
xorps xmm3, signBit
|
|
|
|
mulps xmm0, xmm3
|
|
movaps t3, xmm0
|
|
mulps xmm1, xmm3
|
|
movaps t4, xmm1
|
|
mulps xmm2, xmm3
|
|
movaps t5, xmm2
|
|
}
|
|
|
|
#else
|
|
|
|
ALIGN16( float tmp[4] );
|
|
|
|
// normal
|
|
n0[0] = d6[0] * d2[0];
|
|
n0[1] = d6[1] * d2[1];
|
|
n0[2] = d6[2] * d2[2];
|
|
n0[3] = d6[3] * d2[3];
|
|
|
|
n0[0] -= d7[0] * d1[0];
|
|
n0[1] -= d7[1] * d1[1];
|
|
n0[2] -= d7[2] * d1[2];
|
|
n0[3] -= d7[3] * d1[3];
|
|
|
|
n1[0] = d7[0] * d0[0];
|
|
n1[1] = d7[1] * d0[1];
|
|
n1[2] = d7[2] * d0[2];
|
|
n1[3] = d7[3] * d0[3];
|
|
|
|
n1[0] -= d5[0] * d2[0];
|
|
n1[1] -= d5[1] * d2[1];
|
|
n1[2] -= d5[2] * d2[2];
|
|
n1[3] -= d5[3] * d2[3];
|
|
|
|
n2[0] = d5[0] * d1[0];
|
|
n2[1] = d5[1] * d1[1];
|
|
n2[2] = d5[2] * d1[2];
|
|
n2[3] = d5[3] * d1[3];
|
|
|
|
n2[0] -= d6[0] * d0[0];
|
|
n2[1] -= d6[1] * d0[1];
|
|
n2[2] -= d6[2] * d0[2];
|
|
n2[3] -= d6[3] * d0[3];
|
|
|
|
tmp[0] = n0[0] * n0[0];
|
|
tmp[1] = n0[1] * n0[1];
|
|
tmp[2] = n0[2] * n0[2];
|
|
tmp[3] = n0[3] * n0[3];
|
|
|
|
tmp[0] += n1[0] * n1[0];
|
|
tmp[1] += n1[1] * n1[1];
|
|
tmp[2] += n1[2] * n1[2];
|
|
tmp[3] += n1[3] * n1[3];
|
|
|
|
tmp[0] += n2[0] * n2[0];
|
|
tmp[1] += n2[1] * n2[1];
|
|
tmp[2] += n2[2] * n2[2];
|
|
tmp[3] += n2[3] * n2[3];
|
|
|
|
tmp[0] = idMath::RSqrt( tmp[0] );
|
|
tmp[1] = idMath::RSqrt( tmp[1] );
|
|
tmp[2] = idMath::RSqrt( tmp[2] );
|
|
tmp[3] = idMath::RSqrt( tmp[3] );
|
|
|
|
n0[0] *= tmp[0];
|
|
n0[1] *= tmp[1];
|
|
n0[2] *= tmp[2];
|
|
n0[3] *= tmp[3];
|
|
|
|
n1[0] *= tmp[0];
|
|
n1[1] *= tmp[1];
|
|
n1[2] *= tmp[2];
|
|
n1[3] *= tmp[3];
|
|
|
|
n2[0] *= tmp[0];
|
|
n2[1] *= tmp[1];
|
|
n2[2] *= tmp[2];
|
|
n2[3] *= tmp[3];
|
|
|
|
// area sign bit
|
|
tmp[0] = d3[0] * d9[0];
|
|
tmp[1] = d3[1] * d9[1];
|
|
tmp[2] = d3[2] * d9[2];
|
|
tmp[3] = d3[3] * d9[3];
|
|
|
|
tmp[0] -= d4[0] * d8[0];
|
|
tmp[1] -= d4[1] * d8[1];
|
|
tmp[2] -= d4[2] * d8[2];
|
|
tmp[3] -= d4[3] * d8[3];
|
|
|
|
signBit[0] = ( *(unsigned int *)&tmp[0] ) & ( 1 << 31 );
|
|
signBit[1] = ( *(unsigned int *)&tmp[1] ) & ( 1 << 31 );
|
|
signBit[2] = ( *(unsigned int *)&tmp[2] ) & ( 1 << 31 );
|
|
signBit[3] = ( *(unsigned int *)&tmp[3] ) & ( 1 << 31 );
|
|
|
|
// first tangent
|
|
t0[0] = d0[0] * d9[0];
|
|
t0[1] = d0[1] * d9[1];
|
|
t0[2] = d0[2] * d9[2];
|
|
t0[3] = d0[3] * d9[3];
|
|
|
|
t0[0] -= d4[0] * d5[0];
|
|
t0[1] -= d4[1] * d5[1];
|
|
t0[2] -= d4[2] * d5[2];
|
|
t0[3] -= d4[3] * d5[3];
|
|
|
|
t1[0] = d1[0] * d9[0];
|
|
t1[1] = d1[1] * d9[1];
|
|
t1[2] = d1[2] * d9[2];
|
|
t1[3] = d1[3] * d9[3];
|
|
|
|
t1[0] -= d4[0] * d6[0];
|
|
t1[1] -= d4[1] * d6[1];
|
|
t1[2] -= d4[2] * d6[2];
|
|
t1[3] -= d4[3] * d6[3];
|
|
|
|
t2[0] = d2[0] * d9[0];
|
|
t2[1] = d2[1] * d9[1];
|
|
t2[2] = d2[2] * d9[2];
|
|
t2[3] = d2[3] * d9[3];
|
|
|
|
t2[0] -= d4[0] * d7[0];
|
|
t2[1] -= d4[1] * d7[1];
|
|
t2[2] -= d4[2] * d7[2];
|
|
t2[3] -= d4[3] * d7[3];
|
|
|
|
tmp[0] = t0[0] * t0[0];
|
|
tmp[1] = t0[1] * t0[1];
|
|
tmp[2] = t0[2] * t0[2];
|
|
tmp[3] = t0[3] * t0[3];
|
|
|
|
tmp[0] += t1[0] * t1[0];
|
|
tmp[1] += t1[1] * t1[1];
|
|
tmp[2] += t1[2] * t1[2];
|
|
tmp[3] += t1[3] * t1[3];
|
|
|
|
tmp[0] += t2[0] * t2[0];
|
|
tmp[1] += t2[1] * t2[1];
|
|
tmp[2] += t2[2] * t2[2];
|
|
tmp[3] += t2[3] * t2[3];
|
|
|
|
tmp[0] = idMath::RSqrt( tmp[0] );
|
|
tmp[1] = idMath::RSqrt( tmp[1] );
|
|
tmp[2] = idMath::RSqrt( tmp[2] );
|
|
tmp[3] = idMath::RSqrt( tmp[3] );
|
|
|
|
*(unsigned int *)&tmp[0] ^= signBit[0];
|
|
*(unsigned int *)&tmp[1] ^= signBit[1];
|
|
*(unsigned int *)&tmp[2] ^= signBit[2];
|
|
*(unsigned int *)&tmp[3] ^= signBit[3];
|
|
|
|
t0[0] *= tmp[0];
|
|
t0[1] *= tmp[1];
|
|
t0[2] *= tmp[2];
|
|
t0[3] *= tmp[3];
|
|
|
|
t1[0] *= tmp[0];
|
|
t1[1] *= tmp[1];
|
|
t1[2] *= tmp[2];
|
|
t1[3] *= tmp[3];
|
|
|
|
t2[0] *= tmp[0];
|
|
t2[1] *= tmp[1];
|
|
t2[2] *= tmp[2];
|
|
t2[3] *= tmp[3];
|
|
|
|
// second tangent
|
|
t3[0] = d3[0] * d5[0];
|
|
t3[1] = d3[1] * d5[1];
|
|
t3[2] = d3[2] * d5[2];
|
|
t3[3] = d3[3] * d5[3];
|
|
|
|
t3[0] -= d0[0] * d8[0];
|
|
t3[1] -= d0[1] * d8[1];
|
|
t3[2] -= d0[2] * d8[2];
|
|
t3[3] -= d0[3] * d8[3];
|
|
|
|
t4[0] = d3[0] * d6[0];
|
|
t4[1] = d3[1] * d6[1];
|
|
t4[2] = d3[2] * d6[2];
|
|
t4[3] = d3[3] * d6[3];
|
|
|
|
t4[0] -= d1[0] * d8[0];
|
|
t4[1] -= d1[1] * d8[1];
|
|
t4[2] -= d1[2] * d8[2];
|
|
t4[3] -= d1[3] * d8[3];
|
|
|
|
t5[0] = d3[0] * d7[0];
|
|
t5[1] = d3[1] * d7[1];
|
|
t5[2] = d3[2] * d7[2];
|
|
t5[3] = d3[3] * d7[3];
|
|
|
|
t5[0] -= d2[0] * d8[0];
|
|
t5[1] -= d2[1] * d8[1];
|
|
t5[2] -= d2[2] * d8[2];
|
|
t5[3] -= d2[3] * d8[3];
|
|
|
|
tmp[0] = t3[0] * t3[0];
|
|
tmp[1] = t3[1] * t3[1];
|
|
tmp[2] = t3[2] * t3[2];
|
|
tmp[3] = t3[3] * t3[3];
|
|
|
|
tmp[0] += t4[0] * t4[0];
|
|
tmp[1] += t4[1] * t4[1];
|
|
tmp[2] += t4[2] * t4[2];
|
|
tmp[3] += t4[3] * t4[3];
|
|
|
|
tmp[0] += t5[0] * t5[0];
|
|
tmp[1] += t5[1] * t5[1];
|
|
tmp[2] += t5[2] * t5[2];
|
|
tmp[3] += t5[3] * t5[3];
|
|
|
|
tmp[0] = idMath::RSqrt( tmp[0] );
|
|
tmp[1] = idMath::RSqrt( tmp[1] );
|
|
tmp[2] = idMath::RSqrt( tmp[2] );
|
|
tmp[3] = idMath::RSqrt( tmp[3] );
|
|
|
|
*(unsigned int *)&tmp[0] ^= signBit[0];
|
|
*(unsigned int *)&tmp[1] ^= signBit[1];
|
|
*(unsigned int *)&tmp[2] ^= signBit[2];
|
|
*(unsigned int *)&tmp[3] ^= signBit[3];
|
|
|
|
t3[0] *= tmp[0];
|
|
t3[1] *= tmp[1];
|
|
t3[2] *= tmp[2];
|
|
t3[3] *= tmp[3];
|
|
|
|
t4[0] *= tmp[0];
|
|
t4[1] *= tmp[1];
|
|
t4[2] *= tmp[2];
|
|
t4[3] *= tmp[3];
|
|
|
|
t5[0] *= tmp[0];
|
|
t5[1] *= tmp[1];
|
|
t5[2] *= tmp[2];
|
|
t5[3] *= tmp[3];
|
|
|
|
#endif
|
|
|
|
for ( int j = 0; j < 4; j++ ) {
|
|
|
|
const int v0 = indexes[i + j * 3 + 0];
|
|
const int v1 = indexes[i + j * 3 + 1];
|
|
const int v2 = indexes[i + j * 3 + 2];
|
|
|
|
a = verts + v0;
|
|
b = verts + v1;
|
|
c = verts + v2;
|
|
|
|
planes->Normal()[0] = n0[j];
|
|
planes->Normal()[1] = n1[j];
|
|
planes->Normal()[2] = n2[j];
|
|
planes->FitThroughPoint( a->xyz );
|
|
planes++;
|
|
|
|
if ( used[v0] ) {
|
|
a->normal[0] += n0[j];
|
|
a->normal[1] += n1[j];
|
|
a->normal[2] += n2[j];
|
|
|
|
a->tangents[0][0] += t0[j];
|
|
a->tangents[0][1] += t1[j];
|
|
a->tangents[0][2] += t2[j];
|
|
|
|
a->tangents[1][0] += t3[j];
|
|
a->tangents[1][1] += t4[j];
|
|
a->tangents[1][2] += t5[j];
|
|
} else {
|
|
a->normal[0] = n0[j];
|
|
a->normal[1] = n1[j];
|
|
a->normal[2] = n2[j];
|
|
|
|
a->tangents[0][0] = t0[j];
|
|
a->tangents[0][1] = t1[j];
|
|
a->tangents[0][2] = t2[j];
|
|
|
|
a->tangents[1][0] = t3[j];
|
|
a->tangents[1][1] = t4[j];
|
|
a->tangents[1][2] = t5[j];
|
|
|
|
used[v0] = true;
|
|
}
|
|
|
|
if ( used[v1] ) {
|
|
b->normal[0] += n0[j];
|
|
b->normal[1] += n1[j];
|
|
b->normal[2] += n2[j];
|
|
|
|
b->tangents[0][0] += t0[j];
|
|
b->tangents[0][1] += t1[j];
|
|
b->tangents[0][2] += t2[j];
|
|
|
|
b->tangents[1][0] += t3[j];
|
|
b->tangents[1][1] += t4[j];
|
|
b->tangents[1][2] += t5[j];
|
|
} else {
|
|
b->normal[0] = n0[j];
|
|
b->normal[1] = n1[j];
|
|
b->normal[2] = n2[j];
|
|
|
|
b->tangents[0][0] = t0[j];
|
|
b->tangents[0][1] = t1[j];
|
|
b->tangents[0][2] = t2[j];
|
|
|
|
b->tangents[1][0] = t3[j];
|
|
b->tangents[1][1] = t4[j];
|
|
b->tangents[1][2] = t5[j];
|
|
|
|
used[v1] = true;
|
|
}
|
|
|
|
if ( used[v2] ) {
|
|
c->normal[0] += n0[j];
|
|
c->normal[1] += n1[j];
|
|
c->normal[2] += n2[j];
|
|
|
|
c->tangents[0][0] += t0[j];
|
|
c->tangents[0][1] += t1[j];
|
|
c->tangents[0][2] += t2[j];
|
|
|
|
c->tangents[1][0] += t3[j];
|
|
c->tangents[1][1] += t4[j];
|
|
c->tangents[1][2] += t5[j];
|
|
} else {
|
|
c->normal[0] = n0[j];
|
|
c->normal[1] = n1[j];
|
|
c->normal[2] = n2[j];
|
|
|
|
c->tangents[0][0] = t0[j];
|
|
c->tangents[0][1] = t1[j];
|
|
c->tangents[0][2] = t2[j];
|
|
|
|
c->tangents[1][0] = t3[j];
|
|
c->tangents[1][1] = t4[j];
|
|
c->tangents[1][2] = t5[j];
|
|
|
|
used[v2] = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
for ( ; i < numIndexes; i += 3 ) {
|
|
idDrawVert *a, *b, *c;
|
|
ALIGN16( unsigned int signBit[4] );
|
|
float d0, d1, d2, d3, d4;
|
|
float d5, d6, d7, d8, d9;
|
|
float n0, n1, n2;
|
|
float t0, t1, t2;
|
|
float t3, t4, t5;
|
|
|
|
const int v0 = indexes[i + 0];
|
|
const int v1 = indexes[i + 1];
|
|
const int v2 = indexes[i + 2];
|
|
|
|
a = verts + v0;
|
|
b = verts + v1;
|
|
c = verts + v2;
|
|
|
|
d0 = b->xyz[0] - a->xyz[0];
|
|
d1 = b->xyz[1] - a->xyz[1];
|
|
d2 = b->xyz[2] - a->xyz[2];
|
|
d3 = b->st[0] - a->st[0];
|
|
d4 = b->st[1] - a->st[1];
|
|
|
|
d5 = c->xyz[0] - a->xyz[0];
|
|
d6 = c->xyz[1] - a->xyz[1];
|
|
d7 = c->xyz[2] - a->xyz[2];
|
|
d8 = c->st[0] - a->st[0];
|
|
d9 = c->st[1] - a->st[1];
|
|
|
|
#if 1
|
|
|
|
__asm {
|
|
// normal
|
|
movss xmm0, d6
|
|
mulss xmm0, d2
|
|
movss xmm1, d7
|
|
mulss xmm1, d1
|
|
subss xmm0, xmm1
|
|
|
|
movss xmm1, d7
|
|
mulss xmm1, d0
|
|
movss xmm2, d5
|
|
mulss xmm2, d2
|
|
subss xmm1, xmm2
|
|
|
|
movss xmm2, d5
|
|
mulss xmm2, d1
|
|
movss xmm3, d6
|
|
mulss xmm3, d0
|
|
subss xmm2, xmm3
|
|
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, xmm3
|
|
mulss xmm4, xmm4
|
|
mulss xmm5, xmm5
|
|
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
#ifdef FIX_DEGENERATE_TANGENT
|
|
xorps xmm4, xmm4
|
|
cmpeqps xmm4, xmm3
|
|
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
|
|
andps xmm3, SIMD_SP_absMask // make sure the values are positive
|
|
orps xmm3, xmm4
|
|
#endif
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtss xmm4, xmm3
|
|
mulss xmm3, xmm4
|
|
mulss xmm3, xmm4
|
|
subss xmm3, xmm6
|
|
mulss xmm4, xmm7
|
|
mulss xmm3, xmm4
|
|
#else
|
|
rsqrtss xmm3, xmm3
|
|
#endif
|
|
mulss xmm0, xmm3
|
|
movss n0, xmm0
|
|
mulss xmm1, xmm3
|
|
movss n1, xmm1
|
|
mulss xmm2, xmm3
|
|
movss n2, xmm2
|
|
|
|
// area sign bit
|
|
movss xmm0, d3
|
|
mulss xmm0, d9
|
|
movss xmm1, d4
|
|
mulss xmm1, d8
|
|
subss xmm0, xmm1
|
|
andps xmm0, SIMD_SP_signBitMask
|
|
movaps signBit, xmm0
|
|
|
|
// first tangent
|
|
movss xmm0, d0
|
|
mulss xmm0, d9
|
|
movss xmm1, d4
|
|
mulss xmm1, d5
|
|
subss xmm0, xmm1
|
|
|
|
movss xmm1, d1
|
|
mulss xmm1, d9
|
|
movss xmm2, d4
|
|
mulss xmm2, d6
|
|
subss xmm1, xmm2
|
|
|
|
movss xmm2, d2
|
|
mulss xmm2, d9
|
|
movss xmm3, d4
|
|
mulss xmm3, d7
|
|
subss xmm2, xmm3
|
|
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, xmm3
|
|
mulss xmm4, xmm4
|
|
mulss xmm5, xmm5
|
|
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
#ifdef FIX_DEGENERATE_TANGENT
|
|
xorps xmm4, xmm4
|
|
cmpeqps xmm4, xmm3
|
|
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
|
|
andps xmm3, SIMD_SP_absMask // make sure the values are positive
|
|
orps xmm3, xmm4
|
|
#endif
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtss xmm4, xmm3
|
|
mulss xmm3, xmm4
|
|
mulss xmm3, xmm4
|
|
subss xmm3, xmm6
|
|
mulss xmm4, xmm7
|
|
mulss xmm3, xmm4
|
|
#else
|
|
rsqrtss xmm3, xmm3
|
|
#endif
|
|
xorps xmm3, signBit
|
|
|
|
mulss xmm0, xmm3
|
|
movss t0, xmm0
|
|
mulss xmm1, xmm3
|
|
movss t1, xmm1
|
|
mulss xmm2, xmm3
|
|
movss t2, xmm2
|
|
|
|
// second tangent
|
|
movss xmm0, d3
|
|
mulss xmm0, d5
|
|
movss xmm1, d0
|
|
mulss xmm1, d8
|
|
subss xmm0, xmm1
|
|
|
|
movss xmm1, d3
|
|
mulss xmm1, d6
|
|
movss xmm2, d1
|
|
mulss xmm2, d8
|
|
subss xmm1, xmm2
|
|
|
|
movss xmm2, d3
|
|
mulss xmm2, d7
|
|
movss xmm3, d2
|
|
mulss xmm3, d8
|
|
subss xmm2, xmm3
|
|
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, xmm3
|
|
mulss xmm4, xmm4
|
|
mulss xmm5, xmm5
|
|
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
#ifdef FIX_DEGENERATE_TANGENT
|
|
xorps xmm4, xmm4
|
|
cmpeqps xmm4, xmm3
|
|
andps xmm4, SIMD_SP_tiny // if values are zero replace them with a tiny number
|
|
andps xmm3, SIMD_SP_absMask // make sure the values are positive
|
|
orps xmm3, xmm4
|
|
#endif
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtss xmm4, xmm3
|
|
mulss xmm3, xmm4
|
|
mulss xmm3, xmm4
|
|
subss xmm3, xmm6
|
|
mulss xmm4, xmm7
|
|
mulss xmm3, xmm4
|
|
#else
|
|
rsqrtss xmm3, xmm3
|
|
#endif
|
|
xorps xmm3, signBit
|
|
|
|
mulss xmm0, xmm3
|
|
movss t3, xmm0
|
|
mulss xmm1, xmm3
|
|
movss t4, xmm1
|
|
mulss xmm2, xmm3
|
|
movss t5, xmm2
|
|
}
|
|
|
|
#else
|
|
|
|
float tmp;
|
|
|
|
// normal
|
|
n0 = d6 * d2 - d7 * d1;
|
|
n1 = d7 * d0 - d5 * d2;
|
|
n2 = d5 * d1 - d6 * d0;
|
|
|
|
tmp = idMath::RSqrt( n0 * n0 + n1 * n1 + n2 * n2 );
|
|
|
|
n0 *= tmp;
|
|
n1 *= tmp;
|
|
n2 *= tmp;
|
|
|
|
// area sign bit
|
|
tmp = d3 * d9 - d4 * d8;
|
|
signBit[0] = ( *(unsigned int *)&tmp ) & ( 1 << 31 );
|
|
|
|
// first tangent
|
|
t0 = d0 * d9 - d4 * d5;
|
|
t1 = d1 * d9 - d4 * d6;
|
|
t2 = d2 * d9 - d4 * d7;
|
|
|
|
tmp = idMath::RSqrt( t0 * t0 + t1 * t1 + t2 * t2 );
|
|
*(unsigned int *)&tmp ^= signBit[0];
|
|
|
|
t0 *= tmp;
|
|
t1 *= tmp;
|
|
t2 *= tmp;
|
|
|
|
// second tangent
|
|
t3 = d3 * d5 - d0 * d8;
|
|
t4 = d3 * d6 - d1 * d8;
|
|
t5 = d3 * d7 - d2 * d8;
|
|
|
|
tmp = idMath::RSqrt( t3 * t3 + t4 * t4 + t5 * t5 );
|
|
*(unsigned int *)&tmp ^= signBit[0];
|
|
|
|
t3 *= tmp;
|
|
t4 *= tmp;
|
|
t5 *= tmp;
|
|
|
|
#endif
|
|
|
|
planes->Normal()[0] = n0;
|
|
planes->Normal()[1] = n1;
|
|
planes->Normal()[2] = n2;
|
|
planes->FitThroughPoint( a->xyz );
|
|
planes++;
|
|
|
|
if ( used[v0] ) {
|
|
a->normal[0] += n0;
|
|
a->normal[1] += n1;
|
|
a->normal[2] += n2;
|
|
|
|
a->tangents[0][0] += t0;
|
|
a->tangents[0][1] += t1;
|
|
a->tangents[0][2] += t2;
|
|
|
|
a->tangents[1][0] += t3;
|
|
a->tangents[1][1] += t4;
|
|
a->tangents[1][2] += t5;
|
|
} else {
|
|
a->normal[0] = n0;
|
|
a->normal[1] = n1;
|
|
a->normal[2] = n2;
|
|
|
|
a->tangents[0][0] = t0;
|
|
a->tangents[0][1] = t1;
|
|
a->tangents[0][2] = t2;
|
|
|
|
a->tangents[1][0] = t3;
|
|
a->tangents[1][1] = t4;
|
|
a->tangents[1][2] = t5;
|
|
|
|
used[v0] = true;
|
|
}
|
|
|
|
if ( used[v1] ) {
|
|
b->normal[0] += n0;
|
|
b->normal[1] += n1;
|
|
b->normal[2] += n2;
|
|
|
|
b->tangents[0][0] += t0;
|
|
b->tangents[0][1] += t1;
|
|
b->tangents[0][2] += t2;
|
|
|
|
b->tangents[1][0] += t3;
|
|
b->tangents[1][1] += t4;
|
|
b->tangents[1][2] += t5;
|
|
} else {
|
|
b->normal[0] = n0;
|
|
b->normal[1] = n1;
|
|
b->normal[2] = n2;
|
|
|
|
b->tangents[0][0] = t0;
|
|
b->tangents[0][1] = t1;
|
|
b->tangents[0][2] = t2;
|
|
|
|
b->tangents[1][0] = t3;
|
|
b->tangents[1][1] = t4;
|
|
b->tangents[1][2] = t5;
|
|
|
|
used[v1] = true;
|
|
}
|
|
|
|
if ( used[v2] ) {
|
|
c->normal[0] += n0;
|
|
c->normal[1] += n1;
|
|
c->normal[2] += n2;
|
|
|
|
c->tangents[0][0] += t0;
|
|
c->tangents[0][1] += t1;
|
|
c->tangents[0][2] += t2;
|
|
|
|
c->tangents[1][0] += t3;
|
|
c->tangents[1][1] += t4;
|
|
c->tangents[1][2] += t5;
|
|
} else {
|
|
c->normal[0] = n0;
|
|
c->normal[1] = n1;
|
|
c->normal[2] = n2;
|
|
|
|
c->tangents[0][0] = t0;
|
|
c->tangents[0][1] = t1;
|
|
c->tangents[0][2] = t2;
|
|
|
|
c->tangents[1][0] = t3;
|
|
c->tangents[1][1] = t4;
|
|
c->tangents[1][2] = t5;
|
|
|
|
used[v2] = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::DeriveUnsmoothedTangents
|
|
============
|
|
*/
|
|
#define DERIVE_UNSMOOTHED_BITANGENT
|
|
|
|
void VPCALL idSIMD_SSE::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
|
|
int i, j;
|
|
|
|
for ( i = 0; i <= numVerts - 4; i += 4 ) {
|
|
ALIGN16( float s0[4] );
|
|
ALIGN16( float s1[4] );
|
|
ALIGN16( float s2[4] );
|
|
ALIGN16( float d0[4] );
|
|
ALIGN16( float d1[4] );
|
|
ALIGN16( float d2[4] );
|
|
ALIGN16( float d3[4] );
|
|
ALIGN16( float d4[4] );
|
|
ALIGN16( float d5[4] );
|
|
ALIGN16( float d6[4] );
|
|
ALIGN16( float d7[4] );
|
|
ALIGN16( float d8[4] );
|
|
ALIGN16( float d9[4] );
|
|
ALIGN16( float n0[4] );
|
|
ALIGN16( float n1[4] );
|
|
ALIGN16( float n2[4] );
|
|
ALIGN16( float t0[4] );
|
|
ALIGN16( float t1[4] );
|
|
ALIGN16( float t2[4] );
|
|
ALIGN16( float t3[4] );
|
|
ALIGN16( float t4[4] );
|
|
ALIGN16( float t5[4] );
|
|
|
|
for ( j = 0; j < 4; j++ ) {
|
|
const idDrawVert *a, *b, *c;
|
|
|
|
const dominantTri_s &dt = dominantTris[i+j];
|
|
|
|
s0[j] = dt.normalizationScale[0];
|
|
s1[j] = dt.normalizationScale[1];
|
|
s2[j] = dt.normalizationScale[2];
|
|
|
|
a = verts + i + j;
|
|
b = verts + dt.v2;
|
|
c = verts + dt.v3;
|
|
|
|
d0[j] = b->xyz[0] - a->xyz[0];
|
|
d1[j] = b->xyz[1] - a->xyz[1];
|
|
d2[j] = b->xyz[2] - a->xyz[2];
|
|
d3[j] = b->st[0] - a->st[0];
|
|
d4[j] = b->st[1] - a->st[1];
|
|
|
|
d5[j] = c->xyz[0] - a->xyz[0];
|
|
d6[j] = c->xyz[1] - a->xyz[1];
|
|
d7[j] = c->xyz[2] - a->xyz[2];
|
|
d8[j] = c->st[0] - a->st[0];
|
|
d9[j] = c->st[1] - a->st[1];
|
|
}
|
|
|
|
#if 1
|
|
|
|
__asm {
|
|
|
|
movaps xmm0, d6
|
|
mulps xmm0, d2
|
|
movaps xmm1, d7
|
|
mulps xmm1, d1
|
|
|
|
movaps xmm2, d7
|
|
mulps xmm2, d0
|
|
movaps xmm3, d5
|
|
mulps xmm3, d2
|
|
|
|
movaps xmm4, d5
|
|
mulps xmm4, d1
|
|
movaps xmm5, d6
|
|
mulps xmm5, d0
|
|
|
|
subps xmm0, xmm1
|
|
subps xmm2, xmm3
|
|
movaps xmm7, s2
|
|
subps xmm4, xmm5
|
|
|
|
mulps xmm0, xmm7
|
|
movaps n0, xmm0
|
|
mulps xmm2, xmm7
|
|
movaps n1, xmm2
|
|
mulps xmm4, xmm7
|
|
movaps n2, xmm4
|
|
|
|
movaps xmm0, d0
|
|
mulps xmm0, d9
|
|
movaps xmm1, d4
|
|
mulps xmm1, d5
|
|
|
|
movaps xmm2, d1
|
|
mulps xmm2, d9
|
|
movaps xmm3, d4
|
|
mulps xmm3, d6
|
|
|
|
movaps xmm4, d2
|
|
mulps xmm4, d9
|
|
movaps xmm5, d4
|
|
mulps xmm5, d7
|
|
|
|
subps xmm0, xmm1
|
|
subps xmm2, xmm3
|
|
movaps xmm7, s0
|
|
subps xmm4, xmm5
|
|
|
|
mulps xmm0, xmm7
|
|
movaps t0, xmm0
|
|
mulps xmm2, xmm7
|
|
movaps t1, xmm2
|
|
mulps xmm4, xmm7
|
|
movaps t2, xmm4
|
|
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
movaps xmm0, d3
|
|
mulps xmm0, d5
|
|
movaps xmm1, d0
|
|
mulps xmm1, d8
|
|
|
|
movaps xmm2, d3
|
|
mulps xmm2, d6
|
|
movaps xmm3, d1
|
|
mulps xmm3, d8
|
|
|
|
movaps xmm4, d3
|
|
mulps xmm4, d7
|
|
movaps xmm5, d2
|
|
mulps xmm5, d8
|
|
#else
|
|
movaps xmm0, n2
|
|
mulps xmm0, t1
|
|
movaps xmm1, n1
|
|
mulps xmm1, t2
|
|
|
|
movaps xmm2, n0
|
|
mulps xmm2, t2
|
|
movaps xmm3, n2
|
|
mulps xmm3, t0
|
|
|
|
movaps xmm4, n1
|
|
mulps xmm4, t0
|
|
movaps xmm5, n0
|
|
mulps xmm5, t1
|
|
#endif
|
|
subps xmm0, xmm1
|
|
subps xmm2, xmm3
|
|
movaps xmm7, s1
|
|
subps xmm4, xmm5
|
|
|
|
mulps xmm0, xmm7
|
|
movaps t3, xmm0
|
|
mulps xmm2, xmm7
|
|
movaps t4, xmm2
|
|
mulps xmm4, xmm7
|
|
movaps t5, xmm4
|
|
}
|
|
|
|
#else
|
|
|
|
n0[0] = d6[0] * d2[0];
|
|
n0[1] = d6[1] * d2[1];
|
|
n0[2] = d6[2] * d2[2];
|
|
n0[3] = d6[3] * d2[3];
|
|
|
|
n1[0] = d7[0] * d0[0];
|
|
n1[1] = d7[1] * d0[1];
|
|
n1[2] = d7[2] * d0[2];
|
|
n1[3] = d7[3] * d0[3];
|
|
|
|
n2[0] = d5[0] * d1[0];
|
|
n2[1] = d5[1] * d1[1];
|
|
n2[2] = d5[2] * d1[2];
|
|
n2[3] = d5[3] * d1[3];
|
|
|
|
n0[0] -= d7[0] * d1[0];
|
|
n0[1] -= d7[1] * d1[1];
|
|
n0[2] -= d7[2] * d1[2];
|
|
n0[3] -= d7[3] * d1[3];
|
|
|
|
n1[0] -= d5[0] * d2[0];
|
|
n1[1] -= d5[1] * d2[1];
|
|
n1[2] -= d5[2] * d2[2];
|
|
n1[3] -= d5[3] * d2[3];
|
|
|
|
n2[0] -= d6[0] * d0[0];
|
|
n2[1] -= d6[1] * d0[1];
|
|
n2[2] -= d6[2] * d0[2];
|
|
n2[3] -= d6[3] * d0[3];
|
|
|
|
n0[0] *= s2[0];
|
|
n0[1] *= s2[1];
|
|
n0[2] *= s2[2];
|
|
n0[3] *= s2[3];
|
|
|
|
n1[0] *= s2[0];
|
|
n1[1] *= s2[1];
|
|
n1[2] *= s2[2];
|
|
n1[3] *= s2[3];
|
|
|
|
n2[0] *= s2[0];
|
|
n2[1] *= s2[1];
|
|
n2[2] *= s2[2];
|
|
n2[3] *= s2[3];
|
|
|
|
t0[0] = d0[0] * d9[0];
|
|
t0[1] = d0[1] * d9[1];
|
|
t0[2] = d0[2] * d9[2];
|
|
t0[3] = d0[3] * d9[3];
|
|
|
|
t1[0] = d1[0] * d9[0];
|
|
t1[1] = d1[1] * d9[1];
|
|
t1[2] = d1[2] * d9[2];
|
|
t1[3] = d1[3] * d9[3];
|
|
|
|
t2[0] = d2[0] * d9[0];
|
|
t2[1] = d2[1] * d9[1];
|
|
t2[2] = d2[2] * d9[2];
|
|
t2[3] = d2[3] * d9[3];
|
|
|
|
t0[0] -= d4[0] * d5[0];
|
|
t0[1] -= d4[1] * d5[1];
|
|
t0[2] -= d4[2] * d5[2];
|
|
t0[3] -= d4[3] * d5[3];
|
|
|
|
t1[0] -= d4[0] * d6[0];
|
|
t1[1] -= d4[1] * d6[1];
|
|
t1[2] -= d4[2] * d6[2];
|
|
t1[3] -= d4[3] * d6[3];
|
|
|
|
t2[0] -= d4[0] * d7[0];
|
|
t2[1] -= d4[1] * d7[1];
|
|
t2[2] -= d4[2] * d7[2];
|
|
t2[3] -= d4[3] * d7[3];
|
|
|
|
t0[0] *= s0[0];
|
|
t0[1] *= s0[1];
|
|
t0[2] *= s0[2];
|
|
t0[3] *= s0[3];
|
|
|
|
t1[0] *= s0[0];
|
|
t1[1] *= s0[1];
|
|
t1[2] *= s0[2];
|
|
t1[3] *= s0[3];
|
|
|
|
t2[0] *= s0[0];
|
|
t2[1] *= s0[1];
|
|
t2[2] *= s0[2];
|
|
t2[3] *= s0[3];
|
|
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
t3[0] = d3[0] * d5[0];
|
|
t3[1] = d3[1] * d5[1];
|
|
t3[2] = d3[2] * d5[2];
|
|
t3[3] = d3[3] * d5[3];
|
|
|
|
t4[0] = d3[0] * d6[0];
|
|
t4[1] = d3[1] * d6[1];
|
|
t4[2] = d3[2] * d6[2];
|
|
t4[3] = d3[3] * d6[3];
|
|
|
|
t5[0] = d3[0] * d7[0];
|
|
t5[1] = d3[1] * d7[1];
|
|
t5[2] = d3[2] * d7[2];
|
|
t5[3] = d3[3] * d7[3];
|
|
|
|
t3[0] -= d0[0] * d8[0];
|
|
t3[1] -= d0[1] * d8[1];
|
|
t3[2] -= d0[2] * d8[2];
|
|
t3[3] -= d0[3] * d8[3];
|
|
|
|
t4[0] -= d1[0] * d8[0];
|
|
t4[1] -= d1[1] * d8[1];
|
|
t4[2] -= d1[2] * d8[2];
|
|
t4[3] -= d1[3] * d8[3];
|
|
|
|
t5[0] -= d2[0] * d8[0];
|
|
t5[1] -= d2[1] * d8[1];
|
|
t5[2] -= d2[2] * d8[2];
|
|
t5[3] -= d2[3] * d8[3];
|
|
#else
|
|
t3[0] = n2[0] * t1[0];
|
|
t3[1] = n2[1] * t1[1];
|
|
t3[2] = n2[2] * t1[2];
|
|
t3[3] = n2[3] * t1[3];
|
|
|
|
t4[0] = n0[0] * t2[0];
|
|
t4[1] = n0[1] * t2[1];
|
|
t4[2] = n0[2] * t2[2];
|
|
t4[3] = n0[3] * t2[3];
|
|
|
|
t5[0] = n1[0] * t0[0];
|
|
t5[1] = n1[1] * t0[1];
|
|
t5[2] = n1[2] * t0[2];
|
|
t5[3] = n1[3] * t0[3];
|
|
|
|
t3[0] -= n1[0] * t2[0];
|
|
t3[1] -= n1[1] * t2[1];
|
|
t3[2] -= n1[2] * t2[2];
|
|
t3[3] -= n1[3] * t2[3];
|
|
|
|
t4[0] -= n2[0] * t0[0];
|
|
t4[1] -= n2[1] * t0[1];
|
|
t4[2] -= n2[2] * t0[2];
|
|
t4[3] -= n2[3] * t0[3];
|
|
|
|
t5[0] -= n0[0] * t1[0];
|
|
t5[1] -= n0[1] * t1[1];
|
|
t5[2] -= n0[2] * t1[2];
|
|
t5[3] -= n0[3] * t1[3];
|
|
#endif
|
|
t3[0] *= s1[0];
|
|
t3[1] *= s1[1];
|
|
t3[2] *= s1[2];
|
|
t3[3] *= s1[3];
|
|
|
|
t4[0] *= s1[0];
|
|
t4[1] *= s1[1];
|
|
t4[2] *= s1[2];
|
|
t4[3] *= s1[3];
|
|
|
|
t5[0] *= s1[0];
|
|
t5[1] *= s1[1];
|
|
t5[2] *= s1[2];
|
|
t5[3] *= s1[3];
|
|
|
|
#endif
|
|
|
|
for ( j = 0; j < 4; j++ ) {
|
|
idDrawVert *a;
|
|
|
|
a = verts + i + j;
|
|
|
|
a->normal[0] = n0[j];
|
|
a->normal[1] = n1[j];
|
|
a->normal[2] = n2[j];
|
|
|
|
a->tangents[0][0] = t0[j];
|
|
a->tangents[0][1] = t1[j];
|
|
a->tangents[0][2] = t2[j];
|
|
|
|
a->tangents[1][0] = t3[j];
|
|
a->tangents[1][1] = t4[j];
|
|
a->tangents[1][2] = t5[j];
|
|
}
|
|
}
|
|
|
|
for ( ; i < numVerts; i++ ) {
|
|
idDrawVert *a, *b, *c;
|
|
float d0, d1, d2, d3, d4;
|
|
float d5, d6, d7, d8, d9;
|
|
float s0, s1, s2;
|
|
float n0, n1, n2;
|
|
float t0, t1, t2;
|
|
float t3, t4, t5;
|
|
|
|
const dominantTri_s &dt = dominantTris[i];
|
|
|
|
s0 = dt.normalizationScale[0];
|
|
s1 = dt.normalizationScale[1];
|
|
s2 = dt.normalizationScale[2];
|
|
|
|
a = verts + i;
|
|
b = verts + dt.v2;
|
|
c = verts + dt.v3;
|
|
|
|
d0 = b->xyz[0] - a->xyz[0];
|
|
d1 = b->xyz[1] - a->xyz[1];
|
|
d2 = b->xyz[2] - a->xyz[2];
|
|
d3 = b->st[0] - a->st[0];
|
|
d4 = b->st[1] - a->st[1];
|
|
|
|
d5 = c->xyz[0] - a->xyz[0];
|
|
d6 = c->xyz[1] - a->xyz[1];
|
|
d7 = c->xyz[2] - a->xyz[2];
|
|
d8 = c->st[0] - a->st[0];
|
|
d9 = c->st[1] - a->st[1];
|
|
|
|
#if 1
|
|
|
|
__asm {
|
|
|
|
movss xmm0, d6
|
|
mulss xmm0, d2
|
|
movss xmm1, d7
|
|
mulss xmm1, d1
|
|
|
|
movss xmm2, d7
|
|
mulss xmm2, d0
|
|
movss xmm3, d5
|
|
mulss xmm3, d2
|
|
|
|
movss xmm4, d5
|
|
mulss xmm4, d1
|
|
movss xmm5, d6
|
|
mulss xmm5, d0
|
|
|
|
subss xmm0, xmm1
|
|
subss xmm2, xmm3
|
|
movss xmm7, s2
|
|
subss xmm4, xmm5
|
|
|
|
mulss xmm0, xmm7
|
|
movss n0, xmm0
|
|
mulss xmm2, xmm7
|
|
movss n1, xmm2
|
|
mulss xmm4, xmm7
|
|
movss n2, xmm4
|
|
|
|
movss xmm0, d0
|
|
mulss xmm0, d9
|
|
movss xmm1, d4
|
|
mulss xmm1, d5
|
|
|
|
movss xmm2, d1
|
|
mulss xmm2, d9
|
|
movss xmm3, d4
|
|
mulss xmm3, d6
|
|
|
|
movss xmm4, d2
|
|
mulss xmm4, d9
|
|
movss xmm5, d4
|
|
mulss xmm5, d7
|
|
|
|
subss xmm0, xmm1
|
|
subss xmm2, xmm3
|
|
movss xmm7, s0
|
|
subss xmm4, xmm5
|
|
|
|
mulss xmm0, xmm7
|
|
movss t0, xmm0
|
|
mulss xmm2, xmm7
|
|
movss t1, xmm2
|
|
mulss xmm4, xmm7
|
|
movss t2, xmm4
|
|
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
movss xmm0, d3
|
|
mulss xmm0, d5
|
|
movss xmm1, d0
|
|
mulss xmm1, d8
|
|
|
|
movss xmm2, d3
|
|
mulss xmm2, d6
|
|
movss xmm3, d1
|
|
mulss xmm3, d8
|
|
|
|
movss xmm4, d3
|
|
mulss xmm4, d7
|
|
movss xmm5, d2
|
|
mulss xmm5, d8
|
|
#else
|
|
movss xmm0, n2
|
|
mulss xmm0, t1
|
|
movss xmm1, n1
|
|
mulss xmm1, t2
|
|
|
|
movss xmm2, n0
|
|
mulss xmm2, t2
|
|
movss xmm3, n2
|
|
mulss xmm3, t0
|
|
|
|
movss xmm4, n1
|
|
mulss xmm4, t0
|
|
movss xmm5, n0
|
|
mulss xmm5, t1
|
|
#endif
|
|
subss xmm0, xmm1
|
|
subss xmm2, xmm3
|
|
movss xmm7, s1
|
|
subss xmm4, xmm5
|
|
|
|
mulss xmm0, xmm7
|
|
movss t3, xmm0
|
|
mulss xmm2, xmm7
|
|
movss t4, xmm2
|
|
mulss xmm4, xmm7
|
|
movss t5, xmm4
|
|
}
|
|
|
|
#else
|
|
|
|
n0 = s2 * ( d6 * d2 - d7 * d1 );
|
|
n1 = s2 * ( d7 * d0 - d5 * d2 );
|
|
n2 = s2 * ( d5 * d1 - d6 * d0 );
|
|
|
|
t0 = s0 * ( d0 * d9 - d4 * d5 );
|
|
t1 = s0 * ( d1 * d9 - d4 * d6 );
|
|
t2 = s0 * ( d2 * d9 - d4 * d7 );
|
|
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
t3 = s1 * ( d3 * d5 - d0 * d8 );
|
|
t4 = s1 * ( d3 * d6 - d1 * d8 );
|
|
t5 = s1 * ( d3 * d7 - d2 * d8 );
|
|
#else
|
|
t3 = s1 * ( n2 * t1 - n1 * t2 );
|
|
t4 = s1 * ( n0 * t2 - n2 * t0 );
|
|
t5 = s1 * ( n1 * t0 - n0 * t1 );
|
|
#endif
|
|
|
|
#endif
|
|
|
|
a->normal[0] = n0;
|
|
a->normal[1] = n1;
|
|
a->normal[2] = n2;
|
|
|
|
a->tangents[0][0] = t0;
|
|
a->tangents[0][1] = t1;
|
|
a->tangents[0][2] = t2;
|
|
|
|
a->tangents[1][0] = t3;
|
|
a->tangents[1][1] = t4;
|
|
a->tangents[1][2] = t5;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::NormalizeTangents
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
|
|
ALIGN16( float normal[12] );
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
|
|
|
|
assert( verts != NULL );
|
|
assert( numVerts >= 0 );
|
|
|
|
__asm {
|
|
mov eax, numVerts
|
|
test eax, eax
|
|
jz done
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
movaps xmm6, SIMD_SP_rsqrt_c0
|
|
movaps xmm7, SIMD_SP_rsqrt_c1
|
|
#endif
|
|
mov esi, verts
|
|
imul eax, DRAWVERT_SIZE
|
|
add esi, eax
|
|
neg eax
|
|
add eax, DRAWVERT_SIZE*4
|
|
jle loopVert4
|
|
|
|
sub eax, DRAWVERT_SIZE*4
|
|
jl loopVert1
|
|
|
|
loopVert4:
|
|
|
|
sub eax, DRAWVERT_SIZE*4
|
|
|
|
// normalize 4 idDrawVert::normal
|
|
|
|
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0] // 0, X, X, X
|
|
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0] // 0, X, 3, 4
|
|
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8] // 5, X, X, X
|
|
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4] // 5, X, 1, 2
|
|
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0] // 6, X, X, X
|
|
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0] // 6, X, 9, 10
|
|
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8] // 11, X, X, X
|
|
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4] // 11, X, 7, 8
|
|
|
|
movaps xmm1, xmm0
|
|
movaps xmm5, xmm2
|
|
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
|
|
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
|
|
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, xmm3
|
|
mulps xmm4, xmm4
|
|
mulps xmm5, xmm5
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtps xmm4, xmm3
|
|
mulps xmm3, xmm4
|
|
mulps xmm3, xmm4
|
|
subps xmm3, xmm6
|
|
mulps xmm4, xmm7
|
|
mulps xmm3, xmm4
|
|
#else
|
|
rsqrtps xmm3, xmm3
|
|
#endif
|
|
|
|
mulps xmm0, xmm3
|
|
mulps xmm1, xmm3
|
|
mulps xmm2, xmm3
|
|
|
|
// save the 4 idDrawVert::normal to project the tangents
|
|
|
|
movaps [normal+ 0], xmm0
|
|
movaps [normal+16], xmm1
|
|
movaps [normal+32], xmm2
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_NORMAL_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_NORMAL_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_NORMAL_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_NORMAL_OFFSET+8], xmm2
|
|
|
|
// project and normalize 4 idDrawVert::tangent[0]
|
|
|
|
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, X, X
|
|
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0] // 0, X, 3, 4
|
|
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8] // 5, X, X, X
|
|
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4] // 5, X, 1, 2
|
|
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, X, X
|
|
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0] // 6, X, 9, 10
|
|
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8] // 11, X, X, X
|
|
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4] // 11, X, 7, 8
|
|
|
|
movaps xmm1, xmm0
|
|
movaps xmm5, xmm2
|
|
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
|
|
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
|
|
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, [normal+ 0]
|
|
mulps xmm4, [normal+16]
|
|
mulps xmm5, [normal+32]
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
movaps xmm4, xmm3
|
|
movaps xmm5, xmm3
|
|
mulps xmm3, [normal+ 0]
|
|
mulps xmm4, [normal+16]
|
|
mulps xmm5, [normal+32]
|
|
subps xmm0, xmm3
|
|
subps xmm1, xmm4
|
|
subps xmm2, xmm5
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, xmm3
|
|
mulps xmm4, xmm4
|
|
mulps xmm5, xmm5
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtps xmm4, xmm3
|
|
mulps xmm3, xmm4
|
|
mulps xmm3, xmm4
|
|
subps xmm3, xmm6
|
|
mulps xmm4, xmm7
|
|
mulps xmm3, xmm4
|
|
#else
|
|
rsqrtps xmm3, xmm3
|
|
#endif
|
|
|
|
mulps xmm0, xmm3
|
|
mulps xmm1, xmm3
|
|
mulps xmm2, xmm3
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT0_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT0_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT0_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT0_OFFSET+8], xmm2
|
|
|
|
// project and normalize 4 idDrawVert::tangent[1]
|
|
|
|
movss xmm0, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, X, X
|
|
movhps xmm0, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0] // 0, X, 3, 4
|
|
movss xmm2, [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8] // 5, X, X, X
|
|
movhps xmm2, [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4] // 5, X, 1, 2
|
|
movss xmm4, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, X, X
|
|
movhps xmm4, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0] // 6, X, 9, 10
|
|
movss xmm3, [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8] // 11, X, X, X
|
|
movhps xmm3, [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4] // 11, X, 7, 8
|
|
|
|
movaps xmm1, xmm0
|
|
movaps xmm5, xmm2
|
|
shufps xmm0, xmm4, R_SHUFFLEPS( 0, 2, 0, 2 ) // 0, 3, 6, 9
|
|
shufps xmm2, xmm3, R_SHUFFLEPS( 3, 0, 3, 0 ) // 2, 5, 8, 11
|
|
shufps xmm1, xmm5, R_SHUFFLEPS( 3, 3, 2, 2 ) // 4, 4, 1, 1
|
|
shufps xmm4, xmm3, R_SHUFFLEPS( 3, 3, 2, 2 ) // 10, 10, 7, 7
|
|
shufps xmm1, xmm4, R_SHUFFLEPS( 2, 0, 2, 0 ) // 1, 4, 7, 10
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, [normal+ 0]
|
|
mulps xmm4, [normal+16]
|
|
mulps xmm5, [normal+32]
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
movaps xmm4, xmm3
|
|
movaps xmm5, xmm3
|
|
mulps xmm3, [normal+ 0]
|
|
mulps xmm4, [normal+16]
|
|
mulps xmm5, [normal+32]
|
|
subps xmm0, xmm3
|
|
subps xmm1, xmm4
|
|
subps xmm2, xmm5
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
movaps xmm5, xmm2
|
|
|
|
mulps xmm3, xmm3
|
|
mulps xmm4, xmm4
|
|
mulps xmm5, xmm5
|
|
addps xmm3, xmm4
|
|
addps xmm3, xmm5
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtps xmm4, xmm3
|
|
mulps xmm3, xmm4
|
|
mulps xmm3, xmm4
|
|
subps xmm3, xmm6
|
|
mulps xmm4, xmm7
|
|
mulps xmm3, xmm4
|
|
#else
|
|
rsqrtps xmm3, xmm3
|
|
#endif
|
|
|
|
mulps xmm0, xmm3
|
|
mulps xmm1, xmm3
|
|
mulps xmm2, xmm3
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*0+DRAWVERT_TANGENT1_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*1+DRAWVERT_TANGENT1_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*2+DRAWVERT_TANGENT1_OFFSET+8], xmm2
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_SIZE*3+DRAWVERT_TANGENT1_OFFSET+8], xmm2
|
|
|
|
add eax, DRAWVERT_SIZE*8
|
|
|
|
jle loopVert4
|
|
|
|
sub eax, DRAWVERT_SIZE*4
|
|
jge done
|
|
|
|
loopVert1:
|
|
|
|
// normalize one idDrawVert::normal
|
|
|
|
movss xmm0, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
|
|
movss xmm1, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
|
|
movss xmm2, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, xmm3
|
|
mulss xmm4, xmm4
|
|
mulss xmm5, xmm5
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtss xmm4, xmm3
|
|
mulss xmm3, xmm4
|
|
mulss xmm3, xmm4
|
|
subss xmm3, xmm6
|
|
mulss xmm4, xmm7
|
|
mulss xmm3, xmm4
|
|
#else
|
|
rsqrtss xmm3, xmm3
|
|
#endif
|
|
|
|
mulss xmm0, xmm3
|
|
mulss xmm1, xmm3
|
|
mulss xmm2, xmm3
|
|
|
|
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_NORMAL_OFFSET+8], xmm2
|
|
|
|
// project and normalize one idDrawVert::tangent[0]
|
|
|
|
movss xmm0, [esi+eax+DRAWVERT_TANGENT0_OFFSET+0]
|
|
movss xmm1, [esi+eax+DRAWVERT_TANGENT0_OFFSET+4]
|
|
movss xmm2, [esi+eax+DRAWVERT_TANGENT0_OFFSET+8]
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
|
|
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
|
|
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
movss xmm4, xmm3
|
|
movss xmm5, xmm3
|
|
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
|
|
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
|
|
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
|
|
subss xmm0, xmm3
|
|
subss xmm1, xmm4
|
|
subss xmm2, xmm5
|
|
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, xmm3
|
|
mulss xmm4, xmm4
|
|
mulss xmm5, xmm5
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtss xmm4, xmm3
|
|
mulss xmm3, xmm4
|
|
mulss xmm3, xmm4
|
|
subss xmm3, xmm6
|
|
mulss xmm4, xmm7
|
|
mulss xmm3, xmm4
|
|
#else
|
|
rsqrtss xmm3, xmm3
|
|
#endif
|
|
|
|
mulss xmm0, xmm3
|
|
mulss xmm1, xmm3
|
|
mulss xmm2, xmm3
|
|
|
|
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_TANGENT0_OFFSET+8], xmm2
|
|
|
|
// project and normalize one idDrawVert::tangent[1]
|
|
|
|
movss xmm0, [esi+eax+DRAWVERT_TANGENT1_OFFSET+0]
|
|
movss xmm1, [esi+eax+DRAWVERT_TANGENT1_OFFSET+4]
|
|
movss xmm2, [esi+eax+DRAWVERT_TANGENT1_OFFSET+8]
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
|
|
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
|
|
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
movss xmm4, xmm3
|
|
movss xmm5, xmm3
|
|
mulss xmm3, [esi+eax+DRAWVERT_NORMAL_OFFSET+0]
|
|
mulss xmm4, [esi+eax+DRAWVERT_NORMAL_OFFSET+4]
|
|
mulss xmm5, [esi+eax+DRAWVERT_NORMAL_OFFSET+8]
|
|
subss xmm0, xmm3
|
|
subss xmm1, xmm4
|
|
subss xmm2, xmm5
|
|
|
|
movss xmm3, xmm0
|
|
movss xmm4, xmm1
|
|
movss xmm5, xmm2
|
|
|
|
mulss xmm3, xmm3
|
|
mulss xmm4, xmm4
|
|
mulss xmm5, xmm5
|
|
addss xmm3, xmm4
|
|
addss xmm3, xmm5
|
|
|
|
#ifdef REFINE_TANGENT_SQUAREROOT
|
|
rsqrtss xmm4, xmm3
|
|
mulss xmm3, xmm4
|
|
mulss xmm3, xmm4
|
|
subss xmm3, xmm6
|
|
mulss xmm4, xmm7
|
|
mulss xmm3, xmm4
|
|
#else
|
|
rsqrtss xmm3, xmm3
|
|
#endif
|
|
|
|
mulss xmm0, xmm3
|
|
mulss xmm1, xmm3
|
|
mulss xmm2, xmm3
|
|
|
|
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+0], xmm0
|
|
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+4], xmm1
|
|
movss [esi+eax+DRAWVERT_TANGENT1_OFFSET+8], xmm2
|
|
|
|
add eax, DRAWVERT_SIZE
|
|
|
|
jl loopVert1
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CreateTextureSpaceLightVectors
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
for ( int i = numIndexes - 1; i >= 0; i-- ) {
|
|
used[indexes[i]] = true;
|
|
}
|
|
|
|
#if 0
|
|
|
|
__asm {
|
|
|
|
mov eax, numVerts
|
|
|
|
mov esi, used
|
|
add esi, eax
|
|
|
|
mov edi, verts
|
|
sub edi, DRAWVERT_SIZE
|
|
|
|
neg eax
|
|
dec eax
|
|
|
|
mov ecx, lightOrigin
|
|
movss xmm7, [ecx+0]
|
|
movhps xmm7, [ecx+4]
|
|
|
|
mov ecx, lightVectors
|
|
sub ecx, 3*4
|
|
|
|
loopVert:
|
|
inc eax
|
|
jge done
|
|
|
|
add edi, DRAWVERT_SIZE
|
|
add ecx, 3*4
|
|
|
|
cmp byte ptr [esi+eax], 0
|
|
je loopVert
|
|
|
|
movaps xmm0, xmm7
|
|
movss xmm1, [edi+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
|
|
subps xmm0, xmm1
|
|
|
|
// 0, X, 1, 2
|
|
// 3, X, 4, 5
|
|
// 6, X, 7, 8
|
|
|
|
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
|
|
movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
|
|
mulps xmm2, xmm0
|
|
|
|
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
|
|
movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
|
|
mulps xmm3, xmm0
|
|
|
|
movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
|
|
unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
|
|
unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
|
|
|
|
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
|
|
movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
|
|
mulps xmm4, xmm0
|
|
|
|
movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
|
|
movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
|
|
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
|
|
|
|
addps xmm5, xmm4
|
|
addps xmm5, xmm2
|
|
movlps [ecx+0], xmm5
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
movss [ecx+8], xmm5
|
|
|
|
jmp loopVert
|
|
|
|
done:
|
|
}
|
|
|
|
#elif 1
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( !used[i] ) {
|
|
continue;
|
|
}
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
idVec3 lightDir;
|
|
|
|
lightDir[0] = lightOrigin[0] - v->xyz[0];
|
|
lightDir[1] = lightOrigin[1] - v->xyz[1];
|
|
lightDir[2] = lightOrigin[2] - v->xyz[2];
|
|
|
|
lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
|
|
lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
|
|
lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
|
|
}
|
|
|
|
#elif 1
|
|
|
|
ALIGN16( int usedVertNums[4] );
|
|
ALIGN16( float lightDir0[4] );
|
|
ALIGN16( float lightDir1[4] );
|
|
ALIGN16( float lightDir2[4] );
|
|
ALIGN16( float normal0[4] );
|
|
ALIGN16( float normal1[4] );
|
|
ALIGN16( float normal2[4] );
|
|
ALIGN16( float tangent0[4] );
|
|
ALIGN16( float tangent1[4] );
|
|
ALIGN16( float tangent2[4] );
|
|
ALIGN16( float tangent3[4] );
|
|
ALIGN16( float tangent4[4] );
|
|
ALIGN16( float tangent5[4] );
|
|
idVec3 localLightOrigin = lightOrigin;
|
|
|
|
__asm {
|
|
|
|
xor ecx, ecx
|
|
mov eax, numVerts
|
|
|
|
mov esi, used
|
|
add esi, eax
|
|
|
|
mov edi, verts
|
|
sub edi, DRAWVERT_SIZE
|
|
|
|
neg eax
|
|
dec eax
|
|
|
|
loopVert4:
|
|
inc eax
|
|
jge done4
|
|
|
|
add edi, DRAWVERT_SIZE
|
|
|
|
cmp byte ptr [esi+eax], 0
|
|
je loopVert4
|
|
|
|
mov usedVertNums[ecx*4], eax
|
|
|
|
inc ecx
|
|
cmp ecx, 4
|
|
|
|
movss xmm0, localLightOrigin[0]
|
|
movss xmm1, localLightOrigin[4]
|
|
movss xmm2, localLightOrigin[8]
|
|
|
|
subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
|
|
subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
|
|
subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
|
|
|
|
movss lightDir0[ecx*4-4], xmm0
|
|
movss lightDir1[ecx*4-4], xmm1
|
|
movss lightDir2[ecx*4-4], xmm2
|
|
|
|
movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
|
|
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
|
|
movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
|
|
|
|
movss normal0[ecx*4-4], xmm3
|
|
movss normal1[ecx*4-4], xmm4
|
|
movss normal2[ecx*4-4], xmm5
|
|
|
|
movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
|
|
movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
|
|
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
|
|
|
|
movss tangent0[ecx*4-4], xmm0
|
|
movss tangent1[ecx*4-4], xmm1
|
|
movss tangent2[ecx*4-4], xmm2
|
|
|
|
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
|
|
movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
|
|
movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
|
|
|
|
movss tangent3[ecx*4-4], xmm3
|
|
movss tangent4[ecx*4-4], xmm4
|
|
movss tangent5[ecx*4-4], xmm5
|
|
|
|
jl loopVert4
|
|
|
|
movaps xmm0, lightDir0
|
|
movaps xmm1, lightDir1
|
|
movaps xmm2, lightDir2
|
|
|
|
movaps xmm3, tangent0
|
|
mulps xmm3, xmm0
|
|
movaps xmm4, tangent1
|
|
mulps xmm4, xmm1
|
|
movaps xmm5, tangent2
|
|
mulps xmm5, xmm2
|
|
|
|
addps xmm3, xmm4
|
|
addps xmm5, xmm3
|
|
|
|
movaps xmm3, tangent3
|
|
mulps xmm3, xmm0
|
|
movaps xmm4, tangent4
|
|
mulps xmm4, xmm1
|
|
movaps xmm6, tangent5
|
|
mulps xmm6, xmm2
|
|
|
|
addps xmm3, xmm4
|
|
addps xmm6, xmm3
|
|
|
|
mulps xmm0, normal0
|
|
mulps xmm1, normal1
|
|
mulps xmm2, normal2
|
|
|
|
addps xmm0, xmm1
|
|
addps xmm0, xmm2
|
|
|
|
mov ecx, numVerts
|
|
imul ecx, 12
|
|
mov edx, usedVertNums[0]
|
|
add ecx, lightVectors
|
|
imul edx, 12
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
mov edx, usedVertNums[4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
imul edx, 12
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
mov edx, usedVertNums[8]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
imul edx, 12
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
mov edx, usedVertNums[12]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
imul edx, 12
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
|
|
xor ecx, ecx
|
|
jmp loopVert4
|
|
|
|
done4:
|
|
test ecx, ecx
|
|
jz done
|
|
xor eax, eax
|
|
mov edi, numVerts
|
|
imul edi, 12
|
|
add edi, lightVectors
|
|
|
|
loopVert1:
|
|
movss xmm0, lightDir0[eax*4]
|
|
movss xmm1, lightDir1[eax*4]
|
|
movss xmm2, lightDir2[eax*4]
|
|
|
|
mov edx, usedVertNums[eax*4]
|
|
imul edx, 12
|
|
|
|
movss xmm3, tangent0[eax*4]
|
|
mulss xmm3, xmm0
|
|
movss xmm4, tangent1[eax*4]
|
|
mulss xmm4, xmm1
|
|
movss xmm5, tangent2[eax*4]
|
|
mulss xmm5, xmm2
|
|
|
|
addss xmm3, xmm4
|
|
addss xmm5, xmm3
|
|
movss [edi+edx+0], xmm5
|
|
|
|
movss xmm3, tangent3[eax*4]
|
|
mulss xmm3, xmm0
|
|
movss xmm4, tangent4[eax*4]
|
|
mulss xmm4, xmm1
|
|
movss xmm6, tangent5[eax*4]
|
|
mulss xmm6, xmm2
|
|
|
|
addss xmm3, xmm4
|
|
addss xmm6, xmm3
|
|
movss [edi+edx+4], xmm6
|
|
|
|
mulss xmm0, normal0[eax*4]
|
|
mulss xmm1, normal1[eax*4]
|
|
mulss xmm2, normal2[eax*4]
|
|
|
|
addss xmm0, xmm1
|
|
addss xmm0, xmm2
|
|
movss [edi+edx+8], xmm0
|
|
|
|
inc eax
|
|
dec ecx
|
|
jg loopVert1
|
|
|
|
done:
|
|
}
|
|
|
|
#else
|
|
|
|
ALIGN16( float lightVectors0[4] );
|
|
ALIGN16( float lightVectors1[4] );
|
|
ALIGN16( float lightVectors2[4] );
|
|
int numUsedVerts = 0;
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( !used[i] ) {
|
|
continue;
|
|
}
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
|
|
lightDir0[numUsedVerts] = lightOrigin[0] - v->xyz[0];
|
|
lightDir1[numUsedVerts] = lightOrigin[1] - v->xyz[1];
|
|
lightDir2[numUsedVerts] = lightOrigin[2] - v->xyz[2];
|
|
|
|
normal0[numUsedVerts] = v->normal[0];
|
|
normal1[numUsedVerts] = v->normal[1];
|
|
normal2[numUsedVerts] = v->normal[2];
|
|
|
|
tangent0[numUsedVerts] = v->tangents[0][0];
|
|
tangent1[numUsedVerts] = v->tangents[0][1];
|
|
tangent2[numUsedVerts] = v->tangents[0][2];
|
|
|
|
tangent3[numUsedVerts] = v->tangents[1][0];
|
|
tangent4[numUsedVerts] = v->tangents[1][1];
|
|
tangent5[numUsedVerts] = v->tangents[1][2];
|
|
|
|
usedVertNums[numUsedVerts++] = i;
|
|
if ( numUsedVerts < 4 ) {
|
|
continue;
|
|
}
|
|
|
|
lightVectors0[0] = lightDir0[0] * tangent0[0];
|
|
lightVectors0[1] = lightDir0[1] * tangent0[1];
|
|
lightVectors0[2] = lightDir0[2] * tangent0[2];
|
|
lightVectors0[3] = lightDir0[3] * tangent0[3];
|
|
|
|
lightVectors0[0] += lightDir1[0] * tangent1[0];
|
|
lightVectors0[1] += lightDir1[1] * tangent1[1];
|
|
lightVectors0[2] += lightDir1[2] * tangent1[2];
|
|
lightVectors0[3] += lightDir1[3] * tangent1[3];
|
|
|
|
lightVectors0[0] += lightDir2[0] * tangent2[0];
|
|
lightVectors0[1] += lightDir2[1] * tangent2[1];
|
|
lightVectors0[2] += lightDir2[2] * tangent2[2];
|
|
lightVectors0[3] += lightDir2[3] * tangent2[3];
|
|
|
|
lightVectors1[0] = lightDir0[0] * tangent3[0];
|
|
lightVectors1[1] = lightDir0[1] * tangent3[1];
|
|
lightVectors1[2] = lightDir0[2] * tangent3[2];
|
|
lightVectors1[3] = lightDir0[3] * tangent3[3];
|
|
|
|
lightVectors1[0] += lightDir1[0] * tangent4[0];
|
|
lightVectors1[1] += lightDir1[1] * tangent4[1];
|
|
lightVectors1[2] += lightDir1[2] * tangent4[2];
|
|
lightVectors1[3] += lightDir1[3] * tangent4[3];
|
|
|
|
lightVectors1[0] += lightDir2[0] * tangent5[0];
|
|
lightVectors1[1] += lightDir2[1] * tangent5[1];
|
|
lightVectors1[2] += lightDir2[2] * tangent5[2];
|
|
lightVectors1[3] += lightDir2[3] * tangent5[3];
|
|
|
|
lightVectors2[0] = lightDir0[0] * normal0[0];
|
|
lightVectors2[1] = lightDir0[1] * normal0[1];
|
|
lightVectors2[2] = lightDir0[2] * normal0[2];
|
|
lightVectors2[3] = lightDir0[3] * normal0[3];
|
|
|
|
lightVectors2[0] += lightDir1[0] * normal1[0];
|
|
lightVectors2[1] += lightDir1[1] * normal1[1];
|
|
lightVectors2[2] += lightDir1[2] * normal1[2];
|
|
lightVectors2[3] += lightDir1[3] * normal1[3];
|
|
|
|
lightVectors2[0] += lightDir2[0] * normal2[0];
|
|
lightVectors2[1] += lightDir2[1] * normal2[1];
|
|
lightVectors2[2] += lightDir2[2] * normal2[2];
|
|
lightVectors2[3] += lightDir2[3] * normal2[3];
|
|
|
|
|
|
for ( int j = 0; j < 4; j++ ) {
|
|
int n = usedVertNums[j];
|
|
|
|
lightVectors[n][0] = lightVectors0[j];
|
|
lightVectors[n][1] = lightVectors1[j];
|
|
lightVectors[n][2] = lightVectors2[j];
|
|
}
|
|
|
|
numUsedVerts = 0;
|
|
}
|
|
|
|
for ( int i = 0; i < numUsedVerts; i++ ) {
|
|
|
|
lightVectors0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
|
|
lightVectors1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
|
|
lightVectors2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
|
|
|
|
int n = usedVertNums[i];
|
|
lightVectors[n][0] = lightVectors0[i];
|
|
lightVectors[n][1] = lightVectors1[i];
|
|
lightVectors[n][2] = lightVectors2[i];
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CreateSpecularTextureCoords
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
|
|
assert( sizeof( idDrawVert ) == DRAWVERT_SIZE );
|
|
assert( (int)&((idDrawVert *)0)->xyz == DRAWVERT_XYZ_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->normal == DRAWVERT_NORMAL_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[0] == DRAWVERT_TANGENT0_OFFSET );
|
|
assert( (int)&((idDrawVert *)0)->tangents[1] == DRAWVERT_TANGENT1_OFFSET );
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
for ( int i = numIndexes - 1; i >= 0; i-- ) {
|
|
used[indexes[i]] = true;
|
|
}
|
|
|
|
#if 0
|
|
|
|
__asm {
|
|
|
|
mov eax, numVerts
|
|
|
|
mov esi, used
|
|
add esi, eax
|
|
|
|
mov edi, verts
|
|
sub edi, DRAWVERT_SIZE
|
|
|
|
neg eax
|
|
dec eax
|
|
|
|
mov ecx, viewOrigin
|
|
movss xmm6, [ecx+0]
|
|
movhps xmm6, [ecx+4]
|
|
|
|
mov ecx, lightOrigin
|
|
movss xmm7, [ecx+0]
|
|
movhps xmm7, [ecx+4]
|
|
|
|
mov ecx, texCoords
|
|
sub ecx, 4*4
|
|
|
|
loopVert:
|
|
inc eax
|
|
jge done
|
|
|
|
add edi, DRAWVERT_SIZE
|
|
add ecx, 4*4
|
|
|
|
cmp byte ptr [esi+eax], 0
|
|
je loopVert
|
|
|
|
movaps xmm0, xmm7
|
|
movaps xmm1, xmm6
|
|
movss xmm2, [edi+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm2, [edi+DRAWVERT_XYZ_OFFSET+4]
|
|
subps xmm0, xmm2
|
|
subps xmm1, xmm2
|
|
|
|
movaps xmm3, xmm0
|
|
movaps xmm4, xmm1
|
|
mulps xmm3, xmm3
|
|
mulps xmm4, xmm4
|
|
|
|
// 0, X, 1, 2
|
|
// 3, X, 4, 5
|
|
|
|
movaps xmm5, xmm3 // xmm5 = 0, X, 1, 2
|
|
unpcklps xmm5, xmm4 // xmm5 = 0, 3, X, X
|
|
unpckhps xmm3, xmm4 // xmm3 = 1, 4, 2, 5
|
|
movhlps xmm4, xmm3 // xmm4 = 2, 5, 4, 5
|
|
|
|
addps xmm5, xmm3
|
|
addps xmm5, xmm4
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
rsqrtps xmm5, xmm5
|
|
|
|
movaps xmm4, xmm5
|
|
shufps xmm4, xmm4, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
|
|
mulps xmm0, xmm4
|
|
mulps xmm1, xmm5
|
|
addps xmm0, xmm1
|
|
|
|
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+0]
|
|
movhps xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+4]
|
|
mulps xmm2, xmm0
|
|
|
|
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
|
|
movhps xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+4]
|
|
mulps xmm3, xmm0
|
|
|
|
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+0]
|
|
movhps xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
|
|
mulps xmm4, xmm0
|
|
|
|
movaps xmm5, xmm2 // xmm5 = 0, X, 1, 2
|
|
unpcklps xmm5, xmm3 // xmm5 = 0, 3, X, X
|
|
unpckhps xmm2, xmm3 // xmm2 = 1, 4, 2, 5
|
|
|
|
movlhps xmm5, xmm4 // xmm5 = 0, 3, 6, X
|
|
movhlps xmm4, xmm2 // xmm4 = 2, 5, 7, 8
|
|
shufps xmm2, xmm4, R_SHUFFLEPS( 0, 1, 3, 2 ) // xmm2 = 2, 5, 8, 7
|
|
|
|
movaps xmm3, SIMD_SP_one
|
|
|
|
addps xmm5, xmm4
|
|
addps xmm5, xmm2
|
|
movaps [ecx+0], xmm5
|
|
movss [ecx+12], xmm3
|
|
|
|
jmp loopVert
|
|
|
|
done:
|
|
}
|
|
|
|
#elif 0
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( !used[i] ) {
|
|
continue;
|
|
}
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
|
|
idVec3 lightDir = lightOrigin - v->xyz;
|
|
idVec3 viewDir = viewOrigin - v->xyz;
|
|
|
|
float ilength;
|
|
|
|
ilength = idMath::RSqrt( lightDir[0] * lightDir[0] + lightDir[1] * lightDir[1] + lightDir[2] * lightDir[2] );
|
|
lightDir[0] *= ilength;
|
|
lightDir[1] *= ilength;
|
|
lightDir[2] *= ilength;
|
|
|
|
ilength = idMath::RSqrt( viewDir[0] * viewDir[0] + viewDir[1] * viewDir[1] + viewDir[2] * viewDir[2] );
|
|
viewDir[0] *= ilength;
|
|
viewDir[1] *= ilength;
|
|
viewDir[2] *= ilength;
|
|
|
|
lightDir += viewDir;
|
|
|
|
texCoords[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
|
|
texCoords[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
|
|
texCoords[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
|
|
texCoords[i][3] = 1.0f;
|
|
}
|
|
|
|
|
|
#elif 1
|
|
|
|
ALIGN16( int usedVertNums[4] );
|
|
ALIGN16( float lightDir0[4] );
|
|
ALIGN16( float lightDir1[4] );
|
|
ALIGN16( float lightDir2[4] );
|
|
ALIGN16( float viewDir0[4] );
|
|
ALIGN16( float viewDir1[4] );
|
|
ALIGN16( float viewDir2[4] );
|
|
ALIGN16( float normal0[4] );
|
|
ALIGN16( float normal1[4] );
|
|
ALIGN16( float normal2[4] );
|
|
ALIGN16( float tangent0[4] );
|
|
ALIGN16( float tangent1[4] );
|
|
ALIGN16( float tangent2[4] );
|
|
ALIGN16( float tangent3[4] );
|
|
ALIGN16( float tangent4[4] );
|
|
ALIGN16( float tangent5[4] );
|
|
idVec3 localLightOrigin = lightOrigin;
|
|
idVec3 localViewOrigin = viewOrigin;
|
|
|
|
__asm {
|
|
|
|
xor ecx, ecx
|
|
mov eax, numVerts
|
|
|
|
mov esi, used
|
|
add esi, eax
|
|
|
|
mov edi, verts
|
|
sub edi, DRAWVERT_SIZE
|
|
|
|
neg eax
|
|
dec eax
|
|
|
|
loopVert4:
|
|
inc eax
|
|
jge done4
|
|
|
|
add edi, DRAWVERT_SIZE
|
|
|
|
cmp byte ptr [esi+eax], 0
|
|
je loopVert4
|
|
|
|
mov usedVertNums[ecx*4], eax
|
|
|
|
inc ecx
|
|
cmp ecx, 4
|
|
|
|
movss xmm3, localLightOrigin[0]
|
|
movss xmm4, localLightOrigin[4]
|
|
movss xmm5, localLightOrigin[8]
|
|
|
|
subss xmm3, [edi+DRAWVERT_XYZ_OFFSET+0]
|
|
subss xmm4, [edi+DRAWVERT_XYZ_OFFSET+4]
|
|
subss xmm5, [edi+DRAWVERT_XYZ_OFFSET+8]
|
|
|
|
movss lightDir0[ecx*4-4], xmm3
|
|
movss lightDir1[ecx*4-4], xmm4
|
|
movss lightDir2[ecx*4-4], xmm5
|
|
|
|
movss xmm0, localViewOrigin[0]
|
|
movss xmm1, localViewOrigin[4]
|
|
movss xmm2, localViewOrigin[8]
|
|
|
|
subss xmm0, [edi+DRAWVERT_XYZ_OFFSET+0]
|
|
subss xmm1, [edi+DRAWVERT_XYZ_OFFSET+4]
|
|
subss xmm2, [edi+DRAWVERT_XYZ_OFFSET+8]
|
|
|
|
movss viewDir0[ecx*4-4], xmm0
|
|
movss viewDir1[ecx*4-4], xmm1
|
|
movss viewDir2[ecx*4-4], xmm2
|
|
|
|
movss xmm3, [edi+DRAWVERT_NORMAL_OFFSET+0]
|
|
movss xmm4, [edi+DRAWVERT_NORMAL_OFFSET+4]
|
|
movss xmm5, [edi+DRAWVERT_NORMAL_OFFSET+8]
|
|
|
|
movss normal0[ecx*4-4], xmm3
|
|
movss normal1[ecx*4-4], xmm4
|
|
movss normal2[ecx*4-4], xmm5
|
|
|
|
movss xmm0, [edi+DRAWVERT_TANGENT0_OFFSET+0]
|
|
movss xmm1, [edi+DRAWVERT_TANGENT0_OFFSET+4]
|
|
movss xmm2, [edi+DRAWVERT_TANGENT0_OFFSET+8]
|
|
|
|
movss tangent0[ecx*4-4], xmm0
|
|
movss tangent1[ecx*4-4], xmm1
|
|
movss tangent2[ecx*4-4], xmm2
|
|
|
|
movss xmm3, [edi+DRAWVERT_TANGENT1_OFFSET+0]
|
|
movss xmm4, [edi+DRAWVERT_TANGENT1_OFFSET+4]
|
|
movss xmm5, [edi+DRAWVERT_TANGENT1_OFFSET+8]
|
|
|
|
movss tangent3[ecx*4-4], xmm3
|
|
movss tangent4[ecx*4-4], xmm4
|
|
movss tangent5[ecx*4-4], xmm5
|
|
|
|
jl loopVert4
|
|
|
|
movaps xmm6, lightDir0
|
|
movaps xmm0, xmm6
|
|
mulps xmm6, xmm6
|
|
movaps xmm7, lightDir1
|
|
movaps xmm1, xmm7
|
|
mulps xmm7, xmm7
|
|
addps xmm6, xmm7
|
|
movaps xmm5, lightDir2
|
|
movaps xmm2, xmm5
|
|
mulps xmm5, xmm5
|
|
addps xmm6, xmm5
|
|
rsqrtps xmm6, xmm6
|
|
|
|
mulps xmm0, xmm6
|
|
mulps xmm1, xmm6
|
|
mulps xmm2, xmm6
|
|
|
|
movaps xmm3, viewDir0
|
|
movaps xmm7, xmm3
|
|
mulps xmm7, xmm7
|
|
movaps xmm4, viewDir1
|
|
movaps xmm6, xmm4
|
|
mulps xmm6, xmm6
|
|
addps xmm7, xmm6
|
|
movaps xmm5, viewDir2
|
|
movaps xmm6, xmm5
|
|
mulps xmm6, xmm6
|
|
addps xmm7, xmm6
|
|
rsqrtps xmm7, xmm7
|
|
|
|
mulps xmm3, xmm7
|
|
addps xmm0, xmm3
|
|
mulps xmm4, xmm7
|
|
addps xmm1, xmm4
|
|
mulps xmm5, xmm7
|
|
addps xmm2, xmm5
|
|
|
|
movaps xmm3, tangent0
|
|
mulps xmm3, xmm0
|
|
movaps xmm4, tangent1
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movaps xmm5, tangent2
|
|
mulps xmm5, xmm2
|
|
addps xmm5, xmm3
|
|
|
|
movaps xmm3, tangent3
|
|
mulps xmm3, xmm0
|
|
movaps xmm4, tangent4
|
|
mulps xmm4, xmm1
|
|
addps xmm3, xmm4
|
|
movaps xmm6, tangent5
|
|
mulps xmm6, xmm2
|
|
addps xmm6, xmm3
|
|
|
|
mulps xmm0, normal0
|
|
mulps xmm1, normal1
|
|
addps xmm0, xmm1
|
|
mulps xmm2, normal2
|
|
addps xmm0, xmm2
|
|
|
|
mov ecx, numVerts
|
|
shl ecx, 4
|
|
mov edx, usedVertNums[0]
|
|
add ecx, texCoords
|
|
shl edx, 4
|
|
movss xmm3, SIMD_SP_one
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
movss [ecx+edx+12], xmm3
|
|
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
mov edx, usedVertNums[4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shl edx, 4
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
movss [ecx+edx+12], xmm3
|
|
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
mov edx, usedVertNums[8]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shl edx, 4
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
movss [ecx+edx+12], xmm3
|
|
|
|
shufps xmm5, xmm5, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
mov edx, usedVertNums[12]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
shl edx, 4
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 1, 2, 3, 0 )
|
|
|
|
movss [ecx+edx+0], xmm5
|
|
movss [ecx+edx+4], xmm6
|
|
movss [ecx+edx+8], xmm0
|
|
movss [ecx+edx+12], xmm3
|
|
|
|
xor ecx, ecx
|
|
jmp loopVert4
|
|
|
|
done4:
|
|
test ecx, ecx
|
|
jz done
|
|
xor eax, eax
|
|
mov edi, numVerts
|
|
shl edi, 4
|
|
add edi, texCoords
|
|
|
|
loopVert1:
|
|
movss xmm6, lightDir0[eax*4]
|
|
movss xmm0, xmm6
|
|
mulss xmm6, xmm6
|
|
movss xmm7, lightDir1[eax*4]
|
|
movss xmm1, xmm7
|
|
mulss xmm7, xmm7
|
|
addss xmm6, xmm7
|
|
movss xmm5, lightDir2[eax*4]
|
|
movss xmm2, xmm5
|
|
mulss xmm5, xmm5
|
|
addss xmm6, xmm5
|
|
rsqrtss xmm6, xmm6
|
|
|
|
mulss xmm0, xmm6
|
|
mulss xmm1, xmm6
|
|
mulss xmm2, xmm6
|
|
|
|
movss xmm3, viewDir0[eax*4]
|
|
movss xmm7, xmm3
|
|
mulss xmm7, xmm7
|
|
movss xmm4, viewDir1[eax*4]
|
|
movss xmm6, xmm4
|
|
mulss xmm6, xmm6
|
|
addss xmm7, xmm6
|
|
movss xmm5, viewDir2[eax*4]
|
|
movss xmm6, xmm5
|
|
mulss xmm6, xmm6
|
|
addss xmm7, xmm6
|
|
rsqrtss xmm7, xmm7
|
|
|
|
mulss xmm3, xmm7
|
|
addss xmm0, xmm3
|
|
mulss xmm4, xmm7
|
|
addss xmm1, xmm4
|
|
mulss xmm5, xmm7
|
|
addss xmm2, xmm5
|
|
|
|
mov edx, usedVertNums[eax*4]
|
|
shl edx, 4
|
|
|
|
movss xmm3, tangent0[eax*4]
|
|
mulss xmm3, xmm0
|
|
movss xmm4, tangent1[eax*4]
|
|
mulss xmm4, xmm1
|
|
addss xmm3, xmm4
|
|
movss xmm5, tangent2[eax*4]
|
|
mulss xmm5, xmm2
|
|
addss xmm5, xmm3
|
|
movss [edi+edx+0], xmm5
|
|
|
|
movss xmm3, tangent3[eax*4]
|
|
mulss xmm3, xmm0
|
|
movss xmm4, tangent4[eax*4]
|
|
mulss xmm4, xmm1
|
|
addss xmm3, xmm4
|
|
movss xmm6, tangent5[eax*4]
|
|
mulss xmm6, xmm2
|
|
addss xmm6, xmm3
|
|
movss [edi+edx+4], xmm6
|
|
|
|
mulss xmm0, normal0[eax*4]
|
|
mulss xmm1, normal1[eax*4]
|
|
addss xmm0, xmm1
|
|
mulss xmm2, normal2[eax*4]
|
|
addss xmm0, xmm2
|
|
movss [edi+edx+8], xmm0
|
|
|
|
movss xmm3, SIMD_SP_one
|
|
movss [edi+edx+12], xmm3
|
|
|
|
inc eax
|
|
dec ecx
|
|
jg loopVert1
|
|
|
|
done:
|
|
}
|
|
|
|
#else
|
|
|
|
ALIGN16( int usedVertNums[4] );
|
|
ALIGN16( float lightDir0[4] );
|
|
ALIGN16( float lightDir1[4] );
|
|
ALIGN16( float lightDir2[4] );
|
|
ALIGN16( float viewDir0[4] );
|
|
ALIGN16( float viewDir1[4] );
|
|
ALIGN16( float viewDir2[4] );
|
|
ALIGN16( float normal0[4] );
|
|
ALIGN16( float normal1[4] );
|
|
ALIGN16( float normal2[4] );
|
|
ALIGN16( float tangent0[4] );
|
|
ALIGN16( float tangent1[4] );
|
|
ALIGN16( float tangent2[4] );
|
|
ALIGN16( float tangent3[4] );
|
|
ALIGN16( float tangent4[4] );
|
|
ALIGN16( float tangent5[4] );
|
|
ALIGN16( float texCoords0[4] );
|
|
ALIGN16( float texCoords1[4] );
|
|
ALIGN16( float texCoords2[4] );
|
|
idVec3 localLightOrigin = lightOrigin;
|
|
idVec3 localViewOrigin = viewOrigin;
|
|
int numUsedVerts = 0;
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( !used[i] ) {
|
|
continue;
|
|
}
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
|
|
lightDir0[numUsedVerts] = localLightOrigin[0] - v->xyz[0];
|
|
lightDir1[numUsedVerts] = localLightOrigin[1] - v->xyz[1];
|
|
lightDir2[numUsedVerts] = localLightOrigin[2] - v->xyz[2];
|
|
|
|
viewDir0[numUsedVerts] = localViewOrigin[0] - v->xyz[0];
|
|
viewDir1[numUsedVerts] = localViewOrigin[1] - v->xyz[1];
|
|
viewDir2[numUsedVerts] = localViewOrigin[2] - v->xyz[2];
|
|
|
|
normal0[numUsedVerts] = v->normal[0];
|
|
normal1[numUsedVerts] = v->normal[1];
|
|
normal2[numUsedVerts] = v->normal[2];
|
|
|
|
tangent0[numUsedVerts] = v->tangents[0][0];
|
|
tangent1[numUsedVerts] = v->tangents[0][1];
|
|
tangent2[numUsedVerts] = v->tangents[0][2];
|
|
|
|
tangent3[numUsedVerts] = v->tangents[1][0];
|
|
tangent4[numUsedVerts] = v->tangents[1][1];
|
|
tangent5[numUsedVerts] = v->tangents[1][2];
|
|
|
|
usedVertNums[numUsedVerts++] = i;
|
|
if ( numUsedVerts < 4 ) {
|
|
continue;
|
|
}
|
|
|
|
ALIGN16( float temp[4] );
|
|
|
|
temp[0] = lightDir0[0] * lightDir0[0];
|
|
temp[1] = lightDir0[1] * lightDir0[1];
|
|
temp[2] = lightDir0[2] * lightDir0[2];
|
|
temp[3] = lightDir0[3] * lightDir0[3];
|
|
|
|
temp[0] += lightDir1[0] * lightDir1[0];
|
|
temp[1] += lightDir1[1] * lightDir1[1];
|
|
temp[2] += lightDir1[2] * lightDir1[2];
|
|
temp[3] += lightDir1[3] * lightDir1[3];
|
|
|
|
temp[0] += lightDir2[0] * lightDir2[0];
|
|
temp[1] += lightDir2[1] * lightDir2[1];
|
|
temp[2] += lightDir2[2] * lightDir2[2];
|
|
temp[3] += lightDir2[3] * lightDir2[3];
|
|
|
|
temp[0] = idMath::RSqrt( temp[0] );
|
|
temp[1] = idMath::RSqrt( temp[1] );
|
|
temp[2] = idMath::RSqrt( temp[2] );
|
|
temp[3] = idMath::RSqrt( temp[3] );
|
|
|
|
lightDir0[0] *= temp[0];
|
|
lightDir0[1] *= temp[1];
|
|
lightDir0[2] *= temp[2];
|
|
lightDir0[3] *= temp[3];
|
|
|
|
lightDir1[0] *= temp[0];
|
|
lightDir1[1] *= temp[1];
|
|
lightDir1[2] *= temp[2];
|
|
lightDir1[3] *= temp[3];
|
|
|
|
lightDir2[0] *= temp[0];
|
|
lightDir2[1] *= temp[1];
|
|
lightDir2[2] *= temp[2];
|
|
lightDir2[3] *= temp[3];
|
|
|
|
temp[0] = viewDir0[0] * viewDir0[0];
|
|
temp[1] = viewDir0[1] * viewDir0[1];
|
|
temp[2] = viewDir0[2] * viewDir0[2];
|
|
temp[3] = viewDir0[3] * viewDir0[3];
|
|
|
|
temp[0] += viewDir1[0] * viewDir1[0];
|
|
temp[1] += viewDir1[1] * viewDir1[1];
|
|
temp[2] += viewDir1[2] * viewDir1[2];
|
|
temp[3] += viewDir1[3] * viewDir1[3];
|
|
|
|
temp[0] += viewDir2[0] * viewDir2[0];
|
|
temp[1] += viewDir2[1] * viewDir2[1];
|
|
temp[2] += viewDir2[2] * viewDir2[2];
|
|
temp[3] += viewDir2[3] * viewDir2[3];
|
|
|
|
temp[0] = idMath::RSqrt( temp[0] );
|
|
temp[1] = idMath::RSqrt( temp[1] );
|
|
temp[2] = idMath::RSqrt( temp[2] );
|
|
temp[3] = idMath::RSqrt( temp[3] );
|
|
|
|
viewDir0[0] *= temp[0];
|
|
viewDir0[1] *= temp[1];
|
|
viewDir0[2] *= temp[2];
|
|
viewDir0[3] *= temp[3];
|
|
|
|
viewDir1[0] *= temp[0];
|
|
viewDir1[1] *= temp[1];
|
|
viewDir1[2] *= temp[2];
|
|
viewDir1[3] *= temp[3];
|
|
|
|
viewDir2[0] *= temp[0];
|
|
viewDir2[1] *= temp[1];
|
|
viewDir2[2] *= temp[2];
|
|
viewDir2[3] *= temp[3];
|
|
|
|
lightDir0[0] += viewDir0[0];
|
|
lightDir0[1] += viewDir0[1];
|
|
lightDir0[2] += viewDir0[2];
|
|
lightDir0[3] += viewDir0[3];
|
|
|
|
lightDir1[0] += viewDir1[0];
|
|
lightDir1[1] += viewDir1[1];
|
|
lightDir1[2] += viewDir1[2];
|
|
lightDir1[3] += viewDir1[3];
|
|
|
|
lightDir2[0] += viewDir2[0];
|
|
lightDir2[1] += viewDir2[1];
|
|
lightDir2[2] += viewDir2[2];
|
|
lightDir2[3] += viewDir2[3];
|
|
|
|
texCoords0[0] = lightDir0[0] * tangent0[0];
|
|
texCoords0[1] = lightDir0[1] * tangent0[1];
|
|
texCoords0[2] = lightDir0[2] * tangent0[2];
|
|
texCoords0[3] = lightDir0[3] * tangent0[3];
|
|
|
|
texCoords0[0] += lightDir1[0] * tangent1[0];
|
|
texCoords0[1] += lightDir1[1] * tangent1[1];
|
|
texCoords0[2] += lightDir1[2] * tangent1[2];
|
|
texCoords0[3] += lightDir1[3] * tangent1[3];
|
|
|
|
texCoords0[0] += lightDir2[0] * tangent2[0];
|
|
texCoords0[1] += lightDir2[1] * tangent2[1];
|
|
texCoords0[2] += lightDir2[2] * tangent2[2];
|
|
texCoords0[3] += lightDir2[3] * tangent2[3];
|
|
|
|
texCoords1[0] = lightDir0[0] * tangent3[0];
|
|
texCoords1[1] = lightDir0[1] * tangent3[1];
|
|
texCoords1[2] = lightDir0[2] * tangent3[2];
|
|
texCoords1[3] = lightDir0[3] * tangent3[3];
|
|
|
|
texCoords1[0] += lightDir1[0] * tangent4[0];
|
|
texCoords1[1] += lightDir1[1] * tangent4[1];
|
|
texCoords1[2] += lightDir1[2] * tangent4[2];
|
|
texCoords1[3] += lightDir1[3] * tangent4[3];
|
|
|
|
texCoords1[0] += lightDir2[0] * tangent5[0];
|
|
texCoords1[1] += lightDir2[1] * tangent5[1];
|
|
texCoords1[2] += lightDir2[2] * tangent5[2];
|
|
texCoords1[3] += lightDir2[3] * tangent5[3];
|
|
|
|
texCoords2[0] = lightDir0[0] * normal0[0];
|
|
texCoords2[1] = lightDir0[1] * normal0[1];
|
|
texCoords2[2] = lightDir0[2] * normal0[2];
|
|
texCoords2[3] = lightDir0[3] * normal0[3];
|
|
|
|
texCoords2[0] += lightDir1[0] * normal1[0];
|
|
texCoords2[1] += lightDir1[1] * normal1[1];
|
|
texCoords2[2] += lightDir1[2] * normal1[2];
|
|
texCoords2[3] += lightDir1[3] * normal1[3];
|
|
|
|
texCoords2[0] += lightDir2[0] * normal2[0];
|
|
texCoords2[1] += lightDir2[1] * normal2[1];
|
|
texCoords2[2] += lightDir2[2] * normal2[2];
|
|
texCoords2[3] += lightDir2[3] * normal2[3];
|
|
|
|
for ( int j = 0; j < 4; j++ ) {
|
|
int n = usedVertNums[j];
|
|
|
|
texCoords[n][0] = texCoords0[j];
|
|
texCoords[n][1] = texCoords1[j];
|
|
texCoords[n][2] = texCoords2[j];
|
|
texCoords[n][3] = 1.0f;
|
|
}
|
|
|
|
numUsedVerts = 0;
|
|
}
|
|
|
|
for ( int i = 0; i < numUsedVerts; i++ ) {
|
|
float temp;
|
|
|
|
temp = lightDir0[i] * lightDir0[i] + lightDir1[i] * lightDir1[i] + lightDir2[i] * lightDir2[i];
|
|
temp = idMath::RSqrt( temp );
|
|
|
|
lightDir0[i] *= temp;
|
|
lightDir1[i] *= temp;
|
|
lightDir2[i] *= temp;
|
|
|
|
temp = viewDir0[i] * viewDir0[i] + viewDir1[i] * viewDir1[i] + viewDir2[i] * viewDir2[i];
|
|
temp = idMath::RSqrt( temp );
|
|
|
|
viewDir0[i] *= temp;
|
|
viewDir1[i] *= temp;
|
|
viewDir2[i] *= temp;
|
|
|
|
lightDir0[i] += viewDir0[i];
|
|
lightDir1[i] += viewDir1[i];
|
|
lightDir2[i] += viewDir2[i];
|
|
|
|
texCoords0[i] = lightDir0[i] * tangent0[i] + lightDir1[i] * tangent1[i] + lightDir2[i] * tangent2[i];
|
|
texCoords1[i] = lightDir0[i] * tangent3[i] + lightDir1[i] * tangent4[i] + lightDir2[i] * tangent5[i];
|
|
texCoords2[i] = lightDir0[i] * normal0[i] + lightDir1[i] * normal1[i] + lightDir2[i] * normal2[i];
|
|
|
|
int n = usedVertNums[i];
|
|
texCoords[n][0] = texCoords0;
|
|
texCoords[n][1] = texCoords1;
|
|
texCoords[n][2] = texCoords2;
|
|
texCoords[n][3] = 1.0f;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CreateShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_SSE::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
|
|
#if 1
|
|
int outVerts;
|
|
|
|
__asm {
|
|
push ebx
|
|
|
|
mov esi, lightOrigin
|
|
movaps xmm5, SIMD_SP_lastOne
|
|
movss xmm6, [esi+0]
|
|
movhps xmm6, [esi+4]
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 2, 3, 1 )
|
|
orps xmm6, SIMD_SP_lastOne
|
|
movaps xmm7, xmm6
|
|
|
|
xor ebx, ebx
|
|
xor ecx, ecx
|
|
|
|
mov edx, vertRemap
|
|
mov esi, verts
|
|
mov edi, vertexCache
|
|
mov eax, numVerts
|
|
and eax, ~3
|
|
jz done4
|
|
shl eax, 2
|
|
add edx, eax
|
|
neg eax
|
|
|
|
loop4:
|
|
prefetchnta [edx+128]
|
|
prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
|
|
|
|
cmp dword ptr [edx+eax+0], ebx
|
|
jne skip1
|
|
|
|
mov dword ptr [edx+eax+0], ecx
|
|
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
add ecx, 2
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
|
|
orps xmm0, xmm5
|
|
movaps [edi+0*16], xmm0
|
|
subps xmm0, xmm6
|
|
movaps [edi+1*16], xmm0
|
|
add edi, 2*16
|
|
|
|
skip1:
|
|
cmp dword ptr [edx+eax+4], ebx
|
|
jne skip2
|
|
|
|
mov dword ptr [edx+eax+4], ecx
|
|
movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
|
|
add ecx, 2
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
|
|
orps xmm1, xmm5
|
|
movaps [edi+0*16], xmm1
|
|
subps xmm1, xmm7
|
|
movaps [edi+1*16], xmm1
|
|
add edi, 2*16
|
|
|
|
skip2:
|
|
cmp dword ptr [edx+eax+8], ebx
|
|
jne skip3
|
|
|
|
mov dword ptr [edx+eax+8], ecx
|
|
movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
add ecx, 2
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
|
|
orps xmm2, xmm5
|
|
movaps [edi+0*16], xmm2
|
|
subps xmm2, xmm6
|
|
movaps [edi+1*16], xmm2
|
|
add edi, 2*16
|
|
|
|
skip3:
|
|
cmp dword ptr [edx+eax+12], ebx
|
|
jne skip4
|
|
|
|
mov dword ptr [edx+eax+12], ecx
|
|
movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
|
|
add ecx, 2
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
|
|
orps xmm3, xmm5
|
|
movaps [edi+0*16], xmm3
|
|
subps xmm3, xmm7
|
|
movaps [edi+1*16], xmm3
|
|
add edi, 2*16
|
|
|
|
skip4:
|
|
add esi, 4*DRAWVERT_SIZE
|
|
add eax, 4*4
|
|
jl loop4
|
|
|
|
done4:
|
|
mov eax, numVerts
|
|
and eax, 3
|
|
jz done1
|
|
shl eax, 2
|
|
add edx, eax
|
|
neg eax
|
|
|
|
loop1:
|
|
cmp dword ptr [edx+eax+0], ebx
|
|
jne skip0
|
|
|
|
mov dword ptr [edx+eax+0], ecx
|
|
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
add ecx, 2
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
orps xmm0, xmm5
|
|
movaps [edi+0*16], xmm0
|
|
subps xmm0, xmm6
|
|
movaps [edi+1*16], xmm0
|
|
add edi, 2*16
|
|
|
|
skip0:
|
|
|
|
add esi, DRAWVERT_SIZE
|
|
add eax, 4
|
|
jl loop1
|
|
|
|
done1:
|
|
pop ebx
|
|
mov outVerts, ecx
|
|
}
|
|
return outVerts;
|
|
|
|
#else
|
|
|
|
int outVerts = 0;
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( vertRemap[i] ) {
|
|
continue;
|
|
}
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[outVerts+0][0] = v[0];
|
|
vertexCache[outVerts+0][1] = v[1];
|
|
vertexCache[outVerts+0][2] = v[2];
|
|
vertexCache[outVerts+0][3] = 1.0f;
|
|
|
|
// R_SetupProjection() builds the projection matrix with a slight crunch
|
|
// for depth, which keeps this w=0 division from rasterizing right at the
|
|
// wrap around point and causing depth fighting with the rear caps
|
|
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
|
|
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
|
|
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
|
|
vertexCache[outVerts+1][3] = 0.0f;
|
|
vertRemap[i] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
return outVerts;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::CreateVertexProgramShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_SSE::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
|
|
#if 1
|
|
|
|
__asm {
|
|
movaps xmm4, SIMD_SP_lastOne
|
|
movaps xmm5, xmm4
|
|
movaps xmm6, xmm4
|
|
movaps xmm7, xmm4
|
|
|
|
mov esi, verts
|
|
mov edi, vertexCache
|
|
mov eax, numVerts
|
|
and eax, ~3
|
|
jz done4
|
|
shl eax, 5
|
|
add edi, eax
|
|
neg eax
|
|
|
|
loop4:
|
|
prefetchnta [esi+4*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET]
|
|
|
|
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
|
|
movaps [edi+eax+1*16], xmm0
|
|
orps xmm0, xmm4
|
|
movaps [edi+eax+0*16], xmm0
|
|
|
|
movss xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm1, [esi+1*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 2, 3, 1 )
|
|
movaps [edi+eax+3*16], xmm1
|
|
orps xmm1, xmm5
|
|
movaps [edi+eax+2*16], xmm1
|
|
|
|
movss xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm2, [esi+2*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 2, 3, 0, 1 );
|
|
movaps [edi+eax+5*16], xmm2
|
|
orps xmm2, xmm6
|
|
movaps [edi+eax+4*16], xmm2
|
|
|
|
movss xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
movhps xmm3, [esi+3*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+4]
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 0, 2, 3, 1 )
|
|
movaps [edi+eax+7*16], xmm3
|
|
orps xmm3, xmm7
|
|
movaps [edi+eax+6*16], xmm3
|
|
|
|
add esi, 4*DRAWVERT_SIZE
|
|
add eax, 4*8*4
|
|
jl loop4
|
|
|
|
done4:
|
|
mov eax, numVerts
|
|
and eax, 3
|
|
jz done1
|
|
shl eax, 5
|
|
add edi, eax
|
|
neg eax
|
|
|
|
loop1:
|
|
movss xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8]
|
|
movhps xmm0, [esi+0*DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 3, 0, 1 );
|
|
movaps [edi+eax+1*16], xmm0
|
|
orps xmm0, xmm4
|
|
movaps [edi+eax+0*16], xmm0
|
|
|
|
add esi, DRAWVERT_SIZE
|
|
add eax, 8*4
|
|
jl loop1
|
|
|
|
done1:
|
|
}
|
|
return numVerts * 2;
|
|
|
|
#else
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[i*2+0][0] = v[0];
|
|
vertexCache[i*2+0][1] = v[1];
|
|
vertexCache[i*2+0][2] = v[2];
|
|
vertexCache[i*2+0][3] = 1.0f;
|
|
|
|
vertexCache[i*2+1][0] = v[0];
|
|
vertexCache[i*2+1][1] = v[1];
|
|
vertexCache[i*2+1][2] = v[2];
|
|
vertexCache[i*2+1][3] = 0.0f;
|
|
}
|
|
return numVerts * 2;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample11kHzMonoPCMTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample11kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
|
|
__asm {
|
|
mov esi, src
|
|
mov edi, dest
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add esi, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 2*4*4
|
|
|
|
movsx ecx, word ptr [esi+eax+0]
|
|
cvtsi2ss xmm0, ecx
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi-2*4*4+0], xmm0
|
|
movhps [edi-2*4*4+8], xmm0
|
|
|
|
movsx edx, word ptr [esi+eax+2]
|
|
cvtsi2ss xmm1, edx
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi-1*4*4+0], xmm1
|
|
movhps [edi-1*4*4+8], xmm1
|
|
|
|
add eax, 2*2
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movsx ecx, word ptr [esi]
|
|
cvtsi2ss xmm0, ecx
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi+0], xmm0
|
|
movhps [edi+8], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample11kHzStereoPCMTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample11kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
|
|
__asm {
|
|
mov esi, src
|
|
mov edi, dest
|
|
|
|
mov eax, numSamples
|
|
test eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add esi, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 8*4
|
|
|
|
movsx ecx, word ptr [esi+eax+0]
|
|
cvtsi2ss xmm0, ecx
|
|
|
|
movsx edx, word ptr [esi+eax+2]
|
|
cvtsi2ss xmm1, edx
|
|
|
|
unpcklps xmm0, xmm1
|
|
|
|
movlps [edi-8*4+0], xmm0
|
|
movlps [edi-8*4+8], xmm0
|
|
movlps [edi-4*4+0], xmm0
|
|
movlps [edi-4*4+8], xmm0
|
|
|
|
add eax, 2*2
|
|
jl loop2
|
|
|
|
done2:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample22kHzMonoPCMTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample22kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
|
|
__asm {
|
|
mov esi, src
|
|
mov edi, dest
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add esi, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 4*4
|
|
|
|
movsx ecx, word ptr [esi+eax+0]
|
|
cvtsi2ss xmm0, ecx
|
|
|
|
movsx edx, word ptr [esi+eax+2]
|
|
cvtsi2ss xmm1, edx
|
|
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi-4*4+0], xmm0
|
|
movhps [edi-4*4+8], xmm0
|
|
|
|
add eax, 2*2
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movsx ecx, word ptr [esi]
|
|
cvtsi2ss xmm0, ecx
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample22kHzStereoPCMTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample22kHzStereoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
|
|
__asm {
|
|
mov esi, src
|
|
mov edi, dest
|
|
|
|
mov eax, numSamples
|
|
test eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add esi, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 4*4
|
|
|
|
movsx ecx, word ptr [esi+eax+0]
|
|
cvtsi2ss xmm0, ecx
|
|
movss [edi-4*4], xmm0
|
|
movss [edi-2*4], xmm0
|
|
|
|
movsx edx, word ptr [esi+eax+2]
|
|
cvtsi2ss xmm1, edx
|
|
movss [edi-3*4], xmm1
|
|
movss [edi-1*4], xmm1
|
|
|
|
add eax, 2*2
|
|
jl loop2
|
|
|
|
done2:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample44kHzMonoPCMTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample44kHzMonoPCMTo44kHz( float *dest, const short *src, const int numSamples ) {
|
|
__asm {
|
|
mov esi, src
|
|
mov edi, dest
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add esi, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 2*4
|
|
|
|
movsx ecx, word ptr [esi+eax+0]
|
|
cvtsi2ss xmm0, ecx
|
|
movss [edi-2*4], xmm0
|
|
|
|
movsx edx, word ptr [esi+eax+2]
|
|
cvtsi2ss xmm1, edx
|
|
movss [edi-1*4], xmm1
|
|
|
|
add eax, 2*2
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movsx ecx, word ptr [esi]
|
|
cvtsi2ss xmm0, ecx
|
|
movss [edi], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::UpSamplePCMTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
============
|
|
*/
|
|
void idSIMD_SSE::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
SSE_UpSample11kHzMonoPCMTo44kHz( dest, src, numSamples );
|
|
} else {
|
|
SSE_UpSample11kHzStereoPCMTo44kHz( dest, src, numSamples );
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
SSE_UpSample22kHzMonoPCMTo44kHz( dest, src, numSamples );
|
|
} else {
|
|
SSE_UpSample22kHzStereoPCMTo44kHz( dest, src, numSamples );
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
SSE_UpSample44kHzMonoPCMTo44kHz( dest, src, numSamples );
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample11kHzMonoOGGTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample11kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
|
|
float constant = 32768.0f;
|
|
__asm {
|
|
mov esi, src
|
|
mov edi, dest
|
|
movss xmm7, constant
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 2
|
|
add esi, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 2*16
|
|
|
|
movss xmm0, [esi+eax+0]
|
|
mulss xmm0, xmm7
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi-32], xmm0
|
|
movlps [edi-24], xmm0
|
|
|
|
movss xmm1, [esi+eax+4]
|
|
mulss xmm1, xmm7
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi-16], xmm1
|
|
movlps [edi- 8], xmm1
|
|
|
|
add eax, 2*4
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movss xmm0, [esi]
|
|
mulss xmm0, xmm7
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi+0], xmm0
|
|
movlps [edi+8], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample11kHzStereoOGGTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample11kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
|
|
float constant = 32768.0f;
|
|
__asm {
|
|
mov esi, src
|
|
mov ecx, [esi+0]
|
|
mov edx, [esi+4]
|
|
mov edi, dest
|
|
movss xmm7, constant
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add ecx, eax
|
|
add edx, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 4*16
|
|
|
|
movlps xmm0, [ecx+eax]
|
|
movlps xmm1, [edx+eax]
|
|
unpcklps xmm0, xmm1
|
|
mulps xmm0, xmm7
|
|
movlps [edi-8*8], xmm0
|
|
movlps [edi-7*8], xmm0
|
|
movlps [edi-6*8], xmm0
|
|
movlps [edi-5*8], xmm0
|
|
movhps [edi-4*8], xmm0
|
|
movhps [edi-3*8], xmm0
|
|
movhps [edi-2*8], xmm0
|
|
movhps [edi-1*8], xmm0
|
|
|
|
add eax, 2*4
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movss xmm0, [ecx]
|
|
movss xmm1, [edx]
|
|
unpcklps xmm0, xmm1
|
|
mulps xmm0, xmm7
|
|
movlps [edi+0*8], xmm0
|
|
movlps [edi+1*8], xmm0
|
|
movlps [edi+2*8], xmm0
|
|
movlps [edi+3*8], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample22kHzMonoOGGTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample22kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
|
|
float constant = 32768.0f;
|
|
__asm {
|
|
mov esi, src
|
|
mov edi, dest
|
|
movss xmm7, constant
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 2
|
|
add esi, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 2*8
|
|
|
|
movss xmm0, [esi+eax+0]
|
|
movss xmm1, [esi+eax+4]
|
|
shufps xmm0, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm0, xmm7
|
|
movlps [edi-16], xmm0
|
|
movhps [edi- 8], xmm0
|
|
|
|
add eax, 2*4
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movss xmm0, [esi]
|
|
mulss xmm0, xmm7
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
movlps [edi+0], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample22kHzStereoOGGTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample22kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
|
|
float constant = 32768.0f;
|
|
__asm {
|
|
mov esi, src
|
|
mov ecx, [esi+0]
|
|
mov edx, [esi+4]
|
|
mov edi, dest
|
|
movss xmm7, constant
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add ecx, eax
|
|
add edx, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 2*16
|
|
|
|
movlps xmm0, [ecx+eax]
|
|
movlps xmm1, [edx+eax]
|
|
unpcklps xmm0, xmm1
|
|
mulps xmm0, xmm7
|
|
movlps [edi-4*8], xmm0
|
|
movlps [edi-3*8], xmm0
|
|
movhps [edi-2*8], xmm0
|
|
movhps [edi-1*8], xmm0
|
|
|
|
add eax, 2*4
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movss xmm0, [ecx]
|
|
movss xmm1, [edx]
|
|
unpcklps xmm0, xmm1
|
|
mulps xmm0, xmm7
|
|
movlps [edi+0*8], xmm0
|
|
movlps [edi+1*8], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample44kHzMonoOGGTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample44kHzMonoOGGTo44kHz( float *dest, const float *src, const int numSamples ) {
|
|
float constant = 32768.0f;
|
|
KFLOAT_CA( mul, dest, src, constant, numSamples )
|
|
}
|
|
|
|
/*
|
|
============
|
|
SSE_UpSample44kHzStereoOGGTo44kHz
|
|
============
|
|
*/
|
|
static void SSE_UpSample44kHzStereoOGGTo44kHz( float *dest, const float * const *src, const int numSamples ) {
|
|
float constant = 32768.0f;
|
|
__asm {
|
|
mov esi, src
|
|
mov ecx, [esi+0]
|
|
mov edx, [esi+4]
|
|
mov edi, dest
|
|
movss xmm7, constant
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
|
|
mov eax, numSamples
|
|
and eax, ~1
|
|
jz done2
|
|
shl eax, 1
|
|
add ecx, eax
|
|
add edx, eax
|
|
neg eax
|
|
|
|
align 16
|
|
loop2:
|
|
add edi, 16
|
|
|
|
movlps xmm0, [ecx+eax]
|
|
movlps xmm1, [edx+eax]
|
|
unpcklps xmm0, xmm1
|
|
mulps xmm0, xmm7
|
|
movlps [edi-2*8], xmm0
|
|
movhps [edi-1*8], xmm0
|
|
|
|
add eax, 2*4
|
|
jl loop2
|
|
|
|
done2:
|
|
mov eax, numSamples
|
|
and eax, 1
|
|
jz done
|
|
|
|
movss xmm0, [ecx]
|
|
movss xmm1, [edx]
|
|
unpcklps xmm0, xmm1
|
|
mulps xmm0, xmm7
|
|
movlps [edi+0*8], xmm0
|
|
|
|
done:
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::UpSampleOGGTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
============
|
|
*/
|
|
void idSIMD_SSE::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
SSE_UpSample11kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
|
|
} else {
|
|
SSE_UpSample11kHzStereoOGGTo44kHz( dest, ogg, numSamples );
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
SSE_UpSample22kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
|
|
} else {
|
|
SSE_UpSample22kHzStereoOGGTo44kHz( dest, ogg, numSamples );
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
if ( numChannels == 1 ) {
|
|
SSE_UpSample44kHzMonoOGGTo44kHz( dest, ogg[0], numSamples );
|
|
} else {
|
|
SSE_UpSample44kHzStereoOGGTo44kHz( dest, ogg, numSamples );
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MixSoundTwoSpeakerMono
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
#if 1
|
|
|
|
ALIGN16( float incs[2] );
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
__asm {
|
|
mov eax, MIXBUFFER_SAMPLES
|
|
mov edi, mixBuffer
|
|
mov esi, samples
|
|
shl eax, 2
|
|
add esi, eax
|
|
neg eax
|
|
|
|
mov ecx, lastV
|
|
movlps xmm6, [ecx]
|
|
xorps xmm7, xmm7
|
|
movhps xmm7, incs
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
addps xmm6, xmm7
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
|
|
addps xmm7, xmm7
|
|
|
|
loop16:
|
|
add edi, 4*4*4
|
|
|
|
movaps xmm0, [esi+eax+0*4*4]
|
|
movaps xmm1, xmm0
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
mulps xmm0, xmm6
|
|
addps xmm0, [edi-4*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-4*4*4], xmm0
|
|
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
|
|
mulps xmm1, xmm6
|
|
addps xmm1, [edi-3*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-3*4*4], xmm1
|
|
|
|
movaps xmm2, [esi+eax+1*4*4]
|
|
movaps xmm3, xmm2
|
|
shufps xmm2, xmm2, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
mulps xmm2, xmm6
|
|
addps xmm2, [edi-2*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-2*4*4], xmm2
|
|
|
|
shufps xmm3, xmm3, R_SHUFFLEPS( 2, 2, 3, 3 )
|
|
mulps xmm3, xmm6
|
|
addps xmm3, [edi-1*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-1*4*4], xmm3
|
|
|
|
add eax, 2*4*4
|
|
|
|
jl loop16
|
|
}
|
|
|
|
#else
|
|
|
|
int i;
|
|
float incL;
|
|
float incR;
|
|
float sL0, sL1;
|
|
float sR0, sR1;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
sL0 = lastV[0];
|
|
sR0 = lastV[1];
|
|
sL1 = lastV[0] + incL;
|
|
sR1 = lastV[1] + incR;
|
|
|
|
incL *= 2;
|
|
incR *= 2;
|
|
|
|
for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
|
|
mixBuffer[i*2+0] += samples[i+0] * sL0;
|
|
mixBuffer[i*2+1] += samples[i+0] * sR0;
|
|
mixBuffer[i*2+2] += samples[i+1] * sL1;
|
|
mixBuffer[i*2+3] += samples[i+1] * sR1;
|
|
sL0 += incL;
|
|
sR0 += incR;
|
|
sL1 += incL;
|
|
sR1 += incR;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MixSoundTwoSpeakerStereo
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
#if 1
|
|
|
|
ALIGN16( float incs[2] );
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
__asm {
|
|
mov eax, MIXBUFFER_SAMPLES
|
|
mov edi, mixBuffer
|
|
mov esi, samples
|
|
shl eax, 3
|
|
add esi, eax
|
|
neg eax
|
|
|
|
mov ecx, lastV
|
|
movlps xmm6, [ecx]
|
|
xorps xmm7, xmm7
|
|
movhps xmm7, incs
|
|
shufps xmm6, xmm6, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
addps xmm6, xmm7
|
|
shufps xmm7, xmm7, R_SHUFFLEPS( 2, 3, 2, 3 )
|
|
addps xmm7, xmm7
|
|
|
|
loop16:
|
|
add edi, 4*4*4
|
|
|
|
movaps xmm0, [esi+eax+0*4*4]
|
|
mulps xmm0, xmm6
|
|
addps xmm0, [edi-4*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-4*4*4], xmm0
|
|
|
|
movaps xmm2, [esi+eax+1*4*4]
|
|
mulps xmm2, xmm6
|
|
addps xmm2, [edi-3*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-3*4*4], xmm2
|
|
|
|
movaps xmm3, [esi+eax+2*4*4]
|
|
mulps xmm3, xmm6
|
|
addps xmm3, [edi-2*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-2*4*4], xmm3
|
|
|
|
movaps xmm4, [esi+eax+3*4*4]
|
|
mulps xmm4, xmm6
|
|
addps xmm4, [edi-1*4*4]
|
|
addps xmm6, xmm7
|
|
movaps [edi-1*4*4], xmm4
|
|
|
|
add eax, 4*4*4
|
|
|
|
jl loop16
|
|
}
|
|
|
|
#else
|
|
|
|
int i;
|
|
float incL;
|
|
float incR;
|
|
float sL0, sL1;
|
|
float sR0, sR1;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
sL0 = lastV[0];
|
|
sR0 = lastV[1];
|
|
sL1 = lastV[0] + incL;
|
|
sR1 = lastV[1] + incR;
|
|
|
|
incL *= 2;
|
|
incR *= 2;
|
|
|
|
for( i = 0; i < MIXBUFFER_SAMPLES; i += 2 ) {
|
|
mixBuffer[i*2+0] += samples[i*2+0] * sL0;
|
|
mixBuffer[i*2+1] += samples[i*2+1] * sR0;
|
|
mixBuffer[i*2+2] += samples[i*2+2] * sL1;
|
|
mixBuffer[i*2+3] += samples[i*2+3] * sR1;
|
|
sL0 += incL;
|
|
sR0 += incR;
|
|
sL1 += incL;
|
|
sR1 += incR;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MixSoundSixSpeakerMono
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
#if 1
|
|
|
|
ALIGN16( float incs[6] );
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
__asm {
|
|
mov eax, MIXBUFFER_SAMPLES
|
|
mov edi, mixBuffer
|
|
mov esi, samples
|
|
shl eax, 2
|
|
add esi, eax
|
|
neg eax
|
|
|
|
mov ecx, lastV
|
|
movlps xmm2, [ecx+ 0]
|
|
movhps xmm2, [ecx+ 8]
|
|
movlps xmm3, [ecx+16]
|
|
movaps xmm4, xmm2
|
|
shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
|
|
xorps xmm5, xmm5
|
|
movhps xmm5, incs
|
|
movlps xmm7, incs+8
|
|
movhps xmm7, incs+16
|
|
addps xmm3, xmm5
|
|
addps xmm4, xmm7
|
|
shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
movaps xmm6, xmm7
|
|
shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
addps xmm5, xmm5
|
|
addps xmm6, xmm6
|
|
addps xmm7, xmm7
|
|
|
|
loop24:
|
|
add edi, 6*16
|
|
|
|
movaps xmm0, [esi+eax]
|
|
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
|
|
mulps xmm1, xmm2
|
|
addps xmm1, [edi-6*16]
|
|
addps xmm2, xmm5
|
|
movaps [edi-6*16], xmm1
|
|
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 1, 1 )
|
|
mulps xmm1, xmm3
|
|
addps xmm1, [edi-5*16]
|
|
addps xmm3, xmm6
|
|
movaps [edi-5*16], xmm1
|
|
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 1, 1, 1, 1 )
|
|
mulps xmm1, xmm4
|
|
addps xmm1, [edi-4*16]
|
|
addps xmm4, xmm7
|
|
movaps [edi-4*16], xmm1
|
|
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 2, 2 )
|
|
mulps xmm1, xmm2
|
|
addps xmm1, [edi-3*16]
|
|
addps xmm2, xmm5
|
|
movaps [edi-3*16], xmm1
|
|
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 2, 2, 3, 3 )
|
|
mulps xmm1, xmm3
|
|
addps xmm1, [edi-2*16]
|
|
addps xmm3, xmm6
|
|
movaps [edi-2*16], xmm1
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 3, 3, 3, 3 )
|
|
mulps xmm0, xmm4
|
|
addps xmm0, [edi-1*16]
|
|
addps xmm4, xmm7
|
|
movaps [edi-1*16], xmm0
|
|
|
|
add eax, 4*4
|
|
|
|
jl loop24
|
|
}
|
|
|
|
#else
|
|
|
|
int i;
|
|
float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
|
|
float incL0, incL1, incL2, incL3, incL4, incL5;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
sL0 = lastV[0];
|
|
sL1 = lastV[1];
|
|
sL2 = lastV[2];
|
|
sL3 = lastV[3];
|
|
sL4 = lastV[4];
|
|
sL5 = lastV[5];
|
|
|
|
sL6 = lastV[0] + incL0;
|
|
sL7 = lastV[1] + incL1;
|
|
sL8 = lastV[2] + incL2;
|
|
sL9 = lastV[3] + incL3;
|
|
sL10 = lastV[4] + incL4;
|
|
sL11 = lastV[5] + incL5;
|
|
|
|
incL0 *= 2;
|
|
incL1 *= 2;
|
|
incL2 *= 2;
|
|
incL3 *= 2;
|
|
incL4 *= 2;
|
|
incL5 *= 2;
|
|
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
|
|
mixBuffer[i*6+ 0] += samples[i+0] * sL0;
|
|
mixBuffer[i*6+ 1] += samples[i+0] * sL1;
|
|
mixBuffer[i*6+ 2] += samples[i+0] * sL2;
|
|
mixBuffer[i*6+ 3] += samples[i+0] * sL3;
|
|
|
|
mixBuffer[i*6+ 4] += samples[i+0] * sL4;
|
|
mixBuffer[i*6+ 5] += samples[i+0] * sL5;
|
|
mixBuffer[i*6+ 6] += samples[i+1] * sL6;
|
|
mixBuffer[i*6+ 7] += samples[i+1] * sL7;
|
|
|
|
mixBuffer[i*6+ 8] += samples[i+1] * sL8;
|
|
mixBuffer[i*6+ 9] += samples[i+1] * sL9;
|
|
mixBuffer[i*6+10] += samples[i+1] * sL10;
|
|
mixBuffer[i*6+11] += samples[i+1] * sL11;
|
|
|
|
sL0 += incL0;
|
|
sL1 += incL1;
|
|
sL2 += incL2;
|
|
sL3 += incL3;
|
|
|
|
sL4 += incL4;
|
|
sL5 += incL5;
|
|
sL6 += incL0;
|
|
sL7 += incL1;
|
|
|
|
sL8 += incL2;
|
|
sL9 += incL3;
|
|
sL10 += incL4;
|
|
sL11 += incL5;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MixSoundSixSpeakerStereo
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
#if 1
|
|
|
|
ALIGN16( float incs[6] );
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
incs[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incs[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incs[2] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incs[3] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incs[4] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incs[5] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
__asm {
|
|
mov eax, MIXBUFFER_SAMPLES
|
|
mov edi, mixBuffer
|
|
mov esi, samples
|
|
shl eax, 3
|
|
add esi, eax
|
|
neg eax
|
|
|
|
mov ecx, lastV
|
|
movlps xmm2, [ecx+ 0]
|
|
movhps xmm2, [ecx+ 8]
|
|
movlps xmm3, [ecx+16]
|
|
movaps xmm4, xmm2
|
|
shufps xmm3, xmm2, R_SHUFFLEPS( 0, 1, 0, 1 )
|
|
shufps xmm4, xmm3, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
|
|
xorps xmm5, xmm5
|
|
movhps xmm5, incs
|
|
movlps xmm7, incs+ 8
|
|
movhps xmm7, incs+16
|
|
addps xmm3, xmm5
|
|
addps xmm4, xmm7
|
|
shufps xmm5, xmm7, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
movaps xmm6, xmm7
|
|
shufps xmm6, xmm5, R_SHUFFLEPS( 2, 3, 0, 1 )
|
|
addps xmm5, xmm5
|
|
addps xmm6, xmm6
|
|
addps xmm7, xmm7
|
|
|
|
loop12:
|
|
add edi, 3*16
|
|
|
|
movaps xmm0, [esi+eax+0]
|
|
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 0, 0 )
|
|
mulps xmm1, xmm2
|
|
addps xmm1, [edi-3*16]
|
|
addps xmm2, xmm5
|
|
movaps [edi-3*16], xmm1
|
|
|
|
movaps xmm1, xmm0
|
|
shufps xmm1, xmm1, R_SHUFFLEPS( 0, 1, 2, 3 )
|
|
mulps xmm1, xmm3
|
|
addps xmm1, [edi-2*16]
|
|
addps xmm3, xmm6
|
|
movaps [edi-2*16], xmm1
|
|
|
|
add eax, 4*4
|
|
|
|
shufps xmm0, xmm0, R_SHUFFLEPS( 2, 2, 2, 3 )
|
|
mulps xmm0, xmm4
|
|
addps xmm0, [edi-1*16]
|
|
addps xmm4, xmm7
|
|
movaps [edi-1*16], xmm0
|
|
|
|
jl loop12
|
|
|
|
emms
|
|
}
|
|
|
|
#else
|
|
|
|
int i;
|
|
float sL0, sL1, sL2, sL3, sL4, sL5, sL6, sL7, sL8, sL9, sL10, sL11;
|
|
float incL0, incL1, incL2, incL3, incL4, incL5;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
sL0 = lastV[0];
|
|
sL1 = lastV[1];
|
|
sL2 = lastV[2];
|
|
sL3 = lastV[3];
|
|
sL4 = lastV[4];
|
|
sL5 = lastV[5];
|
|
|
|
sL6 = lastV[0] + incL0;
|
|
sL7 = lastV[1] + incL1;
|
|
sL8 = lastV[2] + incL2;
|
|
sL9 = lastV[3] + incL3;
|
|
sL10 = lastV[4] + incL4;
|
|
sL11 = lastV[5] + incL5;
|
|
|
|
incL0 *= 2;
|
|
incL1 *= 2;
|
|
incL2 *= 2;
|
|
incL3 *= 2;
|
|
incL4 *= 2;
|
|
incL5 *= 2;
|
|
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
|
|
mixBuffer[i*6+ 0] += samples[i*2+0+0] * sL0;
|
|
mixBuffer[i*6+ 1] += samples[i*2+0+1] * sL1;
|
|
mixBuffer[i*6+ 2] += samples[i*2+0+0] * sL2;
|
|
mixBuffer[i*6+ 3] += samples[i*2+0+0] * sL3;
|
|
|
|
mixBuffer[i*6+ 4] += samples[i*2+0+0] * sL4;
|
|
mixBuffer[i*6+ 5] += samples[i*2+0+1] * sL5;
|
|
mixBuffer[i*6+ 6] += samples[i*2+2+0] * sL6;
|
|
mixBuffer[i*6+ 7] += samples[i*2+2+1] * sL7;
|
|
|
|
mixBuffer[i*6+ 8] += samples[i*2+2+0] * sL8;
|
|
mixBuffer[i*6+ 9] += samples[i*2+2+0] * sL9;
|
|
mixBuffer[i*6+10] += samples[i*2+2+0] * sL10;
|
|
mixBuffer[i*6+11] += samples[i*2+2+1] * sL11;
|
|
|
|
sL0 += incL0;
|
|
sL1 += incL1;
|
|
sL2 += incL2;
|
|
sL3 += incL3;
|
|
|
|
sL4 += incL4;
|
|
sL5 += incL5;
|
|
sL6 += incL0;
|
|
sL7 += incL1;
|
|
|
|
sL8 += incL2;
|
|
sL9 += incL3;
|
|
sL10 += incL4;
|
|
sL11 += incL5;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_SSE::MixedSoundToSamples
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_SSE::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
|
|
#if 1
|
|
|
|
assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
|
|
|
|
__asm {
|
|
|
|
mov eax, numSamples
|
|
mov edi, mixBuffer
|
|
mov esi, samples
|
|
shl eax, 2
|
|
add edi, eax
|
|
neg eax
|
|
|
|
loop16:
|
|
|
|
movaps xmm0, [edi+eax+0*16]
|
|
movaps xmm2, [edi+eax+1*16]
|
|
movaps xmm4, [edi+eax+2*16]
|
|
movaps xmm6, [edi+eax+3*16]
|
|
|
|
add esi, 4*4*2
|
|
|
|
movhlps xmm1, xmm0
|
|
movhlps xmm3, xmm2
|
|
movhlps xmm5, xmm4
|
|
movhlps xmm7, xmm6
|
|
|
|
prefetchnta [edi+eax+64]
|
|
|
|
cvtps2pi mm0, xmm0
|
|
cvtps2pi mm2, xmm2
|
|
cvtps2pi mm4, xmm4
|
|
cvtps2pi mm6, xmm6
|
|
|
|
prefetchnta [edi+eax+128]
|
|
|
|
cvtps2pi mm1, xmm1
|
|
cvtps2pi mm3, xmm3
|
|
cvtps2pi mm5, xmm5
|
|
cvtps2pi mm7, xmm7
|
|
|
|
add eax, 4*16
|
|
|
|
packssdw mm0, mm1
|
|
packssdw mm2, mm3
|
|
packssdw mm4, mm5
|
|
packssdw mm6, mm7
|
|
|
|
movq [esi-4*4*2], mm0
|
|
movq [esi-3*4*2], mm2
|
|
movq [esi-2*4*2], mm4
|
|
movq [esi-1*4*2], mm6
|
|
|
|
jl loop16
|
|
|
|
emms
|
|
}
|
|
|
|
#else
|
|
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
if ( mixBuffer[i] <= -32768.0f ) {
|
|
samples[i] = -32768;
|
|
} else if ( mixBuffer[i] >= 32767.0f ) {
|
|
samples[i] = 32767;
|
|
} else {
|
|
samples[i] = (short) mixBuffer[i];
|
|
}
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
#endif /* _WIN32 */
|