// etqw-sdk/source/idlib/math/Simd_SSE3.cpp
//
// 565 lines
// 13 KiB
// C++

// Copyright (C) 2007 Id Software, Inc.
//
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#include "Simd_MMX.h"
#include "Simd_SSE.h"
#include "Simd_SSE2.h"
#include "Simd_SSE3.h"
//===============================================================
//
// SSE3 implementation of idSIMDProcessor
//
//===============================================================
#ifdef ID_WIN_X86_ASM
#include "Simd_InstructionMacros.h"
// Four-lane bit mask with the first three 32-bit lanes fully set and the last lane zero.
// The shadow vertex routines in this file AND it against a loaded vector to clear the fourth lane.
// It is loaded with movaps, so it has to be 16-byte aligned (presumably what ALIGN4_INIT4 provides - confirm in the macro header).
ALIGN4_INIT4( unsigned long SIMD_SP_clearLast, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 );
// Vector ( 0, 0, 0, 1 ). The shadow vertex routines in this file OR it into a vector whose fourth
// lane was just cleared, which sets that lane to 1.0f.
ALIGN4_INIT4( float SIMD_SP_lastOne, 0.0f, 0.0f, 0.0f, 1.0f );
/*
============
SSE3_Dot

Stores the four-component dot product of v1 and v2 in result.

Both vectors are accessed with aligned 16-byte loads, so they must sit on
16-byte boundaries. The lane comments assume the _haddps macro behaves like
the haddps instruction.
============
*/
void SSE3_Dot( const idVec4 &v1, const idVec4 &v2, float &result ) {
	__asm {
		// eax = address of v1, edx = address of v2, ecx = address of result
		mov			eax, v1
		mov			edx, v2
		movaps		xmm0, [eax]
		mov			ecx, result

		// lane-wise products
		mulps		xmm0, [edx]

		// first pass adds neighbouring lanes, second pass leaves the full sum in every lane
		_haddps( _xmm0, _xmm0 )
		_haddps( _xmm0, _xmm0 )

		movss		[ecx], xmm0
	}
}
/*
============
SSE3_Dot4

Computes four independent four-component dot products:
result[i] = v1[i] . v2[i] for i = 0..3.

v1, v2 and result are all accessed with aligned 16-byte moves, so each of
them must sit on a 16-byte boundary. The lane comments assume the _haddps
macro behaves like the haddps instruction.
============
*/
void SSE3_Dot4( const idVec4 v1[4], const idVec4 v2[4], float result[4] ) {
	__asm {
		mov			eax, v1
		mov			edx, v2
		mov			ecx, result

		// fetch the four left-hand vectors
		movaps		xmm0, [eax+ 0]
		movaps		xmm1, [eax+16]
		movaps		xmm2, [eax+32]
		movaps		xmm3, [eax+48]

		// lane-wise products with the matching right-hand vectors
		mulps		xmm0, [edx+ 0]
		mulps		xmm1, [edx+16]
		mulps		xmm2, [edx+32]
		mulps		xmm3, [edx+48]

		// xmm0 = partial sums of pairs 0 and 1, xmm2 = partial sums of pairs 2 and 3
		_haddps( _xmm0, _xmm1 )
		_haddps( _xmm2, _xmm3 )

		// xmm0 = ( dot0, dot1, dot2, dot3 )
		_haddps( _xmm0, _xmm2 )

		movaps		[ecx], xmm0
	}
}
/*
============
idSIMD_SSE3::GetName

Returns the human readable list of instruction sets this processor
implementation is labelled with.
============
*/
const char * idSIMD_SSE3::GetName( void ) const {
	static const char processorName[] = "MMX & SSE & SSE2 & SSE3";
	return processorName;
}
/*
============
idSIMD_SSE3::TransformVerts

For each of the numVerts vertices: walks that vertex's weight entries,
and for every entry multiplies the next base vector lane-wise with the
three 16-byte rows of the joint matrix the entry refers to, accumulating
the products per row. The four lanes of each accumulated row are then
added together and the three sums are stored as three floats at
DRAWVERT_XYZ_OFFSET of the output vertex.

Each weight entry holds the byte offset of its joint matrix relative to
joints. A vertex's weight list ends at the entry whose next-vertex offset
field equals JOINTWEIGHT_SIZE. One base vector is consumed per weight
entry. numWeights is not read.

joints and base are used as movaps / mulps memory operands, which is why
both are asserted to be 16-byte aligned.

The lane comments assume the _haddps macro behaves like the haddps
instruction and that R_SHUFFLE_D( 1, 0, 2, 3 ) swaps the two low lanes;
both are defined outside this file.
============
*/
void VPCALL idSIMD_SSE3::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm
{
// nothing to do for a zero vertex count
mov eax, numVerts
test eax, eax
jz done
// eax = size of the whole output array in bytes
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
// ecx = end of the output array, eax = negative byte offset that counts up towards zero
add ecx, eax
neg eax
loopVert:
// first weight entry of this vertex: ebx = byte offset of its joint matrix
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
movaps xmm2, [esi]
add edx, JOINTWEIGHT_SIZE
movaps xmm0, xmm2
add esi, BASEVECTOR_SIZE
movaps xmm1, xmm2
mulps xmm0, [edi+ebx+ 0] // xmm0 = base * joint row 0 ( m0, m1, m2, t0 )
mulps xmm1, [edi+ebx+16] // xmm1 = base * joint row 1 ( m3, m4, m5, t1 )
mulps xmm2, [edi+ebx+32] // xmm2 = base * joint row 2 ( m6, m7, m8, t2 )
// edx was already advanced, so look back at the entry just consumed to see whether it was the last one
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
je doneWeight
loopWeight:
// further weight entries of the same vertex are accumulated into xmm0-xmm2
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
movaps xmm5, [esi]
add edx, JOINTWEIGHT_SIZE
movaps xmm3, xmm5
add esi, BASEVECTOR_SIZE
movaps xmm4, xmm5
mulps xmm3, [edi+ebx+ 0] // xmm3 = base * joint row 0 ( m0, m1, m2, t0 )
mulps xmm4, [edi+ebx+16] // xmm4 = base * joint row 1 ( m3, m4, m5, t1 )
mulps xmm5, [edi+ebx+32] // xmm5 = base * joint row 2 ( m6, m7, m8, t2 )
// the compare is issued before the adds; the SSE adds do not change the flags the jne tests
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
addps xmm0, xmm3
addps xmm1, xmm4
addps xmm2, xmm5
jne loopWeight
doneWeight:
// the flags produced by this add are the ones the jl at the bottom of the loop tests
add eax, DRAWVERT_SIZE
// xmm0 = ( row0 lanes 0+1, row0 lanes 2+3, row1 lanes 0+1, row1 lanes 2+3 )
_haddps( _xmm0, _xmm1 )
// xmm2 = ( row2 lanes 0+1, row2 lanes 2+3, x, y )
_haddps( _xmm2, _xmm0 )
// store x and y from the two high lanes
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm2
// add the two low lanes to get z in lane 0 of xmm3
pshufd xmm3, xmm2, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm3, xmm2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm3
// loop while the byte offset is still negative
jl loopVert
done:
}
}
/*
============
idSIMD_SSE3::TransformShadowVerts

Overload taking jointWeight_t weights and writing idDrawVert output.

For each of the numVerts vertices: loads 16 bytes from DRAWVERT_XYZ_OFFSET
of the base vertex, forces the fourth lane to 1.0f, multiplies the result
lane-wise with the three 16-byte rows of one joint matrix, adds the four
lanes of each row together and stores the three sums as three floats at
DRAWVERT_XYZ_OFFSET of the output vertex.

Only the first weight entry of each vertex is read: it supplies the byte
offset of the joint matrix relative to joints, and its next-vertex offset
field is added to the weights pointer to reach the next vertex.
numWeights is not read.

joints and base are used as movaps / mulps memory operands, which is why
both are asserted to be 16-byte aligned.

The lane comments assume the _haddps macro behaves like the haddps
instruction and that R_SHUFFLE_D( 1, 0, 2, 3 ) swaps the two low lanes;
both are defined outside this file.
============
*/
void VPCALL idSIMD_SSE3::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const jointWeight_t *weights, const int numWeights ) {
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm
{
// nothing to do for a zero vertex count
mov eax, numVerts
test eax, eax
jz done
// eax = size of the whole output array in bytes
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
// ecx = end of the output array, eax = negative byte offset that counts up towards zero
add ecx, eax
neg eax
// xmm0 and xmm1 hold the two constants for the whole loop
movaps xmm0, SIMD_SP_clearLast
movaps xmm1, SIMD_SP_lastOne
loopVert:
add esi, DRAWVERT_SIZE
// ebx = byte offset of the joint matrix named by the first weight entry
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
movaps xmm3, [esi-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
// replace whatever follows xyz in the loaded 16 bytes with 1.0f
andps xmm3, xmm0
orps xmm3, xmm1
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [edi+ebx+ 0] // xmm3 = ( x, y, z, 1 ) * joint row 0 ( m0, m1, m2, t0 )
mulps xmm4, [edi+ebx+16] // xmm4 = ( x, y, z, 1 ) * joint row 1 ( m3, m4, m5, t1 )
mulps xmm5, [edi+ebx+32] // xmm5 = ( x, y, z, 1 ) * joint row 2 ( m6, m7, m8, t2 )
// skip to the first weight entry of the next vertex
add edx, dword ptr [edx+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET]
// the flags produced by this add are the ones the jl at the bottom of the loop tests
add eax, DRAWVERT_SIZE
// xmm3 = ( row0 lanes 0+1, row0 lanes 2+3, row1 lanes 0+1, row1 lanes 2+3 )
_haddps( _xmm3, _xmm4 )
// xmm5 = ( row2 lanes 0+1, row2 lanes 2+3, x', y' )
_haddps( _xmm5, _xmm3 )
// store x' and y' from the two high lanes
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm5
// add the two low lanes to get z' in lane 0 of xmm7
pshufd xmm7, xmm5, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm5
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7
// loop while the byte offset is still negative
jl loopVert
done:
}
}
/*
============
idSIMD_SSE3::TransformShadowVerts

Overload taking one 16-bit weight value per vertex and writing idDrawVert
output.

For each of the numVerts vertices: loads 16 bytes from DRAWVERT_XYZ_OFFSET
of the base vertex, forces the fourth lane to 1.0f, multiplies the result
lane-wise with the three 16-byte rows of one joint matrix, adds the four
lanes of each row together and stores the three sums as three floats at
DRAWVERT_XYZ_OFFSET of the output vertex.

The 16-bit value read from weights is zero-extended and used directly as
the byte offset of the joint matrix relative to joints; weights advances
by two bytes per vertex. numWeights is not read.

joints and base are used as movaps / mulps memory operands, which is why
both are asserted to be 16-byte aligned.

The lane comments assume the _haddps macro behaves like the haddps
instruction and that R_SHUFFLE_D( 1, 0, 2, 3 ) swaps the two low lanes;
both are defined outside this file.
============
*/
void VPCALL idSIMD_SSE3::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int numWeights ) {
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm
{
// nothing to do for a zero vertex count
mov eax, numVerts
test eax, eax
jz done
// eax = size of the whole output array in bytes
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
// ecx = end of the output array, eax = negative byte offset that counts up towards zero
add ecx, eax
neg eax
// xmm0 and xmm1 hold the two constants for the whole loop
movaps xmm0, SIMD_SP_clearLast
movaps xmm1, SIMD_SP_lastOne
loopVert:
add esi, DRAWVERT_SIZE
// ebx = zero-extended 16-bit value, used below as a byte offset from joints
movzx ebx, word ptr [edx]
movaps xmm3, [esi-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
// replace whatever follows xyz in the loaded 16 bytes with 1.0f
andps xmm3, xmm0
orps xmm3, xmm1
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [edi+ebx+ 0] // xmm3 = ( x, y, z, 1 ) * joint row 0 ( m0, m1, m2, t0 )
mulps xmm4, [edi+ebx+16] // xmm4 = ( x, y, z, 1 ) * joint row 1 ( m3, m4, m5, t1 )
mulps xmm5, [edi+ebx+32] // xmm5 = ( x, y, z, 1 ) * joint row 2 ( m6, m7, m8, t2 )
// one 16-bit entry per vertex
add edx, 2
// the flags produced by this add are the ones the jl at the bottom of the loop tests
add eax, DRAWVERT_SIZE
// xmm3 = ( row0 lanes 0+1, row0 lanes 2+3, row1 lanes 0+1, row1 lanes 2+3 )
_haddps( _xmm3, _xmm4 )
// xmm5 = ( row2 lanes 0+1, row2 lanes 2+3, x', y' )
_haddps( _xmm5, _xmm3 )
// store x' and y' from the two high lanes
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm5
// add the two low lanes to get z' in lane 0 of xmm7
pshufd xmm7, xmm5, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm5
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7
// loop while the byte offset is still negative
jl loopVert
done:
}
}
/*
============
idSIMD_SSE3::TransformShadowVerts

Overload taking one 16-bit weight value per vertex and writing
shadowCache_t output.

For each of the numVerts vertices: loads 16 bytes from DRAWVERT_XYZ_OFFSET
of the base vertex, forces the fourth lane to 1.0f, multiplies the result
lane-wise with the three 16-byte rows of one joint matrix, adds the four
lanes of each row together and stores the three sums as three floats at
the start of the output element. Output elements are SHADOWVERT_SIZE bytes
apart; only the first twelve bytes of each one are written here.

The 16-bit value read from weights is zero-extended and used directly as
the byte offset of the joint matrix relative to joints; weights advances
by two bytes per vertex. numWeights is not read.

joints and base are used as movaps / mulps memory operands, which is why
both are asserted to be 16-byte aligned.

The lane comments assume the _haddps macro behaves like the haddps
instruction and that R_SHUFFLE_D( 1, 0, 2, 3 ) swaps the two low lanes;
both are defined outside this file.
============
*/
void VPCALL idSIMD_SSE3::TransformShadowVerts( shadowCache_t *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int numWeights ) {
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm
{
// nothing to do for a zero vertex count
mov eax, numVerts
test eax, eax
jz done
// eax = size of the whole output array in bytes
imul eax, SHADOWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
// ecx = end of the output array, eax = negative byte offset that counts up towards zero
add ecx, eax
neg eax
// xmm0 and xmm1 hold the two constants for the whole loop
movaps xmm0, SIMD_SP_clearLast
movaps xmm1, SIMD_SP_lastOne
loopVert:
// the input stride is DRAWVERT_SIZE while the output stride is SHADOWVERT_SIZE
add esi, DRAWVERT_SIZE
// ebx = zero-extended 16-bit value, used below as a byte offset from joints
movzx ebx, word ptr [edx]
movaps xmm3, [esi-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0]
// replace whatever follows xyz in the loaded 16 bytes with 1.0f
andps xmm3, xmm0
orps xmm3, xmm1
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, [edi+ebx+ 0] // xmm3 = ( x, y, z, 1 ) * joint row 0 ( m0, m1, m2, t0 )
mulps xmm4, [edi+ebx+16] // xmm4 = ( x, y, z, 1 ) * joint row 1 ( m3, m4, m5, t1 )
mulps xmm5, [edi+ebx+32] // xmm5 = ( x, y, z, 1 ) * joint row 2 ( m6, m7, m8, t2 )
// one 16-bit entry per vertex
add edx, 2
// the flags produced by this add are the ones the jl at the bottom of the loop tests
add eax, SHADOWVERT_SIZE
// xmm3 = ( row0 lanes 0+1, row0 lanes 2+3, row1 lanes 0+1, row1 lanes 2+3 )
_haddps( _xmm3, _xmm4 )
// xmm5 = ( row2 lanes 0+1, row2 lanes 2+3, x', y' )
_haddps( _xmm5, _xmm3 )
// store x' and y' from the two high lanes
movhps [ecx+eax-SHADOWVERT_SIZE+0], xmm5
// add the two low lanes to get z' in lane 0 of xmm7
pshufd xmm7, xmm5, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm5
movss [ecx+eax-SHADOWVERT_SIZE+8], xmm7
// loop while the byte offset is still negative
jl loopVert
done:
}
}
#if !defined(SD_USE_DRAWVERT_SIZE_32)
/*
============
idSIMD_SSE3::TransformVertsAndTangents

For each of the numVerts vertices:
 1. Builds a blended three-row matrix in xmm0-xmm2 by summing, over the
    vertex's weight entries, the float at JOINTWEIGHT_WEIGHT_OFFSET times
    the three 16-byte rows of the joint matrix the entry refers to.
 2. Multiplies three consecutive base vectors lane-wise with those rows,
    adds the four lanes of each row together, and stores the resulting
    three floats at DRAWVERT_XYZ_OFFSET, DRAWVERT_NORMAL_OFFSET and
    DRAWVERT_TANGENT_OFFSET of the output vertex respectively.

Each weight entry holds the byte offset of its joint matrix relative to
joints. A vertex's weight list ends at the entry whose next-vertex offset
field equals JOINTWEIGHT_SIZE. base advances by 3*BASEVECTOR_SIZE per
vertex. numWeights is not read.

joints and base are used as movaps / mulps memory operands, which is why
both are asserted to be 16-byte aligned.

The lane comments assume the _haddps macro behaves like the haddps
instruction, that R_SHUFFLE_PS( 0, 0, 0, 0 ) replicates lane 0 into all
lanes and that R_SHUFFLE_D( 1, 0, 2, 3 ) swaps the two low lanes; all
three are defined outside this file.
============
*/
void VPCALL idSIMD_SSE3::TransformVertsAndTangents( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm
{
// nothing to do for a zero vertex count
mov eax, numVerts
test eax, eax
jz done
// eax = size of the whole output array in bytes
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
// ecx = end of the output array, eax = negative byte offset that counts up towards zero
add ecx, eax
neg eax
loopVert:
// first weight entry of this vertex: replicate its weight and scale the joint matrix rows by it
movss xmm2, [edx+JOINTWEIGHT_WEIGHT_OFFSET]
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
shufps xmm2, xmm2, R_SHUFFLE_PS( 0, 0, 0, 0 )
add edx, JOINTWEIGHT_SIZE
movaps xmm0, xmm2
movaps xmm1, xmm2
mulps xmm0, [edi+ebx+ 0] // xmm0 = weight * joint row 0 ( m0, m1, m2, t0 )
mulps xmm1, [edi+ebx+16] // xmm1 = weight * joint row 1 ( m3, m4, m5, t1 )
mulps xmm2, [edi+ebx+32] // xmm2 = weight * joint row 2 ( m6, m7, m8, t2 )
// edx was already advanced, so look back at the entry just consumed to see whether it was the last one
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
je doneWeight
loopWeight:
// further weight entries of the same vertex are accumulated into xmm0-xmm2
movss xmm5, [edx+JOINTWEIGHT_WEIGHT_OFFSET]
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
shufps xmm5, xmm5, R_SHUFFLE_PS( 0, 0, 0, 0 )
add edx, JOINTWEIGHT_SIZE
movaps xmm3, xmm5
movaps xmm4, xmm5
mulps xmm3, [edi+ebx+ 0] // xmm3 = weight * joint row 0 ( m0, m1, m2, t0 )
mulps xmm4, [edi+ebx+16] // xmm4 = weight * joint row 1 ( m3, m4, m5, t1 )
mulps xmm5, [edi+ebx+32] // xmm5 = weight * joint row 2 ( m6, m7, m8, t2 )
// the compare is issued before the adds; the SSE adds do not change the flags the jne tests
cmp dword ptr [edx-JOINTWEIGHT_SIZE+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET], JOINTWEIGHT_SIZE
addps xmm0, xmm3
addps xmm1, xmm4
addps xmm2, xmm5
jne loopWeight
doneWeight:
// advance past this vertex's three base vectors; they are addressed with negative offsets below
add esi, 3*BASEVECTOR_SIZE
// the flags produced by this add are the ones the jl at the bottom of the loop tests
add eax, DRAWVERT_SIZE
// transform vertex
movaps xmm3, [esi-3*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
// xmm3 = ( row0 lanes 0+1, row0 lanes 2+3, row1 lanes 0+1, row1 lanes 2+3 )
_haddps( _xmm3, _xmm4 )
// xmm5 = ( row2 lanes 0+1, row2 lanes 2+3, x, y )
_haddps( _xmm5, _xmm3 )
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm5
// add the two low lanes to get z in lane 0 of xmm7
pshufd xmm7, xmm5, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm5
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7
// transform normal
movaps xmm3, [esi-2*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
_haddps( _xmm3, _xmm4 )
_haddps( _xmm5, _xmm3 )
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm5
pshufd xmm7, xmm5, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm5
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7
// transform first tangent
// the blended matrix is not needed after this, so the products overwrite xmm0-xmm2 directly
movaps xmm3, [esi-1*BASEVECTOR_SIZE]
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
_haddps( _xmm0, _xmm1 )
_haddps( _xmm2, _xmm0 )
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+0], xmm2
pshufd xmm7, xmm2, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+8], xmm7
// loop while the byte offset is still negative
jl loopVert
done:
}
}
/*
============
idSIMD_SSE3::TransformVertsAndTangentsFast

For each of the numVerts vertices: loads the three 16-byte rows of one
joint matrix unscaled, multiplies three consecutive base vectors lane-wise
with those rows, adds the four lanes of each row together, and stores the
resulting three floats at DRAWVERT_XYZ_OFFSET, DRAWVERT_NORMAL_OFFSET and
DRAWVERT_TANGENT_OFFSET of the output vertex respectively.

Only the first weight entry of each vertex is read: it supplies the byte
offset of the joint matrix relative to joints, and its next-vertex offset
field is added to the weights pointer to reach the next vertex. The weight
value itself is never read. base advances by 3*BASEVECTOR_SIZE per vertex.
numWeights is not read.

joints and base are used as movaps memory operands, which is why both are
asserted to be 16-byte aligned.

The lane comments assume the _haddps macro behaves like the haddps
instruction and that R_SHUFFLE_D( 1, 0, 2, 3 ) swaps the two low lanes;
both are defined outside this file.
============
*/
void VPCALL idSIMD_SSE3::TransformVertsAndTangentsFast( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
assert_16_byte_aligned( joints );
assert_16_byte_aligned( base );
__asm
{
// nothing to do for a zero vertex count
mov eax, numVerts
test eax, eax
jz done
// eax = size of the whole output array in bytes
imul eax, DRAWVERT_SIZE
mov ecx, verts
mov edx, weights
mov esi, base
mov edi, joints
// ecx = end of the output array, eax = negative byte offset that counts up towards zero
add ecx, eax
neg eax
loopVert:
// ebx = byte offset of the joint matrix named by the first weight entry
mov ebx, dword ptr [edx+JOINTWEIGHT_JOINTMATOFFSET_OFFSET]
// advance past this vertex's three base vectors; they are addressed with negative offsets below
add esi, 3*BASEVECTOR_SIZE
movaps xmm0, [edi+ebx+ 0] // xmm0 = m0, m1, m2, t0
movaps xmm1, [edi+ebx+16] // xmm1 = m3, m4, m5, t1
movaps xmm2, [edi+ebx+32] // xmm2 = m6, m7, m8, t2
// skip to the first weight entry of the next vertex
add edx, dword ptr [edx+JOINTWEIGHT_NEXTVERTEXOFFSET_OFFSET]
// the flags produced by this add are the ones the jl at the bottom of the loop tests
add eax, DRAWVERT_SIZE
// transform vertex
movaps xmm3, [esi-3*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
// xmm3 = ( row0 lanes 0+1, row0 lanes 2+3, row1 lanes 0+1, row1 lanes 2+3 )
_haddps( _xmm3, _xmm4 )
// xmm5 = ( row2 lanes 0+1, row2 lanes 2+3, x, y )
_haddps( _xmm5, _xmm3 )
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+0], xmm5
// add the two low lanes to get z in lane 0 of xmm7
pshufd xmm7, xmm5, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm5
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_XYZ_OFFSET+8], xmm7
// transform normal
movaps xmm3, [esi-2*BASEVECTOR_SIZE]
movaps xmm4, xmm3
movaps xmm5, xmm3
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
_haddps( _xmm3, _xmm4 )
_haddps( _xmm5, _xmm3 )
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+0], xmm5
pshufd xmm7, xmm5, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm5
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_NORMAL_OFFSET+8], xmm7
// transform first tangent
// the joint matrix is not needed after this, so the products overwrite xmm0-xmm2 directly
movaps xmm3, [esi-1*BASEVECTOR_SIZE]
mulps xmm0, xmm3
mulps xmm1, xmm3
mulps xmm2, xmm3
_haddps( _xmm0, _xmm1 )
_haddps( _xmm2, _xmm0 )
movhps [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+0], xmm2
pshufd xmm7, xmm2, R_SHUFFLE_D( 1, 0, 2, 3 )
addss xmm7, xmm2
movss [ecx+eax-DRAWVERT_SIZE+DRAWVERT_TANGENT_OFFSET+8], xmm7
// loop while the byte offset is still negative
jl loopVert
done:
}
}
#endif
#endif /* ID_WIN_X86_ASM */