mirror of
https://github.com/dhewm/dhewm3-sdk.git
synced 2024-11-29 07:42:01 +00:00
afebd7e1e5
Don't include the lazy precompiled.h everywhere, only what's required for the compilation unit. platform.h needs to be included instead to provide all essential defines and types. All includes use the relative path to the neo or the game specific root. Move all idlib related includes from idlib/Lib.h to precompiled.h. precompiled.h still exists for the MFC stuff in tools/. Add some missing header guards.
3071 lines
80 KiB
C++
3071 lines
80 KiB
C++
/*
|
|
===========================================================================
|
|
|
|
Doom 3 GPL Source Code
|
|
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
|
|
|
|
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
|
|
|
|
Doom 3 Source Code is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Doom 3 Source Code is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
|
|
|
|
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
|
|
|
|
===========================================================================
|
|
*/
|
|
|
|
#include "sys/platform.h"
|
|
#include "idlib/geometry/DrawVert.h"
|
|
#include "idlib/geometry/JointTransform.h"
|
|
#include "idlib/math/Vector.h"
|
|
#include "idlib/math/Plane.h"
|
|
#include "idlib/math/Matrix.h"
|
|
#include "renderer/Model.h"
|
|
|
|
#include "idlib/math/Simd_Generic.h"
|
|
|
|
//===============================================================
|
|
//
|
|
// Generic implementation of idSIMDProcessor
|
|
//
|
|
//===============================================================
|
|
|
|
#define UNROLL1(Y) { int _IX; for (_IX=0;_IX<count;_IX++) {Y(_IX);} }
|
|
#define UNROLL2(Y) { int _IX, _NM = count&0xfffffffe; for (_IX=0;_IX<_NM;_IX+=2){Y(_IX+0);Y(_IX+1);} if (_IX < count) {Y(_IX);}}
|
|
#define UNROLL4(Y) { int _IX, _NM = count&0xfffffffc; for (_IX=0;_IX<_NM;_IX+=4){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);}for(;_IX<count;_IX++){Y(_IX);}}
|
|
#define UNROLL8(Y) { int _IX, _NM = count&0xfffffff8; for (_IX=0;_IX<_NM;_IX+=8){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);Y(_IX+4);Y(_IX+5);Y(_IX+6);Y(_IX+7);} _NM = count&0xfffffffe; for(;_IX<_NM;_IX+=2){Y(_IX); Y(_IX+1);} if (_IX < count) {Y(_IX);} }
|
|
|
|
#ifdef _DEBUG
|
|
#define NODEFAULT default: assert( 0 )
|
|
#elif _WIN32
|
|
#define NODEFAULT default: __assume( 0 )
|
|
#else
|
|
#define NODEFAULT
|
|
#endif
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::GetName
|
|
============
|
|
*/
|
|
const char * idSIMD_Generic::GetName( void ) const {
|
|
return "generic code";
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Add
|
|
|
|
dst[i] = constant + src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Add( float *dst, const float constant, const float *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = src[(X)] + constant;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Add
|
|
|
|
dst[i] = src0[i] + src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Add( float *dst, const float *src0, const float *src1, const int count ) {
|
|
#define OPER(X) dst[(X)] = src0[(X)] + src1[(X)];
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Sub
|
|
|
|
dst[i] = constant - src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Sub( float *dst, const float constant, const float *src, const int count ) {
|
|
double c = constant;
|
|
#define OPER(X) dst[(X)] = c - src[(X)];
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Sub
|
|
|
|
dst[i] = src0[i] - src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Sub( float *dst, const float *src0, const float *src1, const int count ) {
|
|
#define OPER(X) dst[(X)] = src0[(X)] - src1[(X)];
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Mul
|
|
|
|
dst[i] = constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Mul( float *dst, const float constant, const float *src0, const int count) {
|
|
double c = constant;
|
|
#define OPER(X) (dst[(X)] = (c * src0[(X)]))
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Mul
|
|
|
|
dst[i] = src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Mul( float *dst, const float *src0, const float *src1, const int count ) {
|
|
#define OPER(X) (dst[(X)] = src0[(X)] * src1[(X)])
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Div
|
|
|
|
dst[i] = constant / divisor[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Div( float *dst, const float constant, const float *divisor, const int count ) {
|
|
double c = constant;
|
|
#define OPER(X) (dst[(X)] = (c / divisor[(X)]))
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Div
|
|
|
|
dst[i] = src0[i] / src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Div( float *dst, const float *src0, const float *src1, const int count ) {
|
|
#define OPER(X) (dst[(X)] = src0[(X)] / src1[(X)])
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MulAdd
|
|
|
|
dst[i] += constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MulAdd( float *dst, const float constant, const float *src, const int count ) {
|
|
double c = constant;
|
|
#define OPER(X) (dst[(X)] += c * src[(X)])
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MulAdd
|
|
|
|
dst[i] += src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
|
|
#define OPER(X) (dst[(X)] += src0[(X)] * src1[(X)])
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MulSub
|
|
|
|
dst[i] -= constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MulSub( float *dst, const float constant, const float *src, const int count ) {
|
|
double c = constant;
|
|
#define OPER(X) (dst[(X)] -= c * src[(X)])
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MulSub
|
|
|
|
dst[i] -= src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
|
|
#define OPER(X) (dst[(X)] -= src0[(X)] * src1[(X)])
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dst[i] = constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = constant * src[(X)];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dst[i] = constant * src[i].Normal() + src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dst[i] = constant * src[i].xyz;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i] + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dst[i] = src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
|
|
#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Dot
|
|
|
|
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Dot( float &dot, const float *src1, const float *src2, const int count ) {
|
|
#if 1
|
|
|
|
switch( count ) {
|
|
case 0: {
|
|
dot = 0.0f;
|
|
return;
|
|
}
|
|
case 1: {
|
|
dot = src1[0] * src2[0];
|
|
return;
|
|
}
|
|
case 2: {
|
|
dot = src1[0] * src2[0] + src1[1] * src2[1];
|
|
return;
|
|
}
|
|
case 3: {
|
|
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
|
|
return;
|
|
}
|
|
default: {
|
|
int i;
|
|
double s0, s1, s2, s3;
|
|
s0 = src1[0] * src2[0];
|
|
s1 = src1[1] * src2[1];
|
|
s2 = src1[2] * src2[2];
|
|
s3 = src1[3] * src2[3];
|
|
for ( i = 4; i < count-7; i += 8 ) {
|
|
s0 += src1[i+0] * src2[i+0];
|
|
s1 += src1[i+1] * src2[i+1];
|
|
s2 += src1[i+2] * src2[i+2];
|
|
s3 += src1[i+3] * src2[i+3];
|
|
s0 += src1[i+4] * src2[i+4];
|
|
s1 += src1[i+5] * src2[i+5];
|
|
s2 += src1[i+6] * src2[i+6];
|
|
s3 += src1[i+7] * src2[i+7];
|
|
}
|
|
switch( count - i ) {
|
|
NODEFAULT;
|
|
case 7: s0 += src1[i+6] * src2[i+6];
|
|
case 6: s1 += src1[i+5] * src2[i+5];
|
|
case 5: s2 += src1[i+4] * src2[i+4];
|
|
case 4: s3 += src1[i+3] * src2[i+3];
|
|
case 3: s0 += src1[i+2] * src2[i+2];
|
|
case 2: s1 += src1[i+1] * src2[i+1];
|
|
case 1: s2 += src1[i+0] * src2[i+0];
|
|
case 0: break;
|
|
}
|
|
double sum;
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
dot = sum;
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
dot = 0.0f;
|
|
for ( i = 0; i < count; i++ ) {
|
|
dot += src1[i] * src2[i];
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpGT
|
|
|
|
dst[i] = src0[i] > constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] = src0[(X)] > constant;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpGT
|
|
|
|
dst[i] |= ( src0[i] > constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpGE
|
|
|
|
dst[i] = src0[i] >= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] = src0[(X)] >= constant;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpGE
|
|
|
|
dst[i] |= ( src0[i] >= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] |= ( src0[(X)] >= constant ) << bitNum;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpLT
|
|
|
|
dst[i] = src0[i] < constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] = src0[(X)] < constant;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpLT
|
|
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpLE
|
|
|
|
dst[i] = src0[i] <= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] = src0[(X)] <= constant;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CmpLE
|
|
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
|
|
UNROLL4(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MinMax( float &min, float &max, const float *src, const int count ) {
|
|
min = idMath::INFINITY; max = -idMath::INFINITY;
|
|
#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
|
|
min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
|
|
#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
#define OPER(X) const idVec3 &v = src[(X)].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
#define OPER(X) const idVec3 &v = src[indexes[(X)]].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Clamp
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
|
|
#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::ClampMin
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::ClampMin( float *dst, const float *src, const float min, const int count ) {
|
|
#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::ClampMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::ClampMax( float *dst, const float *src, const float max, const int count ) {
|
|
#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
================
|
|
idSIMD_Generic::Memcpy
|
|
================
|
|
*/
|
|
void VPCALL idSIMD_Generic::Memcpy( void *dst, const void *src, const int count ) {
|
|
memcpy( dst, src, count );
|
|
}
|
|
|
|
/*
|
|
================
|
|
idSIMD_Generic::Memset
|
|
================
|
|
*/
|
|
void VPCALL idSIMD_Generic::Memset( void *dst, const int val, const int count ) {
|
|
memset( dst, val, count );
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Zero16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Zero16( float *dst, const int count ) {
|
|
memset( dst, 0, count * sizeof( float ) );
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Negate16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Negate16( float *dst, const int count ) {
|
|
unsigned int *ptr = reinterpret_cast<unsigned int *>(dst);
|
|
#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Copy16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Copy16( float *dst, const float *src, const int count ) {
|
|
#define OPER(X) dst[(X)] = src[(X)]
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Add16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Add16( float *dst, const float *src1, const float *src2, const int count ) {
|
|
#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Sub16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
|
|
#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::Mul16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::Mul16( float *dst, const float *src1, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] = src1[(X)] * constant
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::AddAssign16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::AddAssign16( float *dst, const float *src, const int count ) {
|
|
#define OPER(X) dst[(X)] += src[(X)]
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::SubAssign16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::SubAssign16( float *dst, const float *src, const int count ) {
|
|
#define OPER(X) dst[(X)] -= src[(X)]
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MulAssign16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MulAssign16( float *dst, const float constant, const int count ) {
|
|
#define OPER(X) dst[(X)] *= constant
|
|
UNROLL1(OPER)
|
|
#undef OPER
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_MultiplyVecX
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
int i, j, numRows;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumColumns() );
|
|
assert( dst.GetSize() >= mat.GetNumRows() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numRows = mat.GetNumRows();
|
|
switch( mat.GetNumColumns() ) {
|
|
case 1:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] = mPtr[0] * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 2:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
|
|
mPtr += 2;
|
|
}
|
|
break;
|
|
case 3:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
|
|
mPtr += 3;
|
|
}
|
|
break;
|
|
case 4:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3];
|
|
mPtr += 4;
|
|
}
|
|
break;
|
|
case 5:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
|
|
mPtr += 5;
|
|
}
|
|
break;
|
|
case 6:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
|
|
mPtr += 6;
|
|
}
|
|
break;
|
|
default:
|
|
int numColumns = mat.GetNumColumns();
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( j = 1; j < numColumns; j++ ) {
|
|
sum += mPtr[j] * vPtr[j];
|
|
}
|
|
dstPtr[i] = sum;
|
|
mPtr += numColumns;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_MultiplyAddVecX
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
int i, j, numRows;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumColumns() );
|
|
assert( dst.GetSize() >= mat.GetNumRows() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numRows = mat.GetNumRows();
|
|
switch( mat.GetNumColumns() ) {
|
|
case 1:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] += mPtr[0] * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 2:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
|
|
mPtr += 2;
|
|
}
|
|
break;
|
|
case 3:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
|
|
mPtr += 3;
|
|
}
|
|
break;
|
|
case 4:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3];
|
|
mPtr += 4;
|
|
}
|
|
break;
|
|
case 5:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
|
|
mPtr += 5;
|
|
}
|
|
break;
|
|
case 6:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
|
|
mPtr += 6;
|
|
}
|
|
break;
|
|
default:
|
|
int numColumns = mat.GetNumColumns();
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( j = 1; j < numColumns; j++ ) {
|
|
sum += mPtr[j] * vPtr[j];
|
|
}
|
|
dstPtr[i] += sum;
|
|
mPtr += numColumns;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_MultiplySubVecX
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
int i, j, numRows;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumColumns() );
|
|
assert( dst.GetSize() >= mat.GetNumRows() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numRows = mat.GetNumRows();
|
|
switch( mat.GetNumColumns() ) {
|
|
case 1:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] -= mPtr[0] * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 2:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
|
|
mPtr += 2;
|
|
}
|
|
break;
|
|
case 3:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
|
|
mPtr += 3;
|
|
}
|
|
break;
|
|
case 4:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3];
|
|
mPtr += 4;
|
|
}
|
|
break;
|
|
case 5:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
|
|
mPtr += 5;
|
|
}
|
|
break;
|
|
case 6:
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
|
|
mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
|
|
mPtr += 6;
|
|
}
|
|
break;
|
|
default:
|
|
int numColumns = mat.GetNumColumns();
|
|
for ( i = 0; i < numRows; i++ ) {
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( j = 1; j < numColumns; j++ ) {
|
|
sum += mPtr[j] * vPtr[j];
|
|
}
|
|
dstPtr[i] -= sum;
|
|
mPtr += numColumns;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_TransposeMultiplyVecX
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
int i, j, numColumns;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumRows() );
|
|
assert( dst.GetSize() >= mat.GetNumColumns() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numColumns = mat.GetNumColumns();
|
|
switch( mat.GetNumRows() ) {
|
|
case 1:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] = *(mPtr) * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 2:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 3:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 4:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 5:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 6:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
default:
|
|
int numRows = mat.GetNumRows();
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
mPtr = mat.ToFloatPtr() + i;
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( j = 1; j < numRows; j++ ) {
|
|
mPtr += numColumns;
|
|
sum += mPtr[0] * vPtr[j];
|
|
}
|
|
dstPtr[i] = sum;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_TransposeMultiplyAddVecX
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
int i, j, numColumns;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumRows() );
|
|
assert( dst.GetSize() >= mat.GetNumColumns() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numColumns = mat.GetNumColumns();
|
|
switch( mat.GetNumRows() ) {
|
|
case 1:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] += *(mPtr) * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 2:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 3:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 4:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 5:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 6:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
default:
|
|
int numRows = mat.GetNumRows();
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
mPtr = mat.ToFloatPtr() + i;
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( j = 1; j < numRows; j++ ) {
|
|
mPtr += numColumns;
|
|
sum += mPtr[0] * vPtr[j];
|
|
}
|
|
dstPtr[i] += sum;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_TransposeMultiplySubVecX
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
|
|
int i, numColumns;
|
|
const float *mPtr, *vPtr;
|
|
float *dstPtr;
|
|
|
|
assert( vec.GetSize() >= mat.GetNumRows() );
|
|
assert( dst.GetSize() >= mat.GetNumColumns() );
|
|
|
|
mPtr = mat.ToFloatPtr();
|
|
vPtr = vec.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
numColumns = mat.GetNumColumns();
|
|
switch( mat.GetNumRows() ) {
|
|
case 1:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] -= *(mPtr) * vPtr[0];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 2:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 3:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 4:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 5:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
case 6:
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
|
|
*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
|
|
mPtr++;
|
|
}
|
|
break;
|
|
default:
|
|
int numRows = mat.GetNumRows();
|
|
for ( i = 0; i < numColumns; i++ ) {
|
|
mPtr = mat.ToFloatPtr() + i;
|
|
float sum = mPtr[0] * vPtr[0];
|
|
for ( int j = 1; j < numRows; j++ ) {
|
|
mPtr += numColumns;
|
|
sum += mPtr[0] * vPtr[j];
|
|
}
|
|
dstPtr[i] -= sum;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_MultiplyMatX
|
|
|
|
optimizes the following matrix multiplications:
|
|
|
|
NxN * Nx6
|
|
6xN * Nx6
|
|
Nx6 * 6xN
|
|
6x6 * 6xN
|
|
|
|
with N in the range [1-6].
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
|
|
int i, j, k, l, n;
|
|
float *dstPtr;
|
|
const float *m1Ptr, *m2Ptr;
|
|
double sum;
|
|
|
|
assert( m1.GetNumColumns() == m2.GetNumRows() );
|
|
|
|
dstPtr = dst.ToFloatPtr();
|
|
m1Ptr = m1.ToFloatPtr();
|
|
m2Ptr = m2.ToFloatPtr();
|
|
k = m1.GetNumRows();
|
|
l = m2.GetNumColumns();
|
|
|
|
switch( m1.GetNumColumns() ) {
|
|
case 1: {
|
|
if ( l == 6 ) {
|
|
for ( i = 0; i < k; i++ ) { // Nx1 * 1x6
|
|
*dstPtr++ = m1Ptr[i] * m2Ptr[0];
|
|
*dstPtr++ = m1Ptr[i] * m2Ptr[1];
|
|
*dstPtr++ = m1Ptr[i] * m2Ptr[2];
|
|
*dstPtr++ = m1Ptr[i] * m2Ptr[3];
|
|
*dstPtr++ = m1Ptr[i] * m2Ptr[4];
|
|
*dstPtr++ = m1Ptr[i] * m2Ptr[5];
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
}
|
|
case 2: {
|
|
if ( l == 6 ) {
|
|
for ( i = 0; i < k; i++ ) { // Nx2 * 2x6
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11];
|
|
m1Ptr += 2;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 2;
|
|
}
|
|
break;
|
|
}
|
|
case 3: {
|
|
if ( l == 6 ) {
|
|
for ( i = 0; i < k; i++ ) { // Nx3 * 3x6
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17];
|
|
m1Ptr += 3;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 3;
|
|
}
|
|
break;
|
|
}
|
|
case 4: {
|
|
if ( l == 6 ) {
|
|
for ( i = 0; i < k; i++ ) { // Nx4 * 4x6
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23];
|
|
m1Ptr += 4;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
|
|
m1Ptr[3] * m2Ptr[3*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 4;
|
|
}
|
|
break;
|
|
}
|
|
case 5: {
|
|
if ( l == 6 ) {
|
|
for ( i = 0; i < k; i++ ) { // Nx5 * 5x6
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18] + m1Ptr[4] * m2Ptr[24];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19] + m1Ptr[4] * m2Ptr[25];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20] + m1Ptr[4] * m2Ptr[26];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21] + m1Ptr[4] * m2Ptr[27];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22] + m1Ptr[4] * m2Ptr[28];
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23] + m1Ptr[4] * m2Ptr[29];
|
|
m1Ptr += 5;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
|
|
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 5;
|
|
}
|
|
break;
|
|
}
|
|
case 6: {
|
|
switch( k ) {
|
|
case 1: {
|
|
if ( l == 1 ) { // 1x6 * 6x1
|
|
dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
|
|
m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 2: {
|
|
if ( l == 2 ) { // 2x6 * 6x2
|
|
for ( i = 0; i < 2; i++ ) {
|
|
for ( j = 0; j < 2; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 3: {
|
|
if ( l == 3 ) { // 3x6 * 6x3
|
|
for ( i = 0; i < 3; i++ ) {
|
|
for ( j = 0; j < 3; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
break;
|
|
}
|
|
case 4: {
|
|
if ( l == 4 ) { // 4x6 * 6x4
|
|
for ( i = 0; i < 4; i++ ) {
|
|
for ( j = 0; j < 4; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
case 5: {
|
|
if ( l == 5 ) { // 5x6 * 6x5
|
|
for ( i = 0; i < 5; i++ ) {
|
|
for ( j = 0; j < 5; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
case 6: {
|
|
switch( l ) {
|
|
case 1: { // 6x6 * 6x1
|
|
for ( i = 0; i < 6; i++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 1 ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 1 ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 1 ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 1 ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 1 ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 1 ];
|
|
dstPtr++;
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
case 2: { // 6x6 * 6x2
|
|
for ( i = 0; i < 6; i++ ) {
|
|
for ( j = 0; j < 2; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
case 3: { // 6x6 * 6x3
|
|
for ( i = 0; i < 6; i++ ) {
|
|
for ( j = 0; j < 3; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
case 4: { // 6x6 * 6x4
|
|
for ( i = 0; i < 6; i++ ) {
|
|
for ( j = 0; j < 4; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
case 5: { // 6x6 * 6x5
|
|
for ( i = 0; i < 6; i++ ) {
|
|
for ( j = 0; j < 5; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
case 6: { // 6x6 * 6x6
|
|
for ( i = 0; i < 6; i++ ) {
|
|
for ( j = 0; j < 6; j++ ) {
|
|
*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 6 + j ]
|
|
+ m1Ptr[1] * m2Ptr[ 1 * 6 + j ]
|
|
+ m1Ptr[2] * m2Ptr[ 2 * 6 + j ]
|
|
+ m1Ptr[3] * m2Ptr[ 3 * 6 + j ]
|
|
+ m1Ptr[4] * m2Ptr[ 4 * 6 + j ]
|
|
+ m1Ptr[5] * m2Ptr[ 5 * 6 + j ];
|
|
dstPtr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
|
|
m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr += 6;
|
|
}
|
|
break;
|
|
}
|
|
default: {
|
|
for ( i = 0; i < k; i++ ) {
|
|
for ( j = 0; j < l; j++ ) {
|
|
m2Ptr = m2.ToFloatPtr() + j;
|
|
sum = m1Ptr[0] * m2Ptr[0];
|
|
for ( n = 1; n < m1.GetNumColumns(); n++ ) {
|
|
m2Ptr += l;
|
|
sum += m1Ptr[n] * m2Ptr[0];
|
|
}
|
|
*dstPtr++ = sum;
|
|
}
|
|
m1Ptr += m1.GetNumColumns();
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_TransposeMultiplyMatX
|
|
|
|
optimizes the following tranpose matrix multiplications:
|
|
|
|
Nx6 * NxN
|
|
6xN * 6x6
|
|
|
|
with N in the range [1-6].
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
|
|
int i, j, k, l, n;
|
|
float *dstPtr;
|
|
const float *m1Ptr, *m2Ptr;
|
|
double sum;
|
|
|
|
assert( m1.GetNumRows() == m2.GetNumRows() );
|
|
|
|
m1Ptr = m1.ToFloatPtr();
|
|
m2Ptr = m2.ToFloatPtr();
|
|
dstPtr = dst.ToFloatPtr();
|
|
k = m1.GetNumColumns();
|
|
l = m2.GetNumColumns();
|
|
|
|
switch( m1.GetNumRows() ) {
|
|
case 1:
|
|
if ( k == 6 && l == 1 ) { // 1x6 * 1x1
|
|
for ( i = 0; i < 6; i++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
case 2:
|
|
if ( k == 6 && l == 2 ) { // 2x6 * 2x2
|
|
for ( i = 0; i < 6; i++ ) {
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+0] + m1Ptr[1*6] * m2Ptr[1*2+0];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+1] + m1Ptr[1*6] * m2Ptr[1*2+1];
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
case 3:
|
|
if ( k == 6 && l == 3 ) { // 3x6 * 3x3
|
|
for ( i = 0; i < 6; i++ ) {
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+0] + m1Ptr[1*6] * m2Ptr[1*3+0] + m1Ptr[2*6] * m2Ptr[2*3+0];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+1] + m1Ptr[1*6] * m2Ptr[1*3+1] + m1Ptr[2*6] * m2Ptr[2*3+1];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+2] + m1Ptr[1*6] * m2Ptr[1*3+2] + m1Ptr[2*6] * m2Ptr[2*3+2];
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
case 4:
|
|
if ( k == 6 && l == 4 ) { // 4x6 * 4x4
|
|
for ( i = 0; i < 6; i++ ) {
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+0] + m1Ptr[1*6] * m2Ptr[1*4+0] + m1Ptr[2*6] * m2Ptr[2*4+0] + m1Ptr[3*6] * m2Ptr[3*4+0];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+1] + m1Ptr[1*6] * m2Ptr[1*4+1] + m1Ptr[2*6] * m2Ptr[2*4+1] + m1Ptr[3*6] * m2Ptr[3*4+1];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+2] + m1Ptr[1*6] * m2Ptr[1*4+2] + m1Ptr[2*6] * m2Ptr[2*4+2] + m1Ptr[3*6] * m2Ptr[3*4+2];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+3] + m1Ptr[1*6] * m2Ptr[1*4+3] + m1Ptr[2*6] * m2Ptr[2*4+3] + m1Ptr[3*6] * m2Ptr[3*4+3];
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
|
|
m1Ptr[3*k] * m2Ptr[3*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
case 5:
|
|
if ( k == 6 && l == 5 ) { // 5x6 * 5x5
|
|
for ( i = 0; i < 6; i++ ) {
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+0] + m1Ptr[1*6] * m2Ptr[1*5+0] + m1Ptr[2*6] * m2Ptr[2*5+0] + m1Ptr[3*6] * m2Ptr[3*5+0] + m1Ptr[4*6] * m2Ptr[4*5+0];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+1] + m1Ptr[1*6] * m2Ptr[1*5+1] + m1Ptr[2*6] * m2Ptr[2*5+1] + m1Ptr[3*6] * m2Ptr[3*5+1] + m1Ptr[4*6] * m2Ptr[4*5+1];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+2] + m1Ptr[1*6] * m2Ptr[1*5+2] + m1Ptr[2*6] * m2Ptr[2*5+2] + m1Ptr[3*6] * m2Ptr[3*5+2] + m1Ptr[4*6] * m2Ptr[4*5+2];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+3] + m1Ptr[1*6] * m2Ptr[1*5+3] + m1Ptr[2*6] * m2Ptr[2*5+3] + m1Ptr[3*6] * m2Ptr[3*5+3] + m1Ptr[4*6] * m2Ptr[4*5+3];
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+4] + m1Ptr[1*6] * m2Ptr[1*5+4] + m1Ptr[2*6] * m2Ptr[2*5+4] + m1Ptr[3*6] * m2Ptr[3*5+4] + m1Ptr[4*6] * m2Ptr[4*5+4];
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
|
|
m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
case 6:
|
|
if ( l == 6 ) {
|
|
switch( k ) {
|
|
case 1: // 6x1 * 6x6
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < 6; j++ ) {
|
|
*dstPtr++ = m1Ptr[0*1] * m2Ptr[0*6] +
|
|
m1Ptr[1*1] * m2Ptr[1*6] +
|
|
m1Ptr[2*1] * m2Ptr[2*6] +
|
|
m1Ptr[3*1] * m2Ptr[3*6] +
|
|
m1Ptr[4*1] * m2Ptr[4*6] +
|
|
m1Ptr[5*1] * m2Ptr[5*6];
|
|
m2Ptr++;
|
|
}
|
|
return;
|
|
case 2: // 6x2 * 6x6
|
|
for ( i = 0; i < 2; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < 6; j++ ) {
|
|
*dstPtr++ = m1Ptr[0*2] * m2Ptr[0*6] +
|
|
m1Ptr[1*2] * m2Ptr[1*6] +
|
|
m1Ptr[2*2] * m2Ptr[2*6] +
|
|
m1Ptr[3*2] * m2Ptr[3*6] +
|
|
m1Ptr[4*2] * m2Ptr[4*6] +
|
|
m1Ptr[5*2] * m2Ptr[5*6];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
case 3: // 6x3 * 6x6
|
|
for ( i = 0; i < 3; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < 6; j++ ) {
|
|
*dstPtr++ = m1Ptr[0*3] * m2Ptr[0*6] +
|
|
m1Ptr[1*3] * m2Ptr[1*6] +
|
|
m1Ptr[2*3] * m2Ptr[2*6] +
|
|
m1Ptr[3*3] * m2Ptr[3*6] +
|
|
m1Ptr[4*3] * m2Ptr[4*6] +
|
|
m1Ptr[5*3] * m2Ptr[5*6];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
case 4: // 6x4 * 6x6
|
|
for ( i = 0; i < 4; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < 6; j++ ) {
|
|
*dstPtr++ = m1Ptr[0*4] * m2Ptr[0*6] +
|
|
m1Ptr[1*4] * m2Ptr[1*6] +
|
|
m1Ptr[2*4] * m2Ptr[2*6] +
|
|
m1Ptr[3*4] * m2Ptr[3*6] +
|
|
m1Ptr[4*4] * m2Ptr[4*6] +
|
|
m1Ptr[5*4] * m2Ptr[5*6];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
case 5: // 6x5 * 6x6
|
|
for ( i = 0; i < 5; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < 6; j++ ) {
|
|
*dstPtr++ = m1Ptr[0*5] * m2Ptr[0*6] +
|
|
m1Ptr[1*5] * m2Ptr[1*6] +
|
|
m1Ptr[2*5] * m2Ptr[2*6] +
|
|
m1Ptr[3*5] * m2Ptr[3*6] +
|
|
m1Ptr[4*5] * m2Ptr[4*6] +
|
|
m1Ptr[5*5] * m2Ptr[5*6];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
case 6: // 6x6 * 6x6
|
|
for ( i = 0; i < 6; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < 6; j++ ) {
|
|
*dstPtr++ = m1Ptr[0*6] * m2Ptr[0*6] +
|
|
m1Ptr[1*6] * m2Ptr[1*6] +
|
|
m1Ptr[2*6] * m2Ptr[2*6] +
|
|
m1Ptr[3*6] * m2Ptr[3*6] +
|
|
m1Ptr[4*6] * m2Ptr[4*6] +
|
|
m1Ptr[5*6] * m2Ptr[5*6];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
for ( i = 0; i < k; i++ ) {
|
|
m2Ptr = m2.ToFloatPtr();
|
|
for ( j = 0; j < l; j++ ) {
|
|
*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] +
|
|
m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l];
|
|
m2Ptr++;
|
|
}
|
|
m1Ptr++;
|
|
}
|
|
break;
|
|
default:
|
|
for ( i = 0; i < k; i++ ) {
|
|
for ( j = 0; j < l; j++ ) {
|
|
m1Ptr = m1.ToFloatPtr() + i;
|
|
m2Ptr = m2.ToFloatPtr() + j;
|
|
sum = m1Ptr[0] * m2Ptr[0];
|
|
for ( n = 1; n < m1.GetNumRows(); n++ ) {
|
|
m1Ptr += k;
|
|
m2Ptr += l;
|
|
sum += m1Ptr[0] * m2Ptr[0];
|
|
}
|
|
*dstPtr++ = sum;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_LowerTriangularSolve
|
|
|
|
solves x in Lx = b for the n * n sub-matrix of L
|
|
if skip > 0 the first skip elements of x are assumed to be valid already
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
|
|
#if 1
|
|
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
if ( skip >= n ) {
|
|
return;
|
|
}
|
|
|
|
lptr = L.ToFloatPtr();
|
|
nc = L.GetNumColumns();
|
|
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
#define NSKIP( n, s ) ((n<<3)|(s&7))
|
|
switch( NSKIP( n, skip ) ) {
|
|
case NSKIP( 1, 0 ): x[0] = b[0];
|
|
return;
|
|
case NSKIP( 2, 0 ): x[0] = b[0];
|
|
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
return;
|
|
case NSKIP( 3, 0 ): x[0] = b[0];
|
|
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
return;
|
|
case NSKIP( 4, 0 ): x[0] = b[0];
|
|
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
return;
|
|
case NSKIP( 5, 0 ): x[0] = b[0];
|
|
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
return;
|
|
case NSKIP( 6, 0 ): x[0] = b[0];
|
|
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
|
|
return;
|
|
case NSKIP( 7, 0 ): x[0] = b[0];
|
|
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
|
|
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
|
|
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
// process first 4 rows
|
|
switch( skip ) {
|
|
case 0: x[0] = b[0];
|
|
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
|
|
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
|
|
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
|
|
skip = 4;
|
|
}
|
|
|
|
lptr = L[skip];
|
|
|
|
int i, j;
|
|
register double s0, s1, s2, s3;
|
|
|
|
for ( i = skip; i < n; i++ ) {
|
|
s0 = lptr[0] * x[0];
|
|
s1 = lptr[1] * x[1];
|
|
s2 = lptr[2] * x[2];
|
|
s3 = lptr[3] * x[3];
|
|
for ( j = 4; j < i-7; j += 8 ) {
|
|
s0 += lptr[j+0] * x[j+0];
|
|
s1 += lptr[j+1] * x[j+1];
|
|
s2 += lptr[j+2] * x[j+2];
|
|
s3 += lptr[j+3] * x[j+3];
|
|
s0 += lptr[j+4] * x[j+4];
|
|
s1 += lptr[j+5] * x[j+5];
|
|
s2 += lptr[j+6] * x[j+6];
|
|
s3 += lptr[j+7] * x[j+7];
|
|
}
|
|
switch( i - j ) {
|
|
NODEFAULT;
|
|
case 7: s0 += lptr[j+6] * x[j+6];
|
|
case 6: s1 += lptr[j+5] * x[j+5];
|
|
case 5: s2 += lptr[j+4] * x[j+4];
|
|
case 4: s3 += lptr[j+3] * x[j+3];
|
|
case 3: s0 += lptr[j+2] * x[j+2];
|
|
case 2: s1 += lptr[j+1] * x[j+1];
|
|
case 1: s2 += lptr[j+0] * x[j+0];
|
|
case 0: break;
|
|
}
|
|
double sum;
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
sum -= b[i];
|
|
x[i] = -sum;
|
|
lptr += nc;
|
|
}
|
|
|
|
#else
|
|
|
|
int i, j;
|
|
const float *lptr;
|
|
double sum;
|
|
|
|
for ( i = skip; i < n; i++ ) {
|
|
sum = b[i];
|
|
lptr = L[i];
|
|
for ( j = 0; j < i; j++ ) {
|
|
sum -= lptr[j] * x[j];
|
|
}
|
|
x[i] = sum;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_LowerTriangularSolveTranspose
|
|
|
|
solves x in L'x = b for the n * n sub-matrix of L
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
|
|
#if 1
|
|
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
lptr = L.ToFloatPtr();
|
|
nc = L.GetNumColumns();
|
|
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
switch( n ) {
|
|
case 0:
|
|
return;
|
|
case 1:
|
|
x[0] = b[0];
|
|
return;
|
|
case 2:
|
|
x[1] = b[1];
|
|
x[0] = b[0] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 3:
|
|
x[2] = b[2];
|
|
x[1] = b[1] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 4:
|
|
x[3] = b[3];
|
|
x[2] = b[2] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 5:
|
|
x[4] = b[4];
|
|
x[3] = b[3] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 6:
|
|
x[5] = b[5];
|
|
x[4] = b[4] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
case 7:
|
|
x[6] = b[6];
|
|
x[5] = b[5] - lptr[6*nc+5] * x[6];
|
|
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
|
|
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
|
|
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
|
|
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
|
|
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
int i, j;
|
|
register double s0, s1, s2, s3;
|
|
float *xptr;
|
|
|
|
lptr = L.ToFloatPtr() + n * nc + n - 4;
|
|
xptr = x + n;
|
|
|
|
// process 4 rows at a time
|
|
for ( i = n; i >= 4; i -= 4 ) {
|
|
s0 = b[i-4];
|
|
s1 = b[i-3];
|
|
s2 = b[i-2];
|
|
s3 = b[i-1];
|
|
// process 4x4 blocks
|
|
for ( j = 0; j < n-i; j += 4 ) {
|
|
s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
|
|
s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
|
|
s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
|
|
s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
|
|
s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
|
|
s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
|
|
s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
|
|
s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
|
|
s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
|
|
s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
|
|
s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
|
|
s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
|
|
s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
|
|
s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
|
|
s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
|
|
s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
|
|
}
|
|
// process left over of the 4 rows
|
|
s0 -= lptr[0-1*nc] * s3;
|
|
s1 -= lptr[1-1*nc] * s3;
|
|
s2 -= lptr[2-1*nc] * s3;
|
|
s0 -= lptr[0-2*nc] * s2;
|
|
s1 -= lptr[1-2*nc] * s2;
|
|
s0 -= lptr[0-3*nc] * s1;
|
|
// store result
|
|
xptr[-4] = s0;
|
|
xptr[-3] = s1;
|
|
xptr[-2] = s2;
|
|
xptr[-1] = s3;
|
|
// update pointers for next four rows
|
|
lptr -= 4 + 4 * nc;
|
|
xptr -= 4;
|
|
}
|
|
// process left over rows
|
|
for ( i--; i >= 0; i-- ) {
|
|
s0 = b[i];
|
|
lptr = L[0] + i;
|
|
for ( j = i + 1; j < n; j++ ) {
|
|
s0 -= lptr[j*nc] * x[j];
|
|
}
|
|
x[i] = s0;
|
|
}
|
|
|
|
#else
|
|
|
|
int i, j, nc;
|
|
const float *ptr;
|
|
double sum;
|
|
|
|
nc = L.GetNumColumns();
|
|
for ( i = n - 1; i >= 0; i-- ) {
|
|
sum = b[i];
|
|
ptr = L[0] + i;
|
|
for ( j = i + 1; j < n; j++ ) {
|
|
sum -= ptr[j*nc] * x[j];
|
|
}
|
|
x[i] = sum;
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MatX_LDLTFactor
|
|
|
|
in-place factorization LDL' of the n * n sub-matrix of mat
|
|
the reciprocal of the diagonal elements are stored in invDiag
|
|
============
|
|
*/
|
|
bool VPCALL idSIMD_Generic::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
|
|
#if 1
|
|
|
|
int i, j, k, nc;
|
|
float *v, *diag, *mptr;
|
|
double s0, s1, s2, s3, sum, d;
|
|
|
|
v = (float *) _alloca16( n * sizeof( float ) );
|
|
diag = (float *) _alloca16( n * sizeof( float ) );
|
|
|
|
nc = mat.GetNumColumns();
|
|
|
|
if ( n <= 0 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
|
|
sum = mptr[0];
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
diag[0] = sum;
|
|
invDiag[0] = d = 1.0f / sum;
|
|
|
|
if ( n <= 1 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 1; j < n; j++ ) {
|
|
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[1];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
sum = mptr[1] - s0;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[1][1] = sum;
|
|
diag[1] = sum;
|
|
invDiag[1] = d = 1.0f / sum;
|
|
|
|
if ( n <= 2 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 2; j < n; j++ ) {
|
|
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[2];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
sum = mptr[2] - s0 - s1;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[2][2] = sum;
|
|
diag[2] = sum;
|
|
invDiag[2] = d = 1.0f / sum;
|
|
|
|
if ( n <= 3 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 3; j < n; j++ ) {
|
|
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
|
|
}
|
|
|
|
mptr = mat[3];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
sum = mptr[3] - s0 - s1 - s2;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[3][3] = sum;
|
|
diag[3] = sum;
|
|
invDiag[3] = d = 1.0f / sum;
|
|
|
|
if ( n <= 4 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 4; j < n; j++ ) {
|
|
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
|
|
}
|
|
|
|
for ( i = 4; i < n; i++ ) {
|
|
|
|
mptr = mat[i];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
|
|
for ( k = 4; k < i-3; k += 4 ) {
|
|
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
|
|
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
|
|
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
|
|
}
|
|
switch( i - k ) {
|
|
NODEFAULT;
|
|
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
|
|
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
|
|
case 0: break;
|
|
}
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
sum = mptr[i] - sum;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[i][i] = sum;
|
|
diag[i] = sum;
|
|
invDiag[i] = d = 1.0f / sum;
|
|
|
|
if ( i + 1 >= n ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[i+1];
|
|
for ( j = i+1; j < n; j++ ) {
|
|
s0 = mptr[0] * v[0];
|
|
s1 = mptr[1] * v[1];
|
|
s2 = mptr[2] * v[2];
|
|
s3 = mptr[3] * v[3];
|
|
for ( k = 4; k < i-7; k += 8 ) {
|
|
s0 += mptr[k+0] * v[k+0];
|
|
s1 += mptr[k+1] * v[k+1];
|
|
s2 += mptr[k+2] * v[k+2];
|
|
s3 += mptr[k+3] * v[k+3];
|
|
s0 += mptr[k+4] * v[k+4];
|
|
s1 += mptr[k+5] * v[k+5];
|
|
s2 += mptr[k+6] * v[k+6];
|
|
s3 += mptr[k+7] * v[k+7];
|
|
}
|
|
switch( i - k ) {
|
|
NODEFAULT;
|
|
case 7: s0 += mptr[k+6] * v[k+6];
|
|
case 6: s1 += mptr[k+5] * v[k+5];
|
|
case 5: s2 += mptr[k+4] * v[k+4];
|
|
case 4: s3 += mptr[k+3] * v[k+3];
|
|
case 3: s0 += mptr[k+2] * v[k+2];
|
|
case 2: s1 += mptr[k+1] * v[k+1];
|
|
case 1: s2 += mptr[k+0] * v[k+0];
|
|
case 0: break;
|
|
}
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
mptr[i] = ( mptr[i] - sum ) * d;
|
|
mptr += nc;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
|
|
#else
|
|
|
|
int i, j, k, nc;
|
|
float *v, *ptr, *diagPtr;
|
|
double d, sum;
|
|
|
|
v = (float *) _alloca16( n * sizeof( float ) );
|
|
nc = mat.GetNumColumns();
|
|
|
|
for ( i = 0; i < n; i++ ) {
|
|
|
|
ptr = mat[i];
|
|
diagPtr = mat[0];
|
|
sum = ptr[i];
|
|
for ( j = 0; j < i; j++ ) {
|
|
d = ptr[j];
|
|
v[j] = diagPtr[0] * d;
|
|
sum -= v[j] * d;
|
|
diagPtr += nc + 1;
|
|
}
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
diagPtr[0] = sum;
|
|
invDiag[i] = d = 1.0f / sum;
|
|
|
|
if ( i + 1 >= n ) {
|
|
continue;
|
|
}
|
|
|
|
ptr = mat[i+1];
|
|
for ( j = i + 1; j < n; j++ ) {
|
|
sum = ptr[i];
|
|
for ( k = 0; k < i; k++ ) {
|
|
sum -= ptr[k] * v[k];
|
|
}
|
|
ptr[i] = sum * d;
|
|
ptr += nc;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::BlendJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
|
|
int i;
|
|
|
|
for ( i = 0; i < numJoints; i++ ) {
|
|
int j = index[i];
|
|
joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
|
|
joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::ConvertJointQuatsToJointMats
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
|
|
int i;
|
|
|
|
for ( i = 0; i < numJoints; i++ ) {
|
|
jointMats[i].SetRotation( jointQuats[i].q.ToMat3() );
|
|
jointMats[i].SetTranslation( jointQuats[i].t );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::ConvertJointMatsToJointQuats
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
|
|
int i;
|
|
|
|
for ( i = 0; i < numJoints; i++ ) {
|
|
jointQuats[i] = jointMats[i].ToJointQuat();
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::TransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
|
|
int i;
|
|
|
|
for( i = firstJoint; i <= lastJoint; i++ ) {
|
|
assert( parents[i] < i );
|
|
jointMats[i] *= jointMats[parents[i]];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::UntransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
|
|
int i;
|
|
|
|
for( i = lastJoint; i >= firstJoint; i-- ) {
|
|
assert( parents[i] < i );
|
|
jointMats[i] /= jointMats[parents[i]];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::TransformVerts
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
|
|
int i, j;
|
|
const byte *jointsPtr = (byte *)joints;
|
|
|
|
for( j = i = 0; i < numVerts; i++ ) {
|
|
idVec3 v;
|
|
|
|
v = ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
|
|
while( index[j*2+1] == 0 ) {
|
|
j++;
|
|
v += ( *(idJointMat *) ( jointsPtr + index[j*2+0] ) ) * weights[j];
|
|
}
|
|
j++;
|
|
|
|
verts[i].xyz = v;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::TracePointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
int i;
|
|
byte tOr;
|
|
|
|
tOr = 0;
|
|
|
|
for ( i = 0; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, t;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
|
|
t = d0 + radius;
|
|
bits = FLOATSIGNBITSET( t ) << 0;
|
|
t = d1 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 1;
|
|
t = d2 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 2;
|
|
t = d3 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 3;
|
|
|
|
t = d0 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 4;
|
|
t = d1 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 5;
|
|
t = d2 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 6;
|
|
t = d3 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 7;
|
|
|
|
bits ^= 0x0F; // flip lower four bits
|
|
|
|
tOr |= bits;
|
|
cullBits[i] = bits;
|
|
}
|
|
|
|
totalOr = tOr;
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::DecalPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
int i;
|
|
|
|
for ( i = 0; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, d4, d5;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
d4 = planes[4].Distance( v );
|
|
d5 = planes[5].Distance( v );
|
|
|
|
bits = FLOATSIGNBITSET( d0 ) << 0;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 1;
|
|
bits |= FLOATSIGNBITSET( d2 ) << 2;
|
|
bits |= FLOATSIGNBITSET( d3 ) << 3;
|
|
bits |= FLOATSIGNBITSET( d4 ) << 4;
|
|
bits |= FLOATSIGNBITSET( d5 ) << 5;
|
|
|
|
cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::OverlayPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
int i;
|
|
|
|
for ( i = 0; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
texCoords[i][0] = d0 = planes[0].Distance( v );
|
|
texCoords[i][1] = d1 = planes[1].Distance( v );
|
|
|
|
bits = FLOATSIGNBITSET( d0 ) << 0;
|
|
d0 = 1.0f - d0;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 1;
|
|
d1 = 1.0f - d1;
|
|
bits |= FLOATSIGNBITSET( d0 ) << 2;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 3;
|
|
|
|
cullBits[i] = bits;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::DeriveTriPlanes
|
|
|
|
Derives a plane equation for each triangle.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
int i;
|
|
|
|
for ( i = 0; i < numIndexes; i += 3 ) {
|
|
const idDrawVert *a, *b, *c;
|
|
float d0[3], d1[3], f;
|
|
idVec3 n;
|
|
|
|
a = verts + indexes[i + 0];
|
|
b = verts + indexes[i + 1];
|
|
c = verts + indexes[i + 2];
|
|
|
|
d0[0] = b->xyz[0] - a->xyz[0];
|
|
d0[1] = b->xyz[1] - a->xyz[1];
|
|
d0[2] = b->xyz[2] - a->xyz[2];
|
|
|
|
d1[0] = c->xyz[0] - a->xyz[0];
|
|
d1[1] = c->xyz[1] - a->xyz[1];
|
|
d1[2] = c->xyz[2] - a->xyz[2];
|
|
|
|
n[0] = d1[1] * d0[2] - d1[2] * d0[1];
|
|
n[1] = d1[2] * d0[0] - d1[0] * d0[2];
|
|
n[2] = d1[0] * d0[1] - d1[1] * d0[0];
|
|
|
|
f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
|
|
|
|
n.x *= f;
|
|
n.y *= f;
|
|
n.z *= f;
|
|
|
|
planes->SetNormal( n );
|
|
planes->FitThroughPoint( a->xyz );
|
|
planes++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::DeriveTangents
|
|
|
|
Derives the normal and orthogonal tangent vectors for the triangle vertices.
|
|
For each vertex the normal and tangent vectors are derived from all triangles
|
|
using the vertex which results in smooth tangents across the mesh.
|
|
In the process the triangle planes are calculated as well.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
int i;
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
idPlane *planesPtr = planes;
|
|
for ( i = 0; i < numIndexes; i += 3 ) {
|
|
idDrawVert *a, *b, *c;
|
|
unsigned int signBit;
|
|
float d0[5], d1[5], f, area;
|
|
idVec3 n, t0, t1;
|
|
|
|
int v0 = indexes[i + 0];
|
|
int v1 = indexes[i + 1];
|
|
int v2 = indexes[i + 2];
|
|
|
|
a = verts + v0;
|
|
b = verts + v1;
|
|
c = verts + v2;
|
|
|
|
d0[0] = b->xyz[0] - a->xyz[0];
|
|
d0[1] = b->xyz[1] - a->xyz[1];
|
|
d0[2] = b->xyz[2] - a->xyz[2];
|
|
d0[3] = b->st[0] - a->st[0];
|
|
d0[4] = b->st[1] - a->st[1];
|
|
|
|
d1[0] = c->xyz[0] - a->xyz[0];
|
|
d1[1] = c->xyz[1] - a->xyz[1];
|
|
d1[2] = c->xyz[2] - a->xyz[2];
|
|
d1[3] = c->st[0] - a->st[0];
|
|
d1[4] = c->st[1] - a->st[1];
|
|
|
|
// normal
|
|
n[0] = d1[1] * d0[2] - d1[2] * d0[1];
|
|
n[1] = d1[2] * d0[0] - d1[0] * d0[2];
|
|
n[2] = d1[0] * d0[1] - d1[1] * d0[0];
|
|
|
|
f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
|
|
|
|
n.x *= f;
|
|
n.y *= f;
|
|
n.z *= f;
|
|
|
|
planesPtr->SetNormal( n );
|
|
planesPtr->FitThroughPoint( a->xyz );
|
|
planesPtr++;
|
|
|
|
// area sign bit
|
|
area = d0[3] * d1[4] - d0[4] * d1[3];
|
|
signBit = ( *(unsigned int *)&area ) & ( 1 << 31 );
|
|
|
|
// first tangent
|
|
t0[0] = d0[0] * d1[4] - d0[4] * d1[0];
|
|
t0[1] = d0[1] * d1[4] - d0[4] * d1[1];
|
|
t0[2] = d0[2] * d1[4] - d0[4] * d1[2];
|
|
|
|
f = idMath::RSqrt( t0.x * t0.x + t0.y * t0.y + t0.z * t0.z );
|
|
*(unsigned int *)&f ^= signBit;
|
|
|
|
t0.x *= f;
|
|
t0.y *= f;
|
|
t0.z *= f;
|
|
|
|
// second tangent
|
|
t1[0] = d0[3] * d1[0] - d0[0] * d1[3];
|
|
t1[1] = d0[3] * d1[1] - d0[1] * d1[3];
|
|
t1[2] = d0[3] * d1[2] - d0[2] * d1[3];
|
|
|
|
f = idMath::RSqrt( t1.x * t1.x + t1.y * t1.y + t1.z * t1.z );
|
|
*(unsigned int *)&f ^= signBit;
|
|
|
|
t1.x *= f;
|
|
t1.y *= f;
|
|
t1.z *= f;
|
|
|
|
if ( used[v0] ) {
|
|
a->normal += n;
|
|
a->tangents[0] += t0;
|
|
a->tangents[1] += t1;
|
|
} else {
|
|
a->normal = n;
|
|
a->tangents[0] = t0;
|
|
a->tangents[1] = t1;
|
|
used[v0] = true;
|
|
}
|
|
|
|
if ( used[v1] ) {
|
|
b->normal += n;
|
|
b->tangents[0] += t0;
|
|
b->tangents[1] += t1;
|
|
} else {
|
|
b->normal = n;
|
|
b->tangents[0] = t0;
|
|
b->tangents[1] = t1;
|
|
used[v1] = true;
|
|
}
|
|
|
|
if ( used[v2] ) {
|
|
c->normal += n;
|
|
c->tangents[0] += t0;
|
|
c->tangents[1] += t1;
|
|
} else {
|
|
c->normal = n;
|
|
c->tangents[0] = t0;
|
|
c->tangents[1] = t1;
|
|
used[v2] = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::DeriveUnsmoothedTangents
|
|
|
|
Derives the normal and orthogonal tangent vectors for the triangle vertices.
|
|
For each vertex the normal and tangent vectors are derived from a single dominant triangle.
|
|
============
|
|
*/
|
|
#define DERIVE_UNSMOOTHED_BITANGENT
|
|
|
|
void VPCALL idSIMD_Generic::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
|
|
int i;
|
|
|
|
for ( i = 0; i < numVerts; i++ ) {
|
|
idDrawVert *a, *b, *c;
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
float d3, d8;
|
|
#endif
|
|
float d0, d1, d2, d4;
|
|
float d5, d6, d7, d9;
|
|
float s0, s1, s2;
|
|
float n0, n1, n2;
|
|
float t0, t1, t2;
|
|
float t3, t4, t5;
|
|
|
|
const dominantTri_s &dt = dominantTris[i];
|
|
|
|
a = verts + i;
|
|
b = verts + dt.v2;
|
|
c = verts + dt.v3;
|
|
|
|
d0 = b->xyz[0] - a->xyz[0];
|
|
d1 = b->xyz[1] - a->xyz[1];
|
|
d2 = b->xyz[2] - a->xyz[2];
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
d3 = b->st[0] - a->st[0];
|
|
#endif
|
|
d4 = b->st[1] - a->st[1];
|
|
|
|
d5 = c->xyz[0] - a->xyz[0];
|
|
d6 = c->xyz[1] - a->xyz[1];
|
|
d7 = c->xyz[2] - a->xyz[2];
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
d8 = c->st[0] - a->st[0];
|
|
#endif
|
|
d9 = c->st[1] - a->st[1];
|
|
|
|
s0 = dt.normalizationScale[0];
|
|
s1 = dt.normalizationScale[1];
|
|
s2 = dt.normalizationScale[2];
|
|
|
|
n0 = s2 * ( d6 * d2 - d7 * d1 );
|
|
n1 = s2 * ( d7 * d0 - d5 * d2 );
|
|
n2 = s2 * ( d5 * d1 - d6 * d0 );
|
|
|
|
t0 = s0 * ( d0 * d9 - d4 * d5 );
|
|
t1 = s0 * ( d1 * d9 - d4 * d6 );
|
|
t2 = s0 * ( d2 * d9 - d4 * d7 );
|
|
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
t3 = s1 * ( d3 * d5 - d0 * d8 );
|
|
t4 = s1 * ( d3 * d6 - d1 * d8 );
|
|
t5 = s1 * ( d3 * d7 - d2 * d8 );
|
|
#else
|
|
t3 = s1 * ( n2 * t1 - n1 * t2 );
|
|
t4 = s1 * ( n0 * t2 - n2 * t0 );
|
|
t5 = s1 * ( n1 * t0 - n0 * t1 );
|
|
#endif
|
|
|
|
a->normal[0] = n0;
|
|
a->normal[1] = n1;
|
|
a->normal[2] = n2;
|
|
|
|
a->tangents[0][0] = t0;
|
|
a->tangents[0][1] = t1;
|
|
a->tangents[0][2] = t2;
|
|
|
|
a->tangents[1][0] = t3;
|
|
a->tangents[1][1] = t4;
|
|
a->tangents[1][2] = t5;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::NormalizeTangents
|
|
|
|
Normalizes each vertex normal and projects and normalizes the
|
|
tangent vectors onto the plane orthogonal to the vertex normal.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
idVec3 &v = verts[i].normal;
|
|
float f;
|
|
|
|
f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
|
|
v.x *= f; v.y *= f; v.z *= f;
|
|
|
|
for ( int j = 0; j < 2; j++ ) {
|
|
idVec3 &t = verts[i].tangents[j];
|
|
|
|
t -= ( t * v ) * v;
|
|
f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
|
|
t.x *= f; t.y *= f; t.z *= f;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CreateTextureSpaceLightVectors
|
|
|
|
Calculates light vectors in texture space for the given triangle vertices.
|
|
For each vertex the direction towards the light origin is projected onto texture space.
|
|
The light vectors are only calculated for the vertices referenced by the indexes.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
for ( int i = numIndexes - 1; i >= 0; i-- ) {
|
|
used[indexes[i]] = true;
|
|
}
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( !used[i] ) {
|
|
continue;
|
|
}
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
|
|
idVec3 lightDir = lightOrigin - v->xyz;
|
|
|
|
lightVectors[i][0] = lightDir * v->tangents[0];
|
|
lightVectors[i][1] = lightDir * v->tangents[1];
|
|
lightVectors[i][2] = lightDir * v->normal;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CreateSpecularTextureCoords
|
|
|
|
Calculates specular texture coordinates for the given triangle vertices.
|
|
For each vertex the normalized direction towards the light origin is added to the
|
|
normalized direction towards the view origin and the result is projected onto texture space.
|
|
The texture coordinates are only calculated for the vertices referenced by the indexes.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
for ( int i = numIndexes - 1; i >= 0; i-- ) {
|
|
used[indexes[i]] = true;
|
|
}
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( !used[i] ) {
|
|
continue;
|
|
}
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
|
|
idVec3 lightDir = lightOrigin - v->xyz;
|
|
idVec3 viewDir = viewOrigin - v->xyz;
|
|
|
|
float ilength;
|
|
|
|
ilength = idMath::RSqrt( lightDir * lightDir );
|
|
lightDir[0] *= ilength;
|
|
lightDir[1] *= ilength;
|
|
lightDir[2] *= ilength;
|
|
|
|
ilength = idMath::RSqrt( viewDir * viewDir );
|
|
viewDir[0] *= ilength;
|
|
viewDir[1] *= ilength;
|
|
viewDir[2] *= ilength;
|
|
|
|
lightDir += viewDir;
|
|
|
|
texCoords[i][0] = lightDir * v->tangents[0];
|
|
texCoords[i][1] = lightDir * v->tangents[1];
|
|
texCoords[i][2] = lightDir * v->normal;
|
|
texCoords[i][3] = 1.0f;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CreateShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
|
|
int outVerts = 0;
|
|
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
if ( vertRemap[i] ) {
|
|
continue;
|
|
}
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[outVerts+0][0] = v[0];
|
|
vertexCache[outVerts+0][1] = v[1];
|
|
vertexCache[outVerts+0][2] = v[2];
|
|
vertexCache[outVerts+0][3] = 1.0f;
|
|
|
|
// R_SetupProjection() builds the projection matrix with a slight crunch
|
|
// for depth, which keeps this w=0 division from rasterizing right at the
|
|
// wrap around point and causing depth fighting with the rear caps
|
|
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
|
|
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
|
|
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
|
|
vertexCache[outVerts+1][3] = 0.0f;
|
|
vertRemap[i] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
return outVerts;
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::CreateVertexProgramShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_Generic::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
|
|
for ( int i = 0; i < numVerts; i++ ) {
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[i*2+0][0] = v[0];
|
|
vertexCache[i*2+1][0] = v[0];
|
|
vertexCache[i*2+0][1] = v[1];
|
|
vertexCache[i*2+1][1] = v[1];
|
|
vertexCache[i*2+0][2] = v[2];
|
|
vertexCache[i*2+1][2] = v[2];
|
|
vertexCache[i*2+0][3] = 1.0f;
|
|
vertexCache[i*2+1][3] = 0.0f;
|
|
}
|
|
return numVerts * 2;
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::UpSamplePCMTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
============
|
|
*/
|
|
void idSIMD_Generic::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
for ( int i = 0; i < numSamples; i += 2 ) {
|
|
dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
|
|
dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
for ( int i = 0; i < numSamples; i += 2 ) {
|
|
dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
|
|
dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
dest[i] = (float) src[i];
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::UpSampleOGGTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
============
|
|
*/
|
|
void idSIMD_Generic::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
for ( int i = 0; i < numSamples >> 1; i++ ) {
|
|
dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
|
|
dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
for ( int i = 0; i < numSamples >> 1; i++ ) {
|
|
dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
|
|
dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
if ( numChannels == 1 ) {
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
dest[i*1+0] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
for ( int i = 0; i < numSamples >> 1; i++ ) {
|
|
dest[i*2+0] = ogg[0][i] * 32768.0f;
|
|
dest[i*2+1] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MixSoundTwoSpeakerMono
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
float sL = lastV[0];
|
|
float sR = lastV[1];
|
|
float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
|
|
mixBuffer[j*2+0] += samples[j] * sL;
|
|
mixBuffer[j*2+1] += samples[j] * sR;
|
|
sL += incL;
|
|
sR += incR;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MixSoundTwoSpeakerStereo
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
float sL = lastV[0];
|
|
float sR = lastV[1];
|
|
float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) {
|
|
mixBuffer[j*2+0] += samples[j*2+0] * sL;
|
|
mixBuffer[j*2+1] += samples[j*2+1] * sR;
|
|
sL += incL;
|
|
sR += incR;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MixSoundSixSpeakerMono
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
float sL0 = lastV[0];
|
|
float sL1 = lastV[1];
|
|
float sL2 = lastV[2];
|
|
float sL3 = lastV[3];
|
|
float sL4 = lastV[4];
|
|
float sL5 = lastV[5];
|
|
|
|
float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
|
|
mixBuffer[i*6+0] += samples[i] * sL0;
|
|
mixBuffer[i*6+1] += samples[i] * sL1;
|
|
mixBuffer[i*6+2] += samples[i] * sL2;
|
|
mixBuffer[i*6+3] += samples[i] * sL3;
|
|
mixBuffer[i*6+4] += samples[i] * sL4;
|
|
mixBuffer[i*6+5] += samples[i] * sL5;
|
|
sL0 += incL0;
|
|
sL1 += incL1;
|
|
sL2 += incL2;
|
|
sL3 += incL3;
|
|
sL4 += incL4;
|
|
sL5 += incL5;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MixSoundSixSpeakerStereo
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
float sL0 = lastV[0];
|
|
float sL1 = lastV[1];
|
|
float sL2 = lastV[2];
|
|
float sL3 = lastV[3];
|
|
float sL4 = lastV[4];
|
|
float sL5 = lastV[5];
|
|
|
|
float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
|
|
mixBuffer[i*6+0] += samples[i*2+0] * sL0;
|
|
mixBuffer[i*6+1] += samples[i*2+1] * sL1;
|
|
mixBuffer[i*6+2] += samples[i*2+0] * sL2;
|
|
mixBuffer[i*6+3] += samples[i*2+0] * sL3;
|
|
mixBuffer[i*6+4] += samples[i*2+0] * sL4;
|
|
mixBuffer[i*6+5] += samples[i*2+1] * sL5;
|
|
sL0 += incL0;
|
|
sL1 += incL1;
|
|
sL2 += incL2;
|
|
sL3 += incL3;
|
|
sL4 += incL4;
|
|
sL5 += incL5;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_Generic::MixedSoundToSamples
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_Generic::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
|
|
|
|
for ( int i = 0; i < numSamples; i++ ) {
|
|
if ( mixBuffer[i] <= -32768.0f ) {
|
|
samples[i] = -32768;
|
|
} else if ( mixBuffer[i] >= 32767.0f ) {
|
|
samples[i] = 32767;
|
|
} else {
|
|
samples[i] = (short) mixBuffer[i];
|
|
}
|
|
}
|
|
}
|