etqw-sdk/source/idlib/math/Simd_Generic.cpp

4263 lines
112 KiB
C++
Raw Normal View History

2008-05-29 00:00:00 +00:00
// Copyright (C) 2007 Id Software, Inc.
//
#include "../precompiled.h"
#pragma hdrstop
#include "Simd_Generic.h"
#pragma warning( disable : 4244 )
//===============================================================
//
// Generic implementation of idSIMDProcessor
//
//===============================================================
// NODEFAULT is placed inside a switch whose listed cases are meant to cover every possible value:
//   - _DEBUG builds:       adds a default label that fires assert( 0 )
//   - other _WIN32 builds: adds a default label containing __assume( 0 ), MSVC's hint that the label is unreachable
//   - everything else:     expands to nothing
#ifdef _DEBUG
#define NODEFAULT default: assert( 0 )
#elif _WIN32
#define NODEFAULT default: __assume( 0 )
#else
#define NODEFAULT
#endif
// Loop helpers: each expands to a braced block that invokes the macro Y once for every index
// in [0, count), in ascending order. An integer named 'count' must be visible where the helper is expanded.
// Y receives expressions such as _IX+1, so Y must parenthesize its argument.
// UNROLL1: one invocation per loop iteration (no unrolling).
#define UNROLL1(Y) { int _IX; for (_IX=0;_IX<count;_IX++) {Y(_IX);} }
// UNROLL2: two invocations per iteration, followed by at most one trailing element.
#define UNROLL2(Y) { int _IX, _NM = count&0xfffffffe; for (_IX=0;_IX<_NM;_IX+=2){Y(_IX+0);Y(_IX+1);} if (_IX < count) {Y(_IX);}}
// UNROLL4: four invocations per iteration, followed by a single-step loop over the remainder.
#define UNROLL4(Y) { int _IX, _NM = count&0xfffffffc; for (_IX=0;_IX<_NM;_IX+=4){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);}for(;_IX<count;_IX++){Y(_IX);}}
// UNROLL8: eight invocations per iteration, then pairs, then at most one trailing element.
#define UNROLL8(Y) { int _IX, _NM = count&0xfffffff8; for (_IX=0;_IX<_NM;_IX+=8){Y(_IX+0);Y(_IX+1);Y(_IX+2);Y(_IX+3);Y(_IX+4);Y(_IX+5);Y(_IX+6);Y(_IX+7);} _NM = count&0xfffffffe; for(;_IX<_NM;_IX+=2){Y(_IX); Y(_IX+1);} if (_IX < count) {Y(_IX);} }
// NOTE(review): presumably these two switches select optimized code paths further down in this file;
// their uses are not visible in this part of the source - confirm.
#define OPTIMIZED_SHADOW_VOLUME_CODE
#define OPTIMIZED_TRI_PLANE_CODE
/*
============
idSIMD_Generic::GetName

  Returns the fixed string "generic code" that identifies this implementation.
============
*/
const char * idSIMD_Generic::GetName( void ) const {
return "generic code";
}
/*
============
idSIMD_Generic::Add

  Adds a scalar to every element of an array:
  dst[i] = src[i] + constant;		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Add( float *dst, const float constant, const float *src, const int count ) {
#define GENERIC_ADD_CONSTANT( idx )		dst[(idx)] = src[(idx)] + constant;
	UNROLL4( GENERIC_ADD_CONSTANT )
#undef GENERIC_ADD_CONSTANT
}
/*
============
idSIMD_Generic::Add

  Element-wise sum of two arrays:
  dst[i] = src0[i] + src1[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Add( float *dst, const float *src0, const float *src1, const int count ) {
#define GENERIC_ADD_ARRAYS( idx )		dst[(idx)] = src0[(idx)] + src1[(idx)];
	UNROLL4( GENERIC_ADD_ARRAYS )
#undef GENERIC_ADD_ARRAYS
}
/*
============
idSIMD_Generic::Sub

  Subtracts every element from a scalar:
  dst[i] = constant - src[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Sub( float *dst, const float constant, const float *src, const int count ) {
	// the constant is widened once, so each difference is formed in double and then stored as float
	const double minuend = constant;
#define GENERIC_SUB_FROM_CONSTANT( idx )	dst[(idx)] = minuend - src[(idx)];
	UNROLL4( GENERIC_SUB_FROM_CONSTANT )
#undef GENERIC_SUB_FROM_CONSTANT
}
/*
============
idSIMD_Generic::Sub

  Element-wise difference of two arrays:
  dst[i] = src0[i] - src1[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Sub( float *dst, const float *src0, const float *src1, const int count ) {
#define GENERIC_SUB_ARRAYS( idx )		dst[(idx)] = src0[(idx)] - src1[(idx)];
	UNROLL4( GENERIC_SUB_ARRAYS )
#undef GENERIC_SUB_ARRAYS
}
/*
============
idSIMD_Generic::Mul

  Scales every element by a scalar:
  dst[i] = constant * src0[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Mul( float *dst, const float constant, const float *src0, const int count) {
	// the constant is widened once, so each product is formed in double and then stored as float
	const double scale = constant;
#define GENERIC_MUL_CONSTANT( idx )		dst[(idx)] = scale * src0[(idx)]
	UNROLL4( GENERIC_MUL_CONSTANT )
#undef GENERIC_MUL_CONSTANT
}
/*
============
idSIMD_Generic::Mul

  Element-wise product of two arrays:
  dst[i] = src0[i] * src1[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Mul( float *dst, const float *src0, const float *src1, const int count ) {
#define GENERIC_MUL_ARRAYS( idx )		dst[(idx)] = src0[(idx)] * src1[(idx)]
	UNROLL4( GENERIC_MUL_ARRAYS )
#undef GENERIC_MUL_ARRAYS
}
/*
============
idSIMD_Generic::Div

  Divides a scalar by every element:
  dst[i] = constant / divisor[i];	for i in [0, count)

  Divisors are not checked for zero.
============
*/
void VPCALL idSIMD_Generic::Div( float *dst, const float constant, const float *divisor, const int count ) {
	// the constant is widened once, so each quotient is formed in double and then stored as float
	const double numerator = constant;
#define GENERIC_DIV_CONSTANT( idx )		dst[(idx)] = numerator / divisor[(idx)]
	UNROLL4( GENERIC_DIV_CONSTANT )
#undef GENERIC_DIV_CONSTANT
}
/*
============
idSIMD_Generic::Div

  Element-wise quotient of two arrays:
  dst[i] = src0[i] / src1[i];		for i in [0, count)

  Divisors are not checked for zero.
============
*/
void VPCALL idSIMD_Generic::Div( float *dst, const float *src0, const float *src1, const int count ) {
#define GENERIC_DIV_ARRAYS( idx )		dst[(idx)] = src0[(idx)] / src1[(idx)]
	UNROLL4( GENERIC_DIV_ARRAYS )
#undef GENERIC_DIV_ARRAYS
}
/*
============
idSIMD_Generic::MulAdd

  Accumulates a scaled array into dst:
  dst[i] += constant * src[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	// the constant is widened once, so product and sum are formed in double before the store to float
	const double scale = constant;
#define GENERIC_MULADD_CONSTANT( idx )	dst[(idx)] += scale * src[(idx)]
	UNROLL4( GENERIC_MULADD_CONSTANT )
#undef GENERIC_MULADD_CONSTANT
}
/*
============
idSIMD_Generic::MulAdd

  Accumulates the element-wise product of two arrays into dst:
  dst[i] += src0[i] * src1[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
#define GENERIC_MULADD_ARRAYS( idx )	dst[(idx)] += src0[(idx)] * src1[(idx)]
	UNROLL4( GENERIC_MULADD_ARRAYS )
#undef GENERIC_MULADD_ARRAYS
}
/*
============
idSIMD_Generic::MulSub

  Subtracts a scaled array from dst:
  dst[i] -= constant * src[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::MulSub( float *dst, const float constant, const float *src, const int count ) {
	// the constant is widened once, so product and difference are formed in double before the store to float
	const double scale = constant;
#define GENERIC_MULSUB_CONSTANT( idx )	dst[(idx)] -= scale * src[(idx)]
	UNROLL4( GENERIC_MULSUB_CONSTANT )
#undef GENERIC_MULSUB_CONSTANT
}
/*
============
idSIMD_Generic::MulSub

  Subtracts the element-wise product of two arrays from dst:
  dst[i] -= src0[i] * src1[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
#define GENERIC_MULSUB_ARRAYS( idx )	dst[(idx)] -= src0[(idx)] * src1[(idx)]
	UNROLL4( GENERIC_MULSUB_ARRAYS )
#undef GENERIC_MULSUB_ARRAYS
}
/*
============
idSIMD_Generic::Dot

  One fixed vector against an array of vectors:
  dst[i] = constant * src[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant * src[i];
	}
}
/*
============
idSIMD_Generic::Dot

  One fixed vector against an array of planes:
  dst[i] = constant * src[i].Normal() + src[i][3];	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant * src[i].Normal() + src[i][3];
	}
}
/*
============
idSIMD_Generic::Dot

  One fixed vector against the positions of an array of draw vertices:
  dst[i] = constant * src[i].xyz;	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant * src[i].xyz;
	}
}
/*
============
idSIMD_Generic::Dot

  One fixed plane against an array of vectors:
  dst[i] = constant.Normal() * src[i] + constant[3];	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant.Normal() * src[i] + constant[3];
	}
}
/*
============
idSIMD_Generic::Dot

  One fixed plane against an array of planes:
  dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
	}
}
/*
============
idSIMD_Generic::Dot

  One fixed plane against the positions of an array of draw vertices:
  dst[i] = constant.Normal() * src[i].xyz + constant[3];	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = constant.Normal() * src[i].xyz + constant[3];
	}
}
/*
============
idSIMD_Generic::Dot

  Pairs up two vector arrays element by element:
  dst[i] = src0[i] * src1[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = src0[i] * src1[i];
	}
}
/*
============
idSIMD_Generic::Dot

  dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...

  Counts 0 through 3 are written out directly. For larger counts the first
  four products seed four double-precision partial sums, the main loop then
  consumes eight elements per pass, and a fall-through switch adds the
  remaining 0..7 elements. The partial sums are combined as s3 + s2 + s1 + s0
  and the result is stored to the float reference 'dot'.

  The #else branch is a plain reference loop that is compiled out by default.
============
*/
void VPCALL idSIMD_Generic::Dot( float &dot, const float *src1, const float *src2, const int count ) {
#if 1
	switch( count ) {
		case 0: {
			dot = 0.0f;
			return;
		}
		case 1: {
			dot = src1[0] * src2[0];
			return;
		}
		case 2: {
			dot = src1[0] * src2[0] + src1[1] * src2[1];
			return;
		}
		case 3: {
			dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2];
			return;
		}
		default: {
			int i;
			double s0, s1, s2, s3;
			// seed the four accumulators with elements 0..3
			s0 = src1[0] * src2[0];
			s1 = src1[1] * src2[1];
			s2 = src1[2] * src2[2];
			s3 = src1[3] * src2[3];
			// eight elements per pass; the loop stops once fewer than eight remain
			for ( i = 4; i < count-7; i += 8 ) {
				s0 += src1[i+0] * src2[i+0];
				s1 += src1[i+1] * src2[i+1];
				s2 += src1[i+2] * src2[i+2];
				s3 += src1[i+3] * src2[i+3];
				s0 += src1[i+4] * src2[i+4];
				s1 += src1[i+5] * src2[i+5];
				s2 += src1[i+6] * src2[i+6];
				s3 += src1[i+7] * src2[i+7];
			}
			// count - i is in [0, 7] here; every case deliberately falls through to the next
			switch( count - i ) {
				NODEFAULT;
				case 7: s0 += src1[i+6] * src2[i+6];
				case 6: s1 += src1[i+5] * src2[i+5];
				case 5: s2 += src1[i+4] * src2[i+4];
				case 4: s3 += src1[i+3] * src2[i+3];
				case 3: s0 += src1[i+2] * src2[i+2];
				case 2: s1 += src1[i+1] * src2[i+1];
				case 1: s2 += src1[i+0] * src2[i+0];
				case 0: break;
			}
			double sum;
			sum = s3;
			sum += s2;
			sum += s1;
			sum += s0;
			dot = sum;
		}
	}
#else
	// reference implementation; 'i' has to be declared here because the
	// declaration in the optimized path above is not compiled in this branch
	int i;
	dot = 0.0f;
	for ( i = 0; i < count; i++ ) {
		dot += src1[i] * src2[i];
	}
#endif
}
/*
============
idSIMD_Generic::CmpGT

  Stores the result of a greater-than test per element:
  dst[i] = src0[i] > constant;		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
#define GENERIC_STORE_GT( idx )		dst[(idx)] = src0[(idx)] > constant;
	UNROLL4( GENERIC_STORE_GT )
#undef GENERIC_STORE_GT
}
/*
============
idSIMD_Generic::CmpGT

  ORs the result of a greater-than test into bit 'bitNum' of each element:
  dst[i] |= ( src0[i] > constant ) << bitNum;	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
#define GENERIC_OR_GT( idx )		dst[(idx)] |= ( src0[(idx)] > constant ) << bitNum;
	UNROLL4( GENERIC_OR_GT )
#undef GENERIC_OR_GT
}
/*
============
idSIMD_Generic::CmpGE

  Stores the result of a greater-or-equal test per element:
  dst[i] = src0[i] >= constant;		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
#define GENERIC_STORE_GE( idx )		dst[(idx)] = src0[(idx)] >= constant;
	UNROLL4( GENERIC_STORE_GE )
#undef GENERIC_STORE_GE
}
/*
============
idSIMD_Generic::CmpGE

  ORs the result of a greater-or-equal test into bit 'bitNum' of each element:
  dst[i] |= ( src0[i] >= constant ) << bitNum;	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
#define GENERIC_OR_GE( idx )		dst[(idx)] |= ( src0[(idx)] >= constant ) << bitNum;
	UNROLL4( GENERIC_OR_GE )
#undef GENERIC_OR_GE
}
/*
============
idSIMD_Generic::CmpLT

  Stores the result of a less-than test per element:
  dst[i] = src0[i] < constant;		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
#define GENERIC_STORE_LT( idx )		dst[(idx)] = src0[(idx)] < constant;
	UNROLL4( GENERIC_STORE_LT )
#undef GENERIC_STORE_LT
}
/*
============
idSIMD_Generic::SetCmpLT

  Overwrites each element with the less-than result shifted to bit 'bitNum':
  dst[i] = ( src0[i] < constant ) << bitNum;	for i in [0, count)

  Unlike the bitNum overload of CmpLT this assigns rather than ORs, so any
  bits previously held in dst[i] are discarded.
============
*/
void VPCALL idSIMD_Generic::SetCmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
#define GENERIC_SET_LT( idx )		dst[(idx)] = ( src0[(idx)] < constant ) << bitNum;
	UNROLL4( GENERIC_SET_LT )
#undef GENERIC_SET_LT
}
/*
============
idSIMD_Generic::CmpLT

  ORs the result of a less-than test into bit 'bitNum' of each element:
  dst[i] |= ( src0[i] < constant ) << bitNum;	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
#define GENERIC_OR_LT( idx )		dst[(idx)] |= ( src0[(idx)] < constant ) << bitNum;
	UNROLL4( GENERIC_OR_LT )
#undef GENERIC_OR_LT
}
/*
============
idSIMD_Generic::CmpLE

  Stores the result of a less-or-equal test per element:
  dst[i] = src0[i] <= constant;		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
#define GENERIC_STORE_LE( idx )		dst[(idx)] = src0[(idx)] <= constant;
	UNROLL4( GENERIC_STORE_LE )
#undef GENERIC_STORE_LE
}
/*
============
idSIMD_Generic::CmpLE

  ORs the result of a less-or-equal test into bit 'bitNum' of each element:
  dst[i] |= ( src0[i] <= constant ) << bitNum;	for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
#define GENERIC_OR_LE( idx )		dst[(idx)] |= ( src0[(idx)] <= constant ) << bitNum;
	UNROLL4( GENERIC_OR_LE )
#undef GENERIC_OR_LE
}
/*
============
idSIMD_Generic::MinMax

  Finds the smallest and largest value in src[0..count-1].
  With count <= 0 the outputs keep their initial values:
  min = idMath::INFINITY and max = -idMath::INFINITY.
============
*/
void VPCALL idSIMD_Generic::MinMax( float &min, float &max, const float *src, const int count ) {
	min = idMath::INFINITY;
	max = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		if ( src[i] < min ) {
			min = src[i];
		}
		if ( src[i] > max ) {
			max = src[i];
		}
	}
}
/*
============
idSIMD_Generic::MinMax

  Per-component minimum and maximum over an array of 2D vectors.
  With count <= 0 every component of min stays at idMath::INFINITY and
  every component of max at -idMath::INFINITY.
============
*/
void VPCALL idSIMD_Generic::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
	min[0] = min[1] = idMath::INFINITY;
	max[0] = max[1] = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		const idVec2 &v = src[i];
		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}
		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}
	}
}
/*
============
idSIMD_Generic::MinMax

  Per-component minimum and maximum over an array of 3D vectors.
  With count <= 0 every component of min stays at idMath::INFINITY and
  every component of max at -idMath::INFINITY.
============
*/
void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY;
	max[0] = max[1] = max[2] = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		const idVec3 &v = src[i];
		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}
		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
	}
}
/*
============
idSIMD_Generic::MinMax

  Per-component minimum and maximum over the xyz member of an array of
  draw vertices. With count <= 0 every component of min stays at
  idMath::INFINITY and every component of max at -idMath::INFINITY.
============
*/
void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY;
	max[0] = max[1] = max[2] = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		const idVec3 &v = src[i].xyz;
		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}
		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
	}
}
/*
============
idSIMD_Generic::MinMax

  Per-component minimum and maximum over the xyz member of the draw
  vertices selected by indexes[0..count-1]; 'count' is the number of
  indexes, and each index is used as-is without a range check.
  With count <= 0 every component of min stays at idMath::INFINITY and
  every component of max at -idMath::INFINITY.
============
*/
void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const vertIndex_t *indexes, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY;
	max[0] = max[1] = max[2] = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		const idVec3 &v = src[indexes[i]].xyz;
		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}
		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
	}
}
/*
============
idSIMD_Generic::MinMax

  Per-component minimum and maximum over xyz.ToVec3() of an array of
  shadow cache entries. With count <= 0 every component of min stays at
  idMath::INFINITY and every component of max at -idMath::INFINITY.
============
*/
void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const struct shadowCache_s *src, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY;
	max[0] = max[1] = max[2] = -idMath::INFINITY;
	for ( int i = 0; i < count; i++ ) {
		const idVec3 &v = src[i].xyz.ToVec3();
		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}
		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
	}
}
/*
============
idSIMD_Generic::Clamp

  Limits every element to [min, max]:
  dst[i] = min if src[i] < min, else max if src[i] > max, else src[i].
  The lower bound is tested first.
============
*/
void VPCALL idSIMD_Generic::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		if ( src[i] < min ) {
			dst[i] = min;
		} else if ( src[i] > max ) {
			dst[i] = max;
		} else {
			dst[i] = src[i];
		}
	}
}
/*
============
idSIMD_Generic::ClampMin

  Raises every element that is below 'min' up to 'min':
  dst[i] = src[i] < min ? min : src[i];
============
*/
void VPCALL idSIMD_Generic::ClampMin( float *dst, const float *src, const float min, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		if ( src[i] < min ) {
			dst[i] = min;
		} else {
			dst[i] = src[i];
		}
	}
}
/*
============
idSIMD_Generic::ClampMax

  Lowers every element that is above 'max' down to 'max':
  dst[i] = src[i] > max ? max : src[i];
============
*/
void VPCALL idSIMD_Generic::ClampMax( float *dst, const float *src, const float max, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		if ( src[i] > max ) {
			dst[i] = max;
		} else {
			dst[i] = src[i];
		}
	}
}
/*
================
idSIMD_Generic::Memcpy

  Copies 'count' bytes from src to dst by forwarding to the C library memcpy.
================
*/
void VPCALL idSIMD_Generic::Memcpy( void *dst, const void *src, const int count ) {
memcpy( dst, src, count );
}
/*
================
idSIMD_Generic::Memset

  Fills 'count' bytes at dst with the byte value 'val' by forwarding to the C library memset.
================
*/
void VPCALL idSIMD_Generic::Memset( void *dst, const int val, const int count ) {
memset( dst, val, count );
}
/*
============
idSIMD_Generic::Zero16

  Clears 'count' floats starting at dst; 'count' is an element count, not a byte count.
  NOTE(review): the "16" suffix presumably refers to 16-byte aligned/padded buffers - confirm against the callers.
============
*/
void VPCALL idSIMD_Generic::Zero16( float *dst, const int count ) {
// count is in floats, memset wants bytes
memset( dst, 0, count * sizeof( float ) );
}
/*
============
idSIMD_Generic::Negate16

  Negates 'count' floats in place by toggling bit 31 of each element's
  32-bit representation (the IEEE single precision sign bit).

  The mask is built from an unsigned literal: shifting a signed 1 into
  bit 31 is not portable before C++20.
============
*/
void VPCALL idSIMD_Generic::Negate16( float *dst, const int count ) {
	const unsigned int signBit = 1u << 31;		// IEEE 32 bits float sign bit
	unsigned int *bits = reinterpret_cast<unsigned int *>(dst);
	for ( int i = 0; i < count; i++ ) {
		bits[i] ^= signBit;
	}
}
/*
============
idSIMD_Generic::Copy16

  Copies 'count' floats, lowest index first:
  dst[i] = src[i];
============
*/
void VPCALL idSIMD_Generic::Copy16( float *dst, const float *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = src[i];
	}
}
/*
============
idSIMD_Generic::Add16

  Element-wise sum of two arrays:
  dst[i] = src1[i] + src2[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Add16( float *dst, const float *src1, const float *src2, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = src1[i] + src2[i];
	}
}
/*
============
idSIMD_Generic::Sub16

  Element-wise difference of two arrays:
  dst[i] = src1[i] - src2[i];		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = src1[i] - src2[i];
	}
}
/*
============
idSIMD_Generic::Mul16

  Scales an array by a scalar:
  dst[i] = src1[i] * constant;		for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::Mul16( float *dst, const float *src1, const float constant, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] = src1[i] * constant;
	}
}
/*
============
idSIMD_Generic::AddAssign16

  Adds an array onto dst in place:
  dst[i] += src[i];			for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::AddAssign16( float *dst, const float *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] += src[i];
	}
}
/*
============
idSIMD_Generic::SubAssign16

  Subtracts an array from dst in place:
  dst[i] -= src[i];			for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::SubAssign16( float *dst, const float *src, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] -= src[i];
	}
}
/*
============
idSIMD_Generic::MulAssign16

  Scales dst in place by a scalar:
  dst[i] *= constant;			for i in [0, count)
============
*/
void VPCALL idSIMD_Generic::MulAssign16( float *dst, const float constant, const int count ) {
	for ( int i = 0; i < count; i++ ) {
		dst[i] *= constant;
	}
}
/*
============
idSIMD_Generic::MatX_MultiplyVecX

  dst = mat * vec

  The matrix data is walked row by row, numColumns floats per row. Column
  counts 1 through 6 have a fully written out sum; any other column count
  goes through the generic inner loop. Only the first GetNumRows() elements
  of dst are written.
============
*/
void VPCALL idSIMD_Generic::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	const float *row = mat.ToFloatPtr();		// first element of the current matrix row
	const float *v = vec.ToFloatPtr();
	float *out = dst.ToFloatPtr();
	const int numRows = mat.GetNumRows();
	const int numColumns = mat.GetNumColumns();

	switch( numColumns ) {
		case 1:
			for ( int r = 0; r < numRows; r++, row += 1 ) {
				out[r] = row[0] * v[0];
			}
			break;
		case 2:
			for ( int r = 0; r < numRows; r++, row += 2 ) {
				out[r] = row[0] * v[0] + row[1] * v[1];
			}
			break;
		case 3:
			for ( int r = 0; r < numRows; r++, row += 3 ) {
				out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2];
			}
			break;
		case 4:
			for ( int r = 0; r < numRows; r++, row += 4 ) {
				out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3];
			}
			break;
		case 5:
			for ( int r = 0; r < numRows; r++, row += 5 ) {
				out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3] + row[4] * v[4];
			}
			break;
		case 6:
			for ( int r = 0; r < numRows; r++, row += 6 ) {
				out[r] = row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3] + row[4] * v[4] + row[5] * v[5];
			}
			break;
		default:
			for ( int r = 0; r < numRows; r++, row += numColumns ) {
				// the first product seeds the sum, the remaining columns are accumulated
				float sum = row[0] * v[0];
				for ( int c = 1; c < numColumns; c++ ) {
					sum += row[c] * v[c];
				}
				out[r] = sum;
			}
			break;
	}
}
/*
============
idSIMD_Generic::MatX_MultiplyAddVecX

  dst += mat * vec

  The matrix data is walked row by row, numColumns floats per row. Column
  counts 1 through 6 have a fully written out sum; any other column count
  goes through the generic inner loop. Only the first GetNumRows() elements
  of dst are updated.
============
*/
void VPCALL idSIMD_Generic::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	const float *row = mat.ToFloatPtr();		// first element of the current matrix row
	const float *v = vec.ToFloatPtr();
	float *out = dst.ToFloatPtr();
	const int numRows = mat.GetNumRows();
	const int numColumns = mat.GetNumColumns();

	switch( numColumns ) {
		case 1:
			for ( int r = 0; r < numRows; r++, row += 1 ) {
				out[r] += row[0] * v[0];
			}
			break;
		case 2:
			for ( int r = 0; r < numRows; r++, row += 2 ) {
				out[r] += row[0] * v[0] + row[1] * v[1];
			}
			break;
		case 3:
			for ( int r = 0; r < numRows; r++, row += 3 ) {
				out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2];
			}
			break;
		case 4:
			for ( int r = 0; r < numRows; r++, row += 4 ) {
				out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3];
			}
			break;
		case 5:
			for ( int r = 0; r < numRows; r++, row += 5 ) {
				out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3] + row[4] * v[4];
			}
			break;
		case 6:
			for ( int r = 0; r < numRows; r++, row += 6 ) {
				out[r] += row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3] + row[4] * v[4] + row[5] * v[5];
			}
			break;
		default:
			for ( int r = 0; r < numRows; r++, row += numColumns ) {
				// the first product seeds the sum, the remaining columns are accumulated
				float sum = row[0] * v[0];
				for ( int c = 1; c < numColumns; c++ ) {
					sum += row[c] * v[c];
				}
				out[r] += sum;
			}
			break;
	}
}
/*
============
idSIMD_Generic::MatX_MultiplySubVecX

  dst -= mat * vec

  The matrix data is walked row by row, numColumns floats per row. Column
  counts 1 through 6 have a fully written out sum; any other column count
  goes through the generic inner loop. Only the first GetNumRows() elements
  of dst are updated.
============
*/
void VPCALL idSIMD_Generic::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	const float *row = mat.ToFloatPtr();		// first element of the current matrix row
	const float *v = vec.ToFloatPtr();
	float *out = dst.ToFloatPtr();
	const int numRows = mat.GetNumRows();
	const int numColumns = mat.GetNumColumns();

	switch( numColumns ) {
		case 1:
			for ( int r = 0; r < numRows; r++, row += 1 ) {
				out[r] -= row[0] * v[0];
			}
			break;
		case 2:
			for ( int r = 0; r < numRows; r++, row += 2 ) {
				out[r] -= row[0] * v[0] + row[1] * v[1];
			}
			break;
		case 3:
			for ( int r = 0; r < numRows; r++, row += 3 ) {
				out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2];
			}
			break;
		case 4:
			for ( int r = 0; r < numRows; r++, row += 4 ) {
				out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3];
			}
			break;
		case 5:
			for ( int r = 0; r < numRows; r++, row += 5 ) {
				out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3] + row[4] * v[4];
			}
			break;
		case 6:
			for ( int r = 0; r < numRows; r++, row += 6 ) {
				out[r] -= row[0] * v[0] + row[1] * v[1] + row[2] * v[2] +
							row[3] * v[3] + row[4] * v[4] + row[5] * v[5];
			}
			break;
		default:
			for ( int r = 0; r < numRows; r++, row += numColumns ) {
				// the first product seeds the sum, the remaining columns are accumulated
				float sum = row[0] * v[0];
				for ( int c = 1; c < numColumns; c++ ) {
					sum += row[c] * v[c];
				}
				out[r] -= sum;
			}
			break;
	}
}
/*
============
idSIMD_Generic::MatX_TransposeMultiplyVecX

  dst = transpose( mat ) * vec

  The matrix is not transposed in memory; each output element walks one
  matrix column by stepping numColumns floats from row to row. Row counts
  1 through 6 have a fully written out sum; any other row count goes
  through the generic inner loop. Only the first GetNumColumns() elements
  of dst are written.
============
*/
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	const float *col = mat.ToFloatPtr();		// top element of the current matrix column
	const float *v = vec.ToFloatPtr();
	float *out = dst.ToFloatPtr();
	const int numColumns = mat.GetNumColumns();
	const int numRows = mat.GetNumRows();

	switch( numRows ) {
		case 1:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] = col[0] * v[0];
			}
			break;
		case 2:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] = col[0] * v[0] + col[numColumns] * v[1];
			}
			break;
		case 3:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] = col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2];
			}
			break;
		case 4:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] = col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3];
			}
			break;
		case 5:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] = col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3] + col[4*numColumns] * v[4];
			}
			break;
		case 6:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] = col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3] + col[4*numColumns] * v[4] + col[5*numColumns] * v[5];
			}
			break;
		default:
			for ( int c = 0; c < numColumns; c++ ) {
				// restart at the top of column c, then step down one row at a time
				const float *elem = mat.ToFloatPtr() + c;
				float sum = elem[0] * v[0];
				for ( int r = 1; r < numRows; r++ ) {
					elem += numColumns;
					sum += elem[0] * v[r];
				}
				out[c] = sum;
			}
			break;
	}
}
/*
============
idSIMD_Generic::MatX_TransposeMultiplyAddVecX

  dst += transpose( mat ) * vec

  The matrix is not transposed in memory; each output element walks one
  matrix column by stepping numColumns floats from row to row. Row counts
  1 through 6 have a fully written out sum; any other row count goes
  through the generic inner loop. Only the first GetNumColumns() elements
  of dst are updated.
============
*/
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	const float *col = mat.ToFloatPtr();		// top element of the current matrix column
	const float *v = vec.ToFloatPtr();
	float *out = dst.ToFloatPtr();
	const int numColumns = mat.GetNumColumns();
	const int numRows = mat.GetNumRows();

	switch( numRows ) {
		case 1:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] += col[0] * v[0];
			}
			break;
		case 2:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] += col[0] * v[0] + col[numColumns] * v[1];
			}
			break;
		case 3:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] += col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2];
			}
			break;
		case 4:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] += col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3];
			}
			break;
		case 5:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] += col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3] + col[4*numColumns] * v[4];
			}
			break;
		case 6:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] += col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3] + col[4*numColumns] * v[4] + col[5*numColumns] * v[5];
			}
			break;
		default:
			for ( int c = 0; c < numColumns; c++ ) {
				// restart at the top of column c, then step down one row at a time
				const float *elem = mat.ToFloatPtr() + c;
				float sum = elem[0] * v[0];
				for ( int r = 1; r < numRows; r++ ) {
					elem += numColumns;
					sum += elem[0] * v[r];
				}
				out[c] += sum;
			}
			break;
	}
}
/*
============
idSIMD_Generic::MatX_TransposeMultiplySubVecX

  dst -= transpose( mat ) * vec

  The matrix is not transposed in memory; each output element walks one
  matrix column by stepping numColumns floats from row to row. Row counts
  1 through 6 have a fully written out sum; any other row count goes
  through the generic inner loop. Only the first GetNumColumns() elements
  of dst are updated.
============
*/
void VPCALL idSIMD_Generic::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	const float *col = mat.ToFloatPtr();		// top element of the current matrix column
	const float *v = vec.ToFloatPtr();
	float *out = dst.ToFloatPtr();
	const int numColumns = mat.GetNumColumns();
	const int numRows = mat.GetNumRows();

	switch( numRows ) {
		case 1:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] -= col[0] * v[0];
			}
			break;
		case 2:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] -= col[0] * v[0] + col[numColumns] * v[1];
			}
			break;
		case 3:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] -= col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2];
			}
			break;
		case 4:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] -= col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3];
			}
			break;
		case 5:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] -= col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3] + col[4*numColumns] * v[4];
			}
			break;
		case 6:
			for ( int c = 0; c < numColumns; c++, col++ ) {
				out[c] -= col[0] * v[0] + col[numColumns] * v[1] + col[2*numColumns] * v[2] +
							col[3*numColumns] * v[3] + col[4*numColumns] * v[4] + col[5*numColumns] * v[5];
			}
			break;
		default:
			for ( int c = 0; c < numColumns; c++ ) {
				// restart at the top of column c, then step down one row at a time
				const float *elem = mat.ToFloatPtr() + c;
				float sum = elem[0] * v[0];
				for ( int r = 1; r < numRows; r++ ) {
					elem += numColumns;
					sum += elem[0] * v[r];
				}
				out[c] -= sum;
			}
			break;
	}
}
/*
============
idSIMD_Generic::MatX_MultiplyMatX
optimizes the following matrix multiplications:
NxN * Nx6
6xN * Nx6
Nx6 * 6xN
6x6 * 6xN
with N in the range [1-6].
============
*/
void VPCALL idSIMD_Generic::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
	// dst = m1 * m2
	//
	// k = rows of m1 (rows of the result), l = columns of m2 (columns of the result).
	// The result is written as k * l consecutive floats starting at dst.ToFloatPtr();
	// the size of dst is not checked here.
	//
	// The outer switch selects on the shared dimension (columns of m1 == rows of m2).
	// Each case first tries a kernel with compile time strides and otherwise runs a
	// loop with the run time stride l.
	int i, j, k, l, n;
	float *dstPtr;
	const float *m1Ptr, *m2Ptr;
	double sum;

	assert( m1.GetNumColumns() == m2.GetNumRows() );

	dstPtr = dst.ToFloatPtr();
	m1Ptr = m1.ToFloatPtr();
	m2Ptr = m2.ToFloatPtr();
	k = m1.GetNumRows();
	l = m2.GetNumColumns();

	switch( m1.GetNumColumns() ) {
		case 1: {
			if ( l == 6 ) {
				for ( i = 0; i < k; i++ ) {		// Nx1 * 1x6
					*dstPtr++ = m1Ptr[i] * m2Ptr[0];
					*dstPtr++ = m1Ptr[i] * m2Ptr[1];
					*dstPtr++ = m1Ptr[i] * m2Ptr[2];
					*dstPtr++ = m1Ptr[i] * m2Ptr[3];
					*dstPtr++ = m1Ptr[i] * m2Ptr[4];
					*dstPtr++ = m1Ptr[i] * m2Ptr[5];
				}
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0];
					m2Ptr++;
				}
				m1Ptr++;
			}
			break;
		}
		case 2: {
			if ( l == 6 ) {
				for ( i = 0; i < k; i++ ) {		// Nx2 * 2x6
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6];
					*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7];
					*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8];
					*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9];
					*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10];
					*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11];
					m1Ptr += 2;
				}
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l];
					m2Ptr++;
				}
				m1Ptr += 2;
			}
			break;
		}
		case 3: {
			if ( l == 6 ) {
				for ( i = 0; i < k; i++ ) {		// Nx3 * 3x6
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12];
					*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13];
					*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14];
					*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15];
					*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16];
					*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17];
					m1Ptr += 3;
				}
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l];
					m2Ptr++;
				}
				m1Ptr += 3;
			}
			break;
		}
		case 4: {
			if ( l == 6 ) {
				for ( i = 0; i < k; i++ ) {		// Nx4 * 4x6
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18];
					*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19];
					*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20];
					*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21];
					*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22];
					*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23];
					m1Ptr += 4;
				}
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
									m1Ptr[3] * m2Ptr[3*l];
					m2Ptr++;
				}
				m1Ptr += 4;
			}
			break;
		}
		case 5: {
			if ( l == 6 ) {
				for ( i = 0; i < k; i++ ) {		// Nx5 * 5x6
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18] + m1Ptr[4] * m2Ptr[24];
					*dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19] + m1Ptr[4] * m2Ptr[25];
					*dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20] + m1Ptr[4] * m2Ptr[26];
					*dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21] + m1Ptr[4] * m2Ptr[27];
					*dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22] + m1Ptr[4] * m2Ptr[28];
					*dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23] + m1Ptr[4] * m2Ptr[29];
					m1Ptr += 5;
				}
				return;
			}
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
									m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l];
					m2Ptr++;
				}
				m1Ptr += 5;
			}
			break;
		}
		case 6: {
			// Shared dimension of 6: special kernels exist for the square results
			// (k == l, k in 1..5) and for a 6x6 left operand (k == 6, l in 1..6).
			// Every case that does not match its kernel must 'break' so that the
			// generic Nx6 loop after this inner switch handles it.
			switch( k ) {
				case 1: {
					if ( l == 1 ) {		// 1x6 * 6x1
						dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] +
									 m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5];
						return;
					}
					break;
				}
				case 2: {
					if ( l == 2 ) {		// 2x6 * 6x2
						for ( i = 0; i < 2; i++ ) {
							for ( j = 0; j < 2; j++ ) {
								*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
										+ m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
										+ m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
										+ m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
										+ m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
										+ m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
								dstPtr++;
							}
							m1Ptr += 6;
						}
						return;
					}
					break;
				}
				case 3: {
					if ( l == 3 ) {		// 3x6 * 6x3
						for ( i = 0; i < 3; i++ ) {
							for ( j = 0; j < 3; j++ ) {
								*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
										+ m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
										+ m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
										+ m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
										+ m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
										+ m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
								dstPtr++;
							}
							m1Ptr += 6;
						}
						return;
					}
					break;
				}
				case 4: {
					if ( l == 4 ) {		// 4x6 * 6x4
						for ( i = 0; i < 4; i++ ) {
							for ( j = 0; j < 4; j++ ) {
								*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
										+ m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
										+ m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
										+ m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
										+ m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
										+ m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
								dstPtr++;
							}
							m1Ptr += 6;
						}
						return;
					}
					// Without this break a 4x6 matrix fell through into the 5 and 6 row
					// kernels below, which read past the end of m1 and write past the
					// k * l floats of the result.
					break;
				}
				case 5: {
					if ( l == 5 ) {		// 5x6 * 6x5
						for ( i = 0; i < 5; i++ ) {
							for ( j = 0; j < 5; j++ ) {
								*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
										+ m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
										+ m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
										+ m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
										+ m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
										+ m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
								dstPtr++;
							}
							m1Ptr += 6;
						}
						return;
					}
					// Without this break a 5x6 matrix fell through into the 6 row kernels
					// below, with the same out of bounds accesses.
					break;
				}
				case 6: {
					switch( l ) {
						case 1: {		// 6x6 * 6x1
							for ( i = 0; i < 6; i++ ) {
								*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 1 ]
										+ m1Ptr[1] * m2Ptr[ 1 * 1 ]
										+ m1Ptr[2] * m2Ptr[ 2 * 1 ]
										+ m1Ptr[3] * m2Ptr[ 3 * 1 ]
										+ m1Ptr[4] * m2Ptr[ 4 * 1 ]
										+ m1Ptr[5] * m2Ptr[ 5 * 1 ];
								dstPtr++;
								m1Ptr += 6;
							}
							return;
						}
						case 2: {		// 6x6 * 6x2
							for ( i = 0; i < 6; i++ ) {
								for ( j = 0; j < 2; j++ ) {
									*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ]
											+ m1Ptr[1] * m2Ptr[ 1 * 2 + j ]
											+ m1Ptr[2] * m2Ptr[ 2 * 2 + j ]
											+ m1Ptr[3] * m2Ptr[ 3 * 2 + j ]
											+ m1Ptr[4] * m2Ptr[ 4 * 2 + j ]
											+ m1Ptr[5] * m2Ptr[ 5 * 2 + j ];
									dstPtr++;
								}
								m1Ptr += 6;
							}
							return;
						}
						case 3: {		// 6x6 * 6x3
							for ( i = 0; i < 6; i++ ) {
								for ( j = 0; j < 3; j++ ) {
									*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ]
											+ m1Ptr[1] * m2Ptr[ 1 * 3 + j ]
											+ m1Ptr[2] * m2Ptr[ 2 * 3 + j ]
											+ m1Ptr[3] * m2Ptr[ 3 * 3 + j ]
											+ m1Ptr[4] * m2Ptr[ 4 * 3 + j ]
											+ m1Ptr[5] * m2Ptr[ 5 * 3 + j ];
									dstPtr++;
								}
								m1Ptr += 6;
							}
							return;
						}
						case 4: {		// 6x6 * 6x4
							for ( i = 0; i < 6; i++ ) {
								for ( j = 0; j < 4; j++ ) {
									*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ]
											+ m1Ptr[1] * m2Ptr[ 1 * 4 + j ]
											+ m1Ptr[2] * m2Ptr[ 2 * 4 + j ]
											+ m1Ptr[3] * m2Ptr[ 3 * 4 + j ]
											+ m1Ptr[4] * m2Ptr[ 4 * 4 + j ]
											+ m1Ptr[5] * m2Ptr[ 5 * 4 + j ];
									dstPtr++;
								}
								m1Ptr += 6;
							}
							return;
						}
						case 5: {		// 6x6 * 6x5
							for ( i = 0; i < 6; i++ ) {
								for ( j = 0; j < 5; j++ ) {
									*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ]
											+ m1Ptr[1] * m2Ptr[ 1 * 5 + j ]
											+ m1Ptr[2] * m2Ptr[ 2 * 5 + j ]
											+ m1Ptr[3] * m2Ptr[ 3 * 5 + j ]
											+ m1Ptr[4] * m2Ptr[ 4 * 5 + j ]
											+ m1Ptr[5] * m2Ptr[ 5 * 5 + j ];
									dstPtr++;
								}
								m1Ptr += 6;
							}
							return;
						}
						case 6: {		// 6x6 * 6x6
							for ( i = 0; i < 6; i++ ) {
								for ( j = 0; j < 6; j++ ) {
									*dstPtr = m1Ptr[0] * m2Ptr[ 0 * 6 + j ]
											+ m1Ptr[1] * m2Ptr[ 1 * 6 + j ]
											+ m1Ptr[2] * m2Ptr[ 2 * 6 + j ]
											+ m1Ptr[3] * m2Ptr[ 3 * 6 + j ]
											+ m1Ptr[4] * m2Ptr[ 4 * 6 + j ]
											+ m1Ptr[5] * m2Ptr[ 5 * 6 + j ];
									dstPtr++;
								}
								m1Ptr += 6;
							}
							return;
						}
					}
				}
			}
			// generic Nx6 * 6xM for every shape not handled by a kernel above
			for ( i = 0; i < k; i++ ) {
				m2Ptr = m2.ToFloatPtr();
				for ( j = 0; j < l; j++ ) {
					*dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] +
									m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l];
					m2Ptr++;
				}
				m1Ptr += 6;
			}
			break;
		}
		default: {
			// any other shared dimension: plain triple loop, accumulating in a double
			for ( i = 0; i < k; i++ ) {
				for ( j = 0; j < l; j++ ) {
					m2Ptr = m2.ToFloatPtr() + j;
					sum = m1Ptr[0] * m2Ptr[0];
					for ( n = 1; n < m1.GetNumColumns(); n++ ) {
						m2Ptr += l;
						sum += m1Ptr[n] * m2Ptr[0];
					}
					*dstPtr++ = sum;
				}
				m1Ptr += m1.GetNumColumns();
			}
			break;
		}
	}
}
/*
============
idSIMD_Generic::MatX_TransposeMultiplyMatX
optimizes the following transpose matrix multiplications:
Nx6 * NxN
6xN * 6x6
with N in the range [1-6].
============
*/
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) {
	// dst = transpose( m1 ) * m2
	//
	// The result has one row per column of m1 and one column per column of m2 and is
	// written as consecutive floats starting at dst.ToFloatPtr(). 'a' walks the
	// columns of m1 and 'b' walks the columns of m2; stepping one row down in either
	// matrix adds that matrix's column count to the pointer.
	int col1, col2, row;
	float *out;
	const float *a, *b;
	double acc;

	assert( m1.GetNumRows() == m2.GetNumRows() );

	a = m1.ToFloatPtr();
	b = m2.ToFloatPtr();
	out = dst.ToFloatPtr();

	const int cols1 = m1.GetNumColumns();
	const int cols2 = m2.GetNumColumns();
	const int rows = m1.GetNumRows();

	switch( rows ) {
		case 1:
			if ( cols1 == 6 && cols2 == 1 ) {			// 1x6 * 1x1
				for ( col1 = 0; col1 < 6; col1++ ) {
					out[col1] = a[col1] * b[0];
				}
				return;
			}
			for ( col1 = 0; col1 < cols1; col1++, a++ ) {
				b = m2.ToFloatPtr();
				for ( col2 = 0; col2 < cols2; col2++, b++ ) {
					*out++ = a[0] * b[0];
				}
			}
			break;
		case 2:
			if ( cols1 == 6 && cols2 == 2 ) {			// 2x6 * 2x2
				for ( col1 = 0; col1 < 6; col1++, a++ ) {
					for ( col2 = 0; col2 < 2; col2++ ) {
						*out++ = a[0*6] * b[0*2+col2] + a[1*6] * b[1*2+col2];
					}
				}
				return;
			}
			for ( col1 = 0; col1 < cols1; col1++, a++ ) {
				b = m2.ToFloatPtr();
				for ( col2 = 0; col2 < cols2; col2++, b++ ) {
					*out++ = a[0] * b[0] + a[cols1] * b[cols2];
				}
			}
			break;
		case 3:
			if ( cols1 == 6 && cols2 == 3 ) {			// 3x6 * 3x3
				for ( col1 = 0; col1 < 6; col1++, a++ ) {
					for ( col2 = 0; col2 < 3; col2++ ) {
						*out++ = a[0*6] * b[0*3+col2] + a[1*6] * b[1*3+col2] + a[2*6] * b[2*3+col2];
					}
				}
				return;
			}
			for ( col1 = 0; col1 < cols1; col1++, a++ ) {
				b = m2.ToFloatPtr();
				for ( col2 = 0; col2 < cols2; col2++, b++ ) {
					*out++ = a[0] * b[0] + a[cols1] * b[cols2] + a[2*cols1] * b[2*cols2];
				}
			}
			break;
		case 4:
			if ( cols1 == 6 && cols2 == 4 ) {			// 4x6 * 4x4
				for ( col1 = 0; col1 < 6; col1++, a++ ) {
					for ( col2 = 0; col2 < 4; col2++ ) {
						*out++ = a[0*6] * b[0*4+col2] + a[1*6] * b[1*4+col2] + a[2*6] * b[2*4+col2] + a[3*6] * b[3*4+col2];
					}
				}
				return;
			}
			for ( col1 = 0; col1 < cols1; col1++, a++ ) {
				b = m2.ToFloatPtr();
				for ( col2 = 0; col2 < cols2; col2++, b++ ) {
					*out++ = a[0] * b[0] + a[cols1] * b[cols2] + a[2*cols1] * b[2*cols2] +
								a[3*cols1] * b[3*cols2];
				}
			}
			break;
		case 5:
			if ( cols1 == 6 && cols2 == 5 ) {			// 5x6 * 5x5
				for ( col1 = 0; col1 < 6; col1++, a++ ) {
					for ( col2 = 0; col2 < 5; col2++ ) {
						*out++ = a[0*6] * b[0*5+col2] + a[1*6] * b[1*5+col2] + a[2*6] * b[2*5+col2] + a[3*6] * b[3*5+col2] + a[4*6] * b[4*5+col2];
					}
				}
				return;
			}
			for ( col1 = 0; col1 < cols1; col1++, a++ ) {
				b = m2.ToFloatPtr();
				for ( col2 = 0; col2 < cols2; col2++, b++ ) {
					*out++ = a[0] * b[0] + a[cols1] * b[cols2] + a[2*cols1] * b[2*cols2] +
								a[3*cols1] * b[3*cols2] + a[4*cols1] * b[4*cols2];
				}
			}
			break;
		case 6:
			if ( cols2 == 6 ) {
				// 6xN * 6x6 with compile time strides for N in 1..6
				switch( cols1 ) {
					case 1:						// 6x1 * 6x6
						b = m2.ToFloatPtr();
						for ( col2 = 0; col2 < 6; col2++, b++ ) {
							*out++ = a[0*1] * b[0*6] +
									a[1*1] * b[1*6] +
									a[2*1] * b[2*6] +
									a[3*1] * b[3*6] +
									a[4*1] * b[4*6] +
									a[5*1] * b[5*6];
						}
						return;
					case 2:						// 6x2 * 6x6
						for ( col1 = 0; col1 < 2; col1++, a++ ) {
							b = m2.ToFloatPtr();
							for ( col2 = 0; col2 < 6; col2++, b++ ) {
								*out++ = a[0*2] * b[0*6] +
										a[1*2] * b[1*6] +
										a[2*2] * b[2*6] +
										a[3*2] * b[3*6] +
										a[4*2] * b[4*6] +
										a[5*2] * b[5*6];
							}
						}
						return;
					case 3:						// 6x3 * 6x6
						for ( col1 = 0; col1 < 3; col1++, a++ ) {
							b = m2.ToFloatPtr();
							for ( col2 = 0; col2 < 6; col2++, b++ ) {
								*out++ = a[0*3] * b[0*6] +
										a[1*3] * b[1*6] +
										a[2*3] * b[2*6] +
										a[3*3] * b[3*6] +
										a[4*3] * b[4*6] +
										a[5*3] * b[5*6];
							}
						}
						return;
					case 4:						// 6x4 * 6x6
						for ( col1 = 0; col1 < 4; col1++, a++ ) {
							b = m2.ToFloatPtr();
							for ( col2 = 0; col2 < 6; col2++, b++ ) {
								*out++ = a[0*4] * b[0*6] +
										a[1*4] * b[1*6] +
										a[2*4] * b[2*6] +
										a[3*4] * b[3*6] +
										a[4*4] * b[4*6] +
										a[5*4] * b[5*6];
							}
						}
						return;
					case 5:						// 6x5 * 6x6
						for ( col1 = 0; col1 < 5; col1++, a++ ) {
							b = m2.ToFloatPtr();
							for ( col2 = 0; col2 < 6; col2++, b++ ) {
								*out++ = a[0*5] * b[0*6] +
										a[1*5] * b[1*6] +
										a[2*5] * b[2*6] +
										a[3*5] * b[3*6] +
										a[4*5] * b[4*6] +
										a[5*5] * b[5*6];
							}
						}
						return;
					case 6:						// 6x6 * 6x6
						for ( col1 = 0; col1 < 6; col1++, a++ ) {
							b = m2.ToFloatPtr();
							for ( col2 = 0; col2 < 6; col2++, b++ ) {
								*out++ = a[0*6] * b[0*6] +
										a[1*6] * b[1*6] +
										a[2*6] * b[2*6] +
										a[3*6] * b[3*6] +
										a[4*6] * b[4*6] +
										a[5*6] * b[5*6];
							}
						}
						return;
				}
			}
			// six rows with any other combination of column counts
			for ( col1 = 0; col1 < cols1; col1++, a++ ) {
				b = m2.ToFloatPtr();
				for ( col2 = 0; col2 < cols2; col2++, b++ ) {
					*out++ = a[0] * b[0] + a[cols1] * b[cols2] + a[2*cols1] * b[2*cols2] +
								a[3*cols1] * b[3*cols2] + a[4*cols1] * b[4*cols2] + a[5*cols1] * b[5*cols2];
				}
			}
			break;
		default:
			// any other row count: plain triple loop, accumulating in a double
			for ( col1 = 0; col1 < cols1; col1++ ) {
				for ( col2 = 0; col2 < cols2; col2++ ) {
					a = m1.ToFloatPtr() + col1;
					b = m2.ToFloatPtr() + col2;
					acc = a[0] * b[0];
					for ( row = 1; row < rows; row++ ) {
						a += cols1;
						b += cols2;
						acc += a[0] * b[0];
					}
					*out++ = acc;
				}
			}
			break;
	}
}
/*
============
idSIMD_Generic::MatX_LowerTriangularSolve
solves x in Lx = b for the n * n sub-matrix of L
if skip > 0 the first skip elements of x are assumed to be valid already
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
void VPCALL idSIMD_Generic::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
// Forward substitution. For every row i in [skip, n) this computes
//   x[i] = b[i] - sum( L[i][j] * x[j], j = 0 .. i-1 )
// The diagonal of L is never read. Entries x[0] .. x[skip-1] are not written here;
// they are only read as inputs for the later rows.
// The '#if 1' branch is the unrolled implementation that is compiled; the '#else'
// branch is the straightforward loop it replaces.
#if 1
int nc;
const float *lptr;
// nothing left to solve when the already valid prefix covers all n rows
if ( skip >= n ) {
return;
}
lptr = L.ToFloatPtr();
// nc is the distance in floats between two consecutive rows of L
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
// NSKIP packs n and the low three bits of skip into one switch key.
// The cases of one n deliberately fall through: entering at ( n, skip ) computes
// rows skip .. n-1 in order and the 'return' after the last row ends the group.
// NOTE(review): assumes skip >= 0; the macro is not #undef'd after use.
#define NSKIP( n, s ) ((n<<3)|(s&7))
switch( NSKIP( n, skip ) ) {
case NSKIP( 1, 0 ): x[0] = b[0];
return;
case NSKIP( 2, 0 ): x[0] = b[0];
case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
return;
case NSKIP( 3, 0 ): x[0] = b[0];
case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
return;
case NSKIP( 4, 0 ): x[0] = b[0];
case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
return;
case NSKIP( 5, 0 ): x[0] = b[0];
case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
return;
case NSKIP( 6, 0 ): x[0] = b[0];
case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
return;
case NSKIP( 7, 0 ): x[0] = b[0];
case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
return;
}
return;
}
// process first 4 rows
// The cases fall through on purpose, so rows skip .. 3 are computed and skip is
// then raised to 4. A skip of 4 or more matches no case and is left unchanged.
switch( skip ) {
case 0: x[0] = b[0];
case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
skip = 4;
}
// lptr now points at the start of row 'skip' and advances one row per iteration
lptr = L[skip];
int i, j;
// four independent partial sums of the dot product L[i][0..i-1] . x[0..i-1]
register double s0, s1, s2, s3;
for ( i = skip; i < n; i++ ) {
// columns 0..3 always exist here because i >= 4
s0 = lptr[0] * x[0];
s1 = lptr[1] * x[1];
s2 = lptr[2] * x[2];
s3 = lptr[3] * x[3];
// eight products per pass while at least eight columns remain before column i
for ( j = 4; j < i-7; j += 8 ) {
s0 += lptr[j+0] * x[j+0];
s1 += lptr[j+1] * x[j+1];
s2 += lptr[j+2] * x[j+2];
s3 += lptr[j+3] * x[j+3];
s0 += lptr[j+4] * x[j+4];
s1 += lptr[j+5] * x[j+5];
s2 += lptr[j+6] * x[j+6];
s3 += lptr[j+7] * x[j+7];
}
// The remaining i - j columns (0 to 7) are handled by falling through the cases.
// NODEFAULT is the macro from the top of this file that supplies the default label
// for the case that should never be reached.
switch( i - j ) {
NODEFAULT;
case 7: s0 += lptr[j+6] * x[j+6];
case 6: s1 += lptr[j+5] * x[j+5];
case 5: s2 += lptr[j+4] * x[j+4];
case 4: s3 += lptr[j+3] * x[j+3];
case 3: s0 += lptr[j+2] * x[j+2];
case 2: s1 += lptr[j+1] * x[j+1];
case 1: s2 += lptr[j+0] * x[j+0];
case 0: break;
}
// combine the partial sums; x[i] = b[i] - ( s0 + s1 + s2 + s3 )
double sum;
sum = s3;
sum += s2;
sum += s1;
sum += s0;
sum -= b[i];
x[i] = -sum;
lptr += nc;
}
#else
// reference implementation (not compiled): one dot product per row
int i, j;
const float *lptr;
double sum;
for ( i = skip; i < n; i++ ) {
sum = b[i];
lptr = L[i];
for ( j = 0; j < i; j++ ) {
sum -= lptr[j] * x[j];
}
x[i] = sum;
}
#endif
}
/*
============
idSIMD_Generic::MatX_LowerTriangularSolveTranspose
solves x in L'x = b for the n * n sub-matrix of L
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
void VPCALL idSIMD_Generic::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
// Back substitution with the transpose of L. For i = n-1 down to 0 this computes
//   x[i] = b[i] - sum( L[j][i] * x[j], j = i+1 .. n-1 )
// The diagonal of L is never read.
// The '#if 1' branch is the unrolled implementation that is compiled; the '#else'
// branch is the straightforward loop it replaces.
#if 1
int nc;
const float *lptr;
lptr = L.ToFloatPtr();
// nc is the distance in floats between two consecutive rows of L
nc = L.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
// each case is fully written out, solving from the last unknown to the first
switch( n ) {
case 0:
return;
case 1:
x[0] = b[0];
return;
case 2:
x[1] = b[1];
x[0] = b[0] - lptr[1*nc+0] * x[1];
return;
case 3:
x[2] = b[2];
x[1] = b[1] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 4:
x[3] = b[3];
x[2] = b[2] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 5:
x[4] = b[4];
x[3] = b[3] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 6:
x[5] = b[5];
x[4] = b[4] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
case 7:
x[6] = b[6];
x[5] = b[5] - lptr[6*nc+5] * x[6];
x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
return;
}
return;
}
int i, j;
// s0..s3 accumulate the four unknowns x[i-4] .. x[i-1] of the current group
register double s0, s1, s2, s3;
float *xptr;
// lptr addresses row i, column i-4 and xptr addresses x[i], starting with i = n.
// In each pass lptr is only dereferenced for rows i-3 .. n-1.
lptr = L.ToFloatPtr() + n * nc + n - 4;
xptr = x + n;
// process 4 rows at a time
for ( i = n; i >= 4; i -= 4 ) {
s0 = b[i-4];
s1 = b[i-3];
s2 = b[i-2];
s3 = b[i-1];
// process 4x4 blocks
// subtracts the contribution of the already solved unknowns x[i] .. x[n-1];
// n - i is always a multiple of four here
for ( j = 0; j < n-i; j += 4 ) {
s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
}
// process left over of the 4 rows
// back substitution inside the group: s3 is final first, then s2, s1 and s0;
// the negative row offsets address rows i-1, i-2 and i-3
s0 -= lptr[0-1*nc] * s3;
s1 -= lptr[1-1*nc] * s3;
s2 -= lptr[2-1*nc] * s3;
s0 -= lptr[0-2*nc] * s2;
s1 -= lptr[1-2*nc] * s2;
s0 -= lptr[0-3*nc] * s1;
// store result
xptr[-4] = s0;
xptr[-3] = s1;
xptr[-2] = s2;
xptr[-1] = s3;
// update pointers for next four rows
lptr -= 4 + 4 * nc;
xptr -= 4;
}
// process left over rows
// i is the number of rows that did not fit a group of four (0 to 3); they are
// solved one at a time, walking down column i of L
for ( i--; i >= 0; i-- ) {
s0 = b[i];
lptr = L[0] + i;
for ( j = i + 1; j < n; j++ ) {
s0 -= lptr[j*nc] * x[j];
}
x[i] = s0;
}
#else
// reference implementation (not compiled): one column dot product per unknown
int i, j, nc;
const float *ptr;
double sum;
nc = L.GetNumColumns();
for ( i = n - 1; i >= 0; i-- ) {
sum = b[i];
ptr = L[0] + i;
for ( j = i + 1; j < n; j++ ) {
sum -= ptr[j*nc] * x[j];
}
x[i] = sum;
}
#endif
}
/*
============
idSIMD_Generic::MatX_UpperTriangularSolve
solves x in Ux = b for the n * n sub-matrix of U
U has to be an upper triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
void VPCALL idSIMD_Generic::MatX_UpperTriangularSolve( const idMatX &U, float *x, const float *b, const int n ) {
// Back substitution. For i = n-1 down to 0 this computes
//   x[i] = b[i] - sum( U[i][j] * x[j], j = i+1 .. n-1 )
// The diagonal of U is never read.
// The '#if 1' branch is the unrolled implementation that is compiled; the '#else'
// branch is the straightforward loop it replaces.
#if 1
int nc;
const float *uptr;
uptr = U.ToFloatPtr();
// nc is the distance in floats between two consecutive rows of U
nc = U.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
// each case is fully written out, solving from the last unknown to the first
switch( n ) {
case 0:
return;
case 1:
x[0] = b[0];
return;
case 2:
x[1] = b[1];
x[0] = b[0] - uptr[0*nc+1] * x[1];
return;
case 3:
x[2] = b[2];
x[1] = b[1] - uptr[1*nc+2] * x[2];
x[0] = b[0] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
return;
case 4:
x[3] = b[3];
x[2] = b[2] - uptr[2*nc+3] * x[3];
x[1] = b[1] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
x[0] = b[0] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
return;
case 5:
x[4] = b[4];
x[3] = b[3] - uptr[3*nc+4] * x[4];
x[2] = b[2] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3];
x[1] = b[1] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
x[0] = b[0] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
return;
case 6:
x[5] = b[5];
x[4] = b[4] - uptr[4*nc+5] * x[5];
x[3] = b[3] - uptr[3*nc+5] * x[5] - uptr[3*nc+4] * x[4];
x[2] = b[2] - uptr[2*nc+5] * x[5] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3];
x[1] = b[1] - uptr[1*nc+5] * x[5] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
x[0] = b[0] - uptr[0*nc+5] * x[5] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
return;
case 7:
x[6] = b[6];
x[5] = b[5] - uptr[5*nc+6] * x[6];
x[4] = b[4] - uptr[4*nc+6] * x[6] - uptr[4*nc+5] * x[5];
x[3] = b[3] - uptr[3*nc+6] * x[6] - uptr[3*nc+5] * x[5] - uptr[3*nc+4] * x[4];
x[2] = b[2] - uptr[2*nc+6] * x[6] - uptr[2*nc+5] * x[5] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3];
x[1] = b[1] - uptr[1*nc+6] * x[6] - uptr[1*nc+5] * x[5] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
x[0] = b[0] - uptr[0*nc+6] * x[6] - uptr[0*nc+5] * x[5] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
return;
}
return;
}
int i, j;
// four independent partial sums of the dot product U[i][i+1..n-1] . x[i+1..n-1]
register double s0, s1, s2, s3;
// process the last four rows
x[n-1] = b[n-1];
x[n-2] = b[n-2] - uptr[(n-2)*nc+(n-1)] * x[n-1];
x[n-3] = b[n-3] - uptr[(n-3)*nc+(n-1)] * x[n-1] - uptr[(n-3)*nc+(n-2)] * x[n-2];
x[n-4] = b[n-4] - uptr[(n-4)*nc+(n-1)] * x[n-1] - uptr[(n-4)*nc+(n-2)] * x[n-2] - uptr[(n-4)*nc+(n-3)] * x[n-3];
// uptr points at the start of row i and moves up one row per iteration
uptr = U[n - 5];
for ( i = n - 5; i >= 0; i-- ) {
// columns i+1 .. i+4 always exist here because i <= n-5
s0 = uptr[i+1] * x[i+1];
s1 = uptr[i+2] * x[i+2];
s2 = uptr[i+3] * x[i+3];
s3 = uptr[i+4] * x[i+4];
// eight products per pass while at least eight columns remain before column n
for ( j = i + 5; j < n-7; j += 8 ) {
s0 += uptr[j+0] * x[j+0];
s1 += uptr[j+1] * x[j+1];
s2 += uptr[j+2] * x[j+2];
s3 += uptr[j+3] * x[j+3];
s0 += uptr[j+4] * x[j+4];
s1 += uptr[j+5] * x[j+5];
s2 += uptr[j+6] * x[j+6];
s3 += uptr[j+7] * x[j+7];
}
// The remaining n - j columns (0 to 7) are handled by falling through the cases.
// NODEFAULT is the macro from the top of this file that supplies the default label
// for the case that should never be reached.
switch( n - j ) {
NODEFAULT;
case 7: s0 += uptr[j+6] * x[j+6];
case 6: s1 += uptr[j+5] * x[j+5];
case 5: s2 += uptr[j+4] * x[j+4];
case 4: s3 += uptr[j+3] * x[j+3];
case 3: s0 += uptr[j+2] * x[j+2];
case 2: s1 += uptr[j+1] * x[j+1];
case 1: s2 += uptr[j+0] * x[j+0];
case 0: break;
}
// combine the partial sums; x[i] = b[i] - ( s0 + s1 + s2 + s3 )
double sum;
sum = s3;
sum += s2;
sum += s1;
sum += s0;
sum -= b[i];
x[i] = -sum;
uptr -= nc;
}
#else
// reference implementation (not compiled): one dot product per row
int i, j;
const float *ptr;
double sum;
for ( i = n - 1; i >= 0; i-- ) {
sum = b[i];
ptr = U[i];
for ( j = i + 1; j < n; j++ ) {
sum -= ptr[j] * x[j];
}
x[i] = sum;
}
#endif
}
/*
============
idSIMD_Generic::MatX_UpperTriangularSolveTranspose
solves x in U'x = b for the n * n sub-matrix of U
U has to be an upper triangular matrix with (implicit) ones on the diagonal
x == b is allowed
============
*/
void VPCALL idSIMD_Generic::MatX_UpperTriangularSolveTranspose( const idMatX &U, float *x, const float *b, const int n ) {
// Forward substitution with the transpose of U. For i = 0 up to n-1 this computes
//   x[i] = b[i] - sum( U[j][i] * x[j], j = 0 .. i-1 )
// The diagonal of U is never read.
// The '#if 1' branch is the unrolled implementation that is compiled; the '#else'
// branch is the straightforward loop it replaces.
#if 1
int nc;
const float *uptr;
uptr = U.ToFloatPtr();
// nc is the distance in floats between two consecutive rows of U
nc = U.GetNumColumns();
// unrolled cases for n < 8
if ( n < 8 ) {
// each case is fully written out, solving from the first unknown to the last
switch( n ) {
case 0:
return;
case 1:
x[0] = b[0];
return;
case 2:
x[0] = b[0];
x[1] = b[1] - uptr[0*nc+1] * x[0];
return;
case 3:
x[0] = b[0];
x[1] = b[1] - uptr[0*nc+1] * x[0];
x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
return;
case 4:
x[0] = b[0];
x[1] = b[1] - uptr[0*nc+1] * x[0];
x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
return;
case 5:
x[0] = b[0];
x[1] = b[1] - uptr[0*nc+1] * x[0];
x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3];
return;
case 6:
x[0] = b[0];
x[1] = b[1] - uptr[0*nc+1] * x[0];
x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3];
x[5] = b[5] - uptr[0*nc+5] * x[0] - uptr[1*nc+5] * x[1] - uptr[2*nc+5] * x[2] - uptr[3*nc+5] * x[3] - uptr[4*nc+5] * x[4];
return;
case 7:
x[0] = b[0];
x[1] = b[1] - uptr[0*nc+1] * x[0];
x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3];
x[5] = b[5] - uptr[0*nc+5] * x[0] - uptr[1*nc+5] * x[1] - uptr[2*nc+5] * x[2] - uptr[3*nc+5] * x[3] - uptr[4*nc+5] * x[4];
x[6] = b[6] - uptr[0*nc+6] * x[0] - uptr[1*nc+6] * x[1] - uptr[2*nc+6] * x[2] - uptr[3*nc+6] * x[3] - uptr[4*nc+6] * x[4] - uptr[5*nc+6] * x[5];
return;
}
return;
}
int i, j;
// s0..s3 accumulate the four unknowns x[i] .. x[i+3] of the current group
register double s0, s1, s2, s3;
// uptr addresses row 0, column i and advances four columns per group
uptr = U.ToFloatPtr();
// process 4 columns at a time
for ( i = 0; i < n - 3; i += 4 ) {
s0 = b[i+0];
s1 = b[i+1];
s2 = b[i+2];
s3 = b[i+3];
// process 4x4 blocks
// subtracts the contribution of the already solved unknowns x[0] .. x[i-1];
// i is a multiple of four, so the loop ends with j == i
for ( j = 0; j < i-3; j += 4 ) {
s0 -= uptr[(j+0)*nc+0] * x[j+0];
s1 -= uptr[(j+0)*nc+1] * x[j+0];
s2 -= uptr[(j+0)*nc+2] * x[j+0];
s3 -= uptr[(j+0)*nc+3] * x[j+0];
s0 -= uptr[(j+1)*nc+0] * x[j+1];
s1 -= uptr[(j+1)*nc+1] * x[j+1];
s2 -= uptr[(j+1)*nc+2] * x[j+1];
s3 -= uptr[(j+1)*nc+3] * x[j+1];
s0 -= uptr[(j+2)*nc+0] * x[j+2];
s1 -= uptr[(j+2)*nc+1] * x[j+2];
s2 -= uptr[(j+2)*nc+2] * x[j+2];
s3 -= uptr[(j+2)*nc+3] * x[j+2];
s0 -= uptr[(j+3)*nc+0] * x[j+3];
s1 -= uptr[(j+3)*nc+1] * x[j+3];
s2 -= uptr[(j+3)*nc+2] * x[j+3];
s3 -= uptr[(j+3)*nc+3] * x[j+3];
}
// process left over of the 4 columns
// forward substitution inside the group: s0 is final first, then s1, s2 and s3;
// with j == i these read rows i .. i+2 of the current four columns
s1 -= uptr[(j+0)*nc+1] * s0;
s2 -= uptr[(j+0)*nc+2] * s0;
s2 -= uptr[(j+1)*nc+2] * s1;
s3 -= uptr[(j+0)*nc+3] * s0;
s3 -= uptr[(j+1)*nc+3] * s1;
s3 -= uptr[(j+2)*nc+3] * s2;
// store result
x[i+0] = s0;
x[i+1] = s1;
x[i+2] = s2;
x[i+3] = s3;
// update pointer for next four columns
uptr += 4;
}
// process left over columns
// at most three unknowns remain; they are solved one at a time, walking down
// column i of U
for ( ; i < n; i++ ) {
s0 = b[i];
uptr = U[0] + i;
for ( j = 0; j < i; j++ ) {
s0 -= uptr[j*nc] * x[j];
}
x[i] = s0;
}
#else
// reference implementation (not compiled): one column dot product per unknown
int i, j, nc;
const float *uptr;
double sum;
nc = U.GetNumColumns();
for ( i = 0; i < n; i++ ) {
sum = b[i];
uptr = U.ToFloatPtr() + i;
for ( j = 0; j < i; j++ ) {
sum -= uptr[j*nc] * x[j];
}
x[i] = sum;
}
#endif
}
/*
============
idSIMD_Generic::MatX_LU_Factor
in-place factorization LU of the n * n sub-matrix of mat
the reciprocal of the diagonal elements of U are stored in invDiag
no pivoting is used
============
*/
bool VPCALL idSIMD_Generic::MatX_LU_Factor( idMatX &mat, idVecX &invDiag, const int n ) {
	// In-place elimination without pivoting on the n * n sub-matrix of mat.
	// For every pivot the reciprocal of the diagonal element is stored in invDiag,
	// each row below the pivot gets its multiplier stored in the pivot column, and
	// that multiple of the pivot row is subtracted from the rest of the row.
	// Returns false as soon as a pivot equals zero, true otherwise.
	// The '#if 1' branch is the unrolled implementation that is compiled; the '#else'
	// branch is the straightforward loop it replaces.
#if 1
	for ( int pivot = 0; pivot < n; pivot++ ) {
		float *pivotRow = mat[pivot];
		float scale = pivotRow[pivot];

		if ( scale == 0.0f ) {
			return false;
		}

		scale = 1.0f / scale;
		invDiag[pivot] = scale;

		for ( int r = pivot + 1; r < n; r++ ) {
			float *row = mat[r];
			const float factor = row[pivot] * scale;
			int c;

			row[pivot] = factor;

			// sixteen columns per pass while at least sixteen remain
			for ( c = pivot + 1; c < n - 15; c += 16 ) {
				row[c+0] -= factor * pivotRow[c+0];
				row[c+1] -= factor * pivotRow[c+1];
				row[c+2] -= factor * pivotRow[c+2];
				row[c+3] -= factor * pivotRow[c+3];
				row[c+4] -= factor * pivotRow[c+4];
				row[c+5] -= factor * pivotRow[c+5];
				row[c+6] -= factor * pivotRow[c+6];
				row[c+7] -= factor * pivotRow[c+7];
				row[c+8] -= factor * pivotRow[c+8];
				row[c+9] -= factor * pivotRow[c+9];
				row[c+10] -= factor * pivotRow[c+10];
				row[c+11] -= factor * pivotRow[c+11];
				row[c+12] -= factor * pivotRow[c+12];
				row[c+13] -= factor * pivotRow[c+13];
				row[c+14] -= factor * pivotRow[c+14];
				row[c+15] -= factor * pivotRow[c+15];
			}

			// the remaining n - c columns (0 to 15) are handled by falling through the cases
			switch( n - c ) {
				NODEFAULT;
				case 15: row[c+14] -= factor * pivotRow[c+14];
				case 14: row[c+13] -= factor * pivotRow[c+13];
				case 13: row[c+12] -= factor * pivotRow[c+12];
				case 12: row[c+11] -= factor * pivotRow[c+11];
				case 11: row[c+10] -= factor * pivotRow[c+10];
				case 10: row[c+9] -= factor * pivotRow[c+9];
				case 9: row[c+8] -= factor * pivotRow[c+8];
				case 8: row[c+7] -= factor * pivotRow[c+7];
				case 7: row[c+6] -= factor * pivotRow[c+6];
				case 6: row[c+5] -= factor * pivotRow[c+5];
				case 5: row[c+4] -= factor * pivotRow[c+4];
				case 4: row[c+3] -= factor * pivotRow[c+3];
				case 3: row[c+2] -= factor * pivotRow[c+2];
				case 2: row[c+1] -= factor * pivotRow[c+1];
				case 1: row[c+0] -= factor * pivotRow[c+0];
				case 0: break;
			}
		}
	}

	return true;
#else
	// reference implementation (not compiled)
	int i, j, k;
	float d;

	for ( i = 0; i < n; i++ ) {

		if ( mat[i][i] == 0.0f ) {
			return false;
		}
		invDiag[i] = d = 1.0f / mat[i][i];

		for ( j = i + 1; j < n; j++ ) {
			mat[j][i] *= d;
		}

		for ( j = i + 1; j < n; j++ ) {
			d = mat[j][i];
			for ( k = i + 1; k < n; k++ ) {
				mat[j][k] -= d * mat[i][k];
			}
		}
	}

	return true;
#endif
}
/*
============
idSIMD_Generic::MatX_LDLT_Factor
in-place factorization LDL' of the n * n sub-matrix of mat
the reciprocal of the diagonal elements are stored in invDiag
============
*/
bool VPCALL idSIMD_Generic::MatX_LDLT_Factor( idMatX &mat, idVecX &invDiag, const int n ) {
// Factors the leading n * n part of 'mat' in place and writes 1 / D[i] to invDiag[i].
// Returns true on success (including n <= 0, where nothing is done) and false as soon
// as a diagonal element of D evaluates to exactly zero.
#if 1
// Active implementation: rows/columns 0..3 are peeled off by hand, the remaining rows are
// processed by a loop whose dot products are spread over four partial sums (s0..s3).
int i, j, k, nc;
float *v, *diag, *mptr;
// partial sums and the pivot are accumulated in double precision
double s0, s1, s2, s3, sum, d;
// scratch buffers: v[k] holds diag[k] * L[i][k] for the current row i,
// diag[k] holds the diagonal elements computed so far
v = (float *) _alloca16( n * sizeof( float ) );
diag = (float *) _alloca16( n * sizeof( float ) );
// nc is used as the distance in floats between consecutive rows
// NOTE(review): assumes the rows of 'mat' are stored contiguously, nc floats apart - confirm against idMatX
nc = mat.GetNumColumns();
if ( n <= 0 ) {
return true;
}
// ---- column 0 ----
// the first diagonal element is used as stored, mat[0][0] is not rewritten
mptr = mat[0];
sum = mptr[0];
if ( sum == 0.0f ) {
return false;
}
diag[0] = sum;
invDiag[0] = d = 1.0f / sum;
if ( n <= 1 ) {
return true;
}
// scale the entries below the diagonal in column 0
mptr = mat[0];
for ( j = 1; j < n; j++ ) {
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
}
// ---- column 1 ----
mptr = mat[1];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
sum = mptr[1] - s0;
if ( sum == 0.0f ) {
return false;
}
mat[1][1] = sum;
diag[1] = sum;
invDiag[1] = d = 1.0f / sum;
if ( n <= 2 ) {
return true;
}
// update the entries below the diagonal in column 1
mptr = mat[0];
for ( j = 2; j < n; j++ ) {
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
}
// ---- column 2 ----
mptr = mat[2];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
sum = mptr[2] - s0 - s1;
if ( sum == 0.0f ) {
return false;
}
mat[2][2] = sum;
diag[2] = sum;
invDiag[2] = d = 1.0f / sum;
if ( n <= 3 ) {
return true;
}
// update the entries below the diagonal in column 2
mptr = mat[0];
for ( j = 3; j < n; j++ ) {
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
}
// ---- column 3 ----
mptr = mat[3];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
sum = mptr[3] - s0 - s1 - s2;
if ( sum == 0.0f ) {
return false;
}
mat[3][3] = sum;
diag[3] = sum;
invDiag[3] = d = 1.0f / sum;
if ( n <= 4 ) {
return true;
}
// update the entries below the diagonal in column 3
mptr = mat[0];
for ( j = 4; j < n; j++ ) {
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
}
// ---- columns 4 .. n-1 ----
for ( i = 4; i < n; i++ ) {
mptr = mat[i];
// build v[0..i-1] for row i and accumulate the sum of v[k] * L[i][k] in four partial sums
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
for ( k = 4; k < i-3; k += 4 ) {
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
}
// handle the 0..3 elements left over by the loop above; the cases fall through on purpose
switch( i - k ) {
NODEFAULT;
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
case 0: break;
}
// combine the partial sums and form the new diagonal element
sum = s3;
sum += s2;
sum += s1;
sum += s0;
sum = mptr[i] - sum;
if ( sum == 0.0f ) {
return false;
}
mat[i][i] = sum;
diag[i] = sum;
invDiag[i] = d = 1.0f / sum;
if ( i + 1 >= n ) {
return true;
}
// update column i of every row below the diagonal; mptr steps one row (nc floats) per iteration
mptr = mat[i+1];
for ( j = i+1; j < n; j++ ) {
s0 = mptr[0] * v[0];
s1 = mptr[1] * v[1];
s2 = mptr[2] * v[2];
s3 = mptr[3] * v[3];
for ( k = 4; k < i-7; k += 8 ) {
s0 += mptr[k+0] * v[k+0];
s1 += mptr[k+1] * v[k+1];
s2 += mptr[k+2] * v[k+2];
s3 += mptr[k+3] * v[k+3];
s0 += mptr[k+4] * v[k+4];
s1 += mptr[k+5] * v[k+5];
s2 += mptr[k+6] * v[k+6];
s3 += mptr[k+7] * v[k+7];
}
// handle the 0..7 elements left over by the loop above; the cases fall through on purpose
switch( i - k ) {
NODEFAULT;
case 7: s0 += mptr[k+6] * v[k+6];
case 6: s1 += mptr[k+5] * v[k+5];
case 5: s2 += mptr[k+4] * v[k+4];
case 4: s3 += mptr[k+3] * v[k+3];
case 3: s0 += mptr[k+2] * v[k+2];
case 2: s1 += mptr[k+1] * v[k+1];
case 1: s2 += mptr[k+0] * v[k+0];
case 0: break;
}
sum = s3;
sum += s2;
sum += s1;
sum += s0;
mptr[i] = ( mptr[i] - sum ) * d;
mptr += nc;
}
}
return true;
#else
// Compiled-out straightforward version of the same factorization, without unrolling.
int i, j, k, nc;
float *v, *ptr, *diagPtr;
double d, sum;
v = (float *) _alloca16( n * sizeof( float ) );
nc = mat.GetNumColumns();
for ( i = 0; i < n; i++ ) {
ptr = mat[i];
// diagPtr walks down the diagonal: each step advances one row plus one column
diagPtr = mat[0];
sum = ptr[i];
for ( j = 0; j < i; j++ ) {
d = ptr[j];
v[j] = diagPtr[0] * d;
sum -= v[j] * d;
diagPtr += nc + 1;
}
if ( sum == 0.0f ) {
return false;
}
diagPtr[0] = sum;
invDiag[i] = d = 1.0f / sum;
if ( i + 1 >= n ) {
continue;
}
// update column i of the rows below the diagonal
ptr = mat[i+1];
for ( j = i + 1; j < n; j++ ) {
sum = ptr[i];
for ( k = 0; k < i; k++ ) {
sum -= ptr[k] * v[k];
}
ptr[i] = sum * d;
ptr += nc;
}
}
return true;
#endif
}
/*
============
idSIMD_Generic::DecompressJoints
============
*/
void VPCALL idSIMD_Generic::DecompressJoints( idJointQuat *joints, const idCompressedJointQuat *compressedJoints, const int *index, const int numJoints ) {
	// only the joints listed in 'index' are expanded; source and destination share the same joint number
	int n = 0;
	while ( n < numJoints ) {
		const int jointNum = index[n];
		const idCompressedJointQuat &packed = compressedJoints[jointNum];
		idJointQuat &unpacked = joints[jointNum];

		unpacked.q = packed.ToQuat();
		unpacked.t = packed.ToOffset();
		unpacked.w = 0.0f;
		n++;
	}
}
/*
============
SlerpUnoptimized
============
*/
void SlerpUnoptimized( const idQuat &from, const idQuat &to, float t, idQuat &result ) {
	float weightFrom, weightTo;

	// cosine of the angle between the two quaternions
	const float dot = from.x * to.x + from.y * to.y + from.z * to.z + from.w * to.w;
	const float absDot = fabs( dot );

	if ( ( 1.0f - absDot ) > 1e-6f ) {
		// regular spherical interpolation
		const float angle = acos( absDot );
		const float invSin = 1.0f / sin( angle );
		weightFrom = sin( ( 1.0f - t ) * angle ) * invSin;
		weightTo = sin( t * angle ) * invSin;
	} else {
		// the quaternions are (almost) parallel, fall back to linear weights
		weightFrom = 1.0f - t;
		weightTo = t;
	}

	// flip the second weight when the dot product is not >= 0
	if ( !( dot >= 0.0f ) ) {
		weightTo = -weightTo;
	}

	result.x = weightFrom * from.x + weightTo * to.x;
	result.y = weightFrom * from.y + weightTo * to.y;
	result.z = weightFrom * from.z + weightTo * to.z;
	result.w = weightFrom * from.w + weightTo * to.w;
}
/*
============
SlerpOptimized
============
*/
void SlerpOptimized( const idQuat &from, const idQuat &to, float t, idQuat &result ) {
	// Spherical interpolation from 'from' (t = 0) to 'to' (t = 1), written to 'result'.
	// The reciprocal square root, arc tangent and sine are replaced by the approximations below.
	float cosom, absCosom, sinom, sinSqr, omega, scale0, scale1;

	cosom = from.x * to.x + from.y * to.y + from.z * to.z + from.w * to.w;
	absCosom = fabs( cosom );
	if ( ( 1.0f - absCosom ) > 1e-6f ) {
		sinSqr = 1.0f - cosom * cosom;

		// sinom = 1.0f / sqrt( sinSqr );
		{
			// The float bits are copied into a 32-bit int with memcpy. Reading them through a
			// 'long *' reads 8 bytes from a 4 byte float where long is 64 bits wide.
			int i;
			float y, r;

			y = sinSqr * 0.5f;
			memcpy( &i, &sinSqr, sizeof( i ) );
			i = 0x5f3759df - ( i >> 1 );		// initial guess from the exponent/mantissa bits
			memcpy( &r, &i, sizeof( r ) );
			sinom = r * ( 1.5f - r * r * y );	// one Newton-Raphson refinement step
		}

		// omega = atan2( sinSqr * sinom, absCosom );
		{
			float y, a, d, s;

			y = sinSqr * sinom;
			if ( y > absCosom ) {
				// keep the polynomial argument within [-1, 1] by using the complementary angle
				a = -absCosom / y;
				d = idMath::HALF_PI;
			} else {
				a = y / absCosom;
				d = 0.0f;
			}
			s = a * a;
			omega = ( ( ( ( ( ( ( ( ( 0.0028662257f * s - 0.0161657367f ) * s + 0.0429096138f ) * s - 0.0752896400f )
						* s + 0.1065626393f ) * s - 0.1420889944f ) * s + 0.1999355085f ) * s - 0.3333314528f ) * s ) + 1.0f ) * a + d;
		}

		// scale0 = sin( ( 1.0f - t ) * omega ) * sinom;
		{
			float a = ( 1.0f - t ) * omega;
			float s = a * a;
			scale0 = sinom * a * ( ( ( ( ( -2.39e-08f * s + 2.7526e-06f ) * s - 1.98409e-04f ) * s + 8.3333315e-03f ) * s - 1.666666664e-01f ) * s + 1.0f );
		}

		// scale1 = sin( t * omega ) * sinom;
		{
			float a = t * omega;
			float s = a * a;
			scale1 = sinom * a * ( ( ( ( ( -2.39e-08f * s + 2.7526e-06f ) * s - 1.98409e-04f ) * s + 8.3333315e-03f ) * s - 1.666666664e-01f ) * s + 1.0f );
		}
	} else {
		// the quaternions are (almost) parallel, use linear weights
		scale0 = 1.0f - t;
		scale1 = t;
	}

	// negate the second weight when the dot product is negative
	scale1 = ( cosom >= 0.0f ) ? scale1 : -scale1;

	result.x = scale0 * from.x + scale1 * to.x;
	result.y = scale0 * from.y + scale1 * to.y;
	result.z = scale0 * from.z + scale1 * to.z;
	result.w = scale0 * from.w + scale1 * to.w;
}
/*
============
LerpUnoptimized
============
*/
void LerpUnoptimized( const idQuat &from, const idQuat &to, float t, idQuat &result ) {
	// linear blend of the two quaternions followed by a renormalization
	const float dot = from.x * to.x + from.y * to.y + from.z * to.z + from.w * to.w;
	const float weightFrom = 1.0f - t;
	float weightTo = t;

	// flip the second weight when the dot product is not >= 0
	if ( !( dot >= 0.0f ) ) {
		weightTo = -t;
	}

	result.x = weightFrom * from.x + weightTo * to.x;
	result.y = weightFrom * from.y + weightTo * to.y;
	result.z = weightFrom * from.z + weightTo * to.z;
	result.w = weightFrom * from.w + weightTo * to.w;

	const float invLength = 1.0f / sqrt( result.x * result.x + result.y * result.y + result.z * result.z + result.w * result.w );

	result.x *= invLength;
	result.y *= invLength;
	result.z *= invLength;
	result.w *= invLength;
}
/*
============
idSIMD_Generic::BlendJoints
============
*/
void VPCALL idSIMD_Generic::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
	// blend every joint listed in 'index' towards the joint with the same number in blendJoints
	for ( int n = 0; n < numJoints; n++ ) {
		const int jointNum = index[n];
		idJointQuat &dst = joints[jointNum];
		const idJointQuat &src = blendJoints[jointNum];

		dst.q.Slerp( dst.q, src.q, lerp );
		dst.t.Lerp( dst.t, src.t, lerp );
	}
}
/*
============
idSIMD_Generic::BlendJointsFast
============
*/
void VPCALL idSIMD_Generic::BlendJointsFast( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
	// same as BlendJoints but the rotation is blended with idQuat::SlerpFast
	const int *cur = index;
	for ( int remaining = numJoints; remaining > 0; remaining--, cur++ ) {
		idJointQuat &dst = joints[*cur];
		const idJointQuat &src = blendJoints[*cur];

		dst.q.SlerpFast( dst.q, src.q, lerp );
		dst.t.Lerp( dst.t, src.t, lerp );
	}
}
/*
============
idSIMD_Generic::ConvertJointQuatsToJointMats
============
*/
void VPCALL idSIMD_Generic::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
#if 1
	// the translation is read through the quaternion's float pointer, so 't' has to follow 'q' directly
	compile_time_assert( (UINT_PTR)(&((idJointQuat *)0)->t) == (UINT_PTR)(&((idJointQuat *)0)->q) + (UINT_PTR)sizeof( ((idJointQuat *)0)->q ) );

	for ( int jointNum = 0; jointNum < numJoints; jointNum++ ) {
		const float *in = jointQuats[jointNum].q.ToFloatPtr();
		float *out = jointMats[jointNum].ToFloatPtr();

		// quaternion components followed by the translation
		const float x = in[0];
		const float y = in[1];
		const float z = in[2];
		const float w = in[3];
		const float tx = in[4];
		const float ty = in[5];
		const float tz = in[6];

		const float twoX = x + x;
		const float twoY = y + y;
		const float twoZ = z + z;

		const float xx = x * twoX;
		const float yy = y * twoY;
		const float zz = z * twoZ;
		const float yz = y * twoZ;
		const float wx = w * twoX;
		const float xy = x * twoY;
		const float wz = w * twoZ;
		const float xz = x * twoZ;
		const float wy = w * twoY;

		// each row holds three rotation elements and one translation element
		out[0*4+0] = 1.0f - yy - zz;
		out[0*4+1] = xy + wz;
		out[0*4+2] = xz - wy;
		out[0*4+3] = tx;

		out[1*4+0] = xy - wz;
		out[1*4+1] = 1.0f - xx - zz;
		out[1*4+2] = yz + wx;
		out[1*4+3] = ty;

		out[2*4+0] = xz + wy;
		out[2*4+1] = yz - wx;
		out[2*4+2] = 1.0f - xx - yy;
		out[2*4+3] = tz;
	}
#else
	for ( int jointNum = 0; jointNum < numJoints; jointNum++ ) {
		idJointMat &dst = jointMats[jointNum];
		const idJointQuat &src = jointQuats[jointNum];

		dst.SetRotation( src.q.ToMat3() );
		dst.SetTranslation( src.t );
	}
#endif
}
/*
============
idSIMD_Generic::ConvertJointMatsToJointQuats
============
*/
void VPCALL idSIMD_Generic::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
#if 1
	// the translation is written through the quaternion's float pointer, so 't' has to follow 'q' directly
	compile_time_assert( (UINT_PTR)(&((idJointQuat *)0)->t) == (UINT_PTR)(&((idJointQuat *)0)->q) + (UINT_PTR)sizeof( ((idJointQuat *)0)->q ) );

	for ( int jointNum = 0; jointNum < numJoints; jointNum++ ) {
		float *out = jointQuats[jointNum].q.ToFloatPtr();
		const float *in = jointMats[jointNum].ToFloatPtr();

		// rotation elements, named by row and column
		const float m00 = in[0 * 4 + 0];
		const float m01 = in[0 * 4 + 1];
		const float m02 = in[0 * 4 + 2];
		const float m10 = in[1 * 4 + 0];
		const float m11 = in[1 * 4 + 1];
		const float m12 = in[1 * 4 + 2];
		const float m20 = in[2 * 4 + 0];
		const float m21 = in[2 * 4 + 1];
		const float m22 = in[2 * 4 + 2];

		// translation elements
		const float tx = in[0 * 4 + 3];
		const float ty = in[1 * 4 + 3];
		const float tz = in[2 * 4 + 3];

		float t, s;

		if ( m00 + m11 + m22 > 0.0f ) {
			// positive trace: w is the dominant component
			t = + m00 + m11 + m22 + 1.0f;
			s = idMath::InvSqrt( t ) * 0.5f;
			out[0] = ( m12 - m21 ) * s;
			out[1] = ( m20 - m02 ) * s;
			out[2] = ( m01 - m10 ) * s;
			out[3] = s * t;
		} else if ( m00 > m11 && m00 > m22 ) {
			// x is the dominant component
			t = + m00 - m11 - m22 + 1.0f;
			s = idMath::InvSqrt( t ) * 0.5f;
			out[0] = s * t;
			out[1] = ( m01 + m10 ) * s;
			out[2] = ( m20 + m02 ) * s;
			out[3] = ( m12 - m21 ) * s;
		} else if ( m11 > m22 ) {
			// y is the dominant component
			t = - m00 + m11 - m22 + 1.0f;
			s = idMath::InvSqrt( t ) * 0.5f;
			out[0] = ( m01 + m10 ) * s;
			out[1] = s * t;
			out[2] = ( m12 + m21 ) * s;
			out[3] = ( m20 - m02 ) * s;
		} else {
			// z is the dominant component
			t = - m00 - m11 + m22 + 1.0f;
			s = idMath::InvSqrt( t ) * 0.5f;
			out[0] = ( m20 + m02 ) * s;
			out[1] = ( m12 + m21 ) * s;
			out[2] = s * t;
			out[3] = ( m01 - m10 ) * s;
		}

		// translation, followed by a zeroed eighth float
		out[4] = tx;
		out[5] = ty;
		out[6] = tz;
		out[7] = 0.0f;
	}
#else
	for ( int jointNum = 0; jointNum < numJoints; jointNum++ ) {
		jointQuats[jointNum] = jointMats[jointNum].ToJointQuat();
	}
#endif
}
/*
============
idSIMD_Generic::TransformJoints
============
*/
void VPCALL idSIMD_Generic::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	// walk forwards so that a parent has been transformed before any of its children
	for ( int jointNum = firstJoint; jointNum <= lastJoint; jointNum++ ) {
		assert( parents[jointNum] < jointNum );
		idJointMat &joint = jointMats[jointNum];
		joint *= jointMats[parents[jointNum]];
	}
}
/*
============
idSIMD_Generic::UntransformJoints
============
*/
void VPCALL idSIMD_Generic::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	// walk backwards so that a child is processed while its parent is still unchanged
	int jointNum = lastJoint;
	while ( jointNum >= firstJoint ) {
		assert( parents[jointNum] < jointNum );
		idJointMat &joint = jointMats[jointNum];
		joint /= jointMats[parents[jointNum]];
		jointNum--;
	}
}
/*
============
idSIMD_Generic::MultiplyJoints
============
*/
void VPCALL idSIMD_Generic::MultiplyJoints( idJointMat *result, const idJointMat *joints1, const idJointMat *joints2, const int numJoints ) {
	// element-wise product of the two joint arrays
	for ( int remaining = numJoints; remaining > 0; remaining-- ) {
		idJointMat::Multiply( *result, *joints1, *joints2 );
		result++;
		joints1++;
		joints2++;
	}
}
/*
============
idSIMD_Generic::TransformVerts
============
*/
void VPCALL idSIMD_Generic::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, int numWeights ) {
	// jointMatOffset is a byte offset into the joint array
	const byte *jointBytes = (byte *)joints;
	int weightNum = 0;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		idVec3 accum;

		accum = ( *(idJointMat *) ( jointBytes + weights[weightNum].jointMatOffset ) ) * base[weightNum];

		// keep accumulating until the weight that marks the end of this vertex
		for ( ; weights[weightNum].nextVertexOffset != JOINTWEIGHT_SIZE; ) {
			weightNum++;
			accum += ( *(idJointMat *) ( jointBytes + weights[weightNum].jointMatOffset ) ) * base[weightNum];
		}

		verts[vertNum].xyz = accum;
		weightNum++;
	}
}
/*
============
idSIMD_Generic::TransformShadowVerts
============
*/
void VPCALL idSIMD_Generic::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const jointWeight_t *weights, const int numWeights ) {
	// only the first weight of every vertex is used; nextVertexOffset is the byte distance to the next vertex's weights
	const byte *jointBytes = (byte *)joints;
	const byte *weightBytes = (byte *)weights;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const jointWeight_t *weight = (jointWeight_t *)weightBytes;
		const idJointMat &mat = *(idJointMat *) ( jointBytes + weight->jointMatOffset );

		weightBytes += weight->nextVertexOffset;
		mat.Mul( verts[vertNum].xyz, base[vertNum].xyz );
	}
}
/*
============
idSIMD_Generic::TransformShadowVerts
============
*/
void VPCALL idSIMD_Generic::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int numWeights ) {
	// every vertex has a single entry in 'weights': the byte offset of its joint matrix
	const byte *jointBytes = (byte *)joints;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const idJointMat &mat = *(idJointMat *) ( jointBytes + weights[vertNum] );
		mat.Mul( verts[vertNum].xyz, base[vertNum].xyz );
	}
}
/*
============
idSIMD_Generic::TransformShadowVerts
============
*/
void VPCALL idSIMD_Generic::TransformShadowVerts( shadowCache_t *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int numWeights ) {
	// every vertex has a single entry in 'weights': the byte offset of its joint matrix
	const byte *jointBytes = (byte *)joints;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const idJointMat &mat = *(idJointMat *) ( jointBytes + weights[vertNum] );
		mat.Mul( verts[vertNum].xyz.ToVec3(), base[vertNum].xyz );
	}
}
/*
============
idSIMD_Generic::TransformVertsAndTangents
============
*/
void VPCALL idSIMD_Generic::TransformVertsAndTangents( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
	// jointMatOffset is a byte offset into the joint array
	const byte *jointBytes = (byte *)joints;
	int weightNum = 0;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		idJointMat blended;

		// build the weighted sum of all joint matrices that influence this vertex
		idJointMat::Mul( blended, *(idJointMat *) ( jointBytes + weights[weightNum].jointMatOffset ), weights[weightNum].weight );
		for ( ; weights[weightNum].nextVertexOffset != JOINTWEIGHT_SIZE; ) {
			weightNum++;
			idJointMat::Mad( blended, *(idJointMat *) ( jointBytes + weights[weightNum].jointMatOffset ), weights[weightNum].weight );
		}
		weightNum++;

		// 'base' stores three vectors per vertex: position, normal and tangent
		const idVec4 *src = base + vertNum * 3;
		idDrawVert &dst = verts[vertNum];

		dst.xyz = blended * src[0];
		dst.SetNormal( blended * src[1] );
		dst.SetTangent( blended * src[2] );
	}
}
/*
============
idSIMD_Generic::TransformVertsAndTangentsFast
============
*/
void VPCALL idSIMD_Generic::TransformVertsAndTangentsFast( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) {
	// only the first weight of every vertex is used; nextVertexOffset is the byte distance to the next vertex's weights
	const byte *jointBytes = (byte *)joints;
	const byte *weightBytes = (byte *)weights;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const jointWeight_t *weight = (jointWeight_t *)weightBytes;
		const idJointMat &mat = *(idJointMat *) ( jointBytes + weight->jointMatOffset );
		weightBytes += weight->nextVertexOffset;

		// 'base' stores three vectors per vertex: position, normal and tangent
		const idVec4 *src = base + vertNum * 3;
		idDrawVert &dst = verts[vertNum];

		dst.xyz = mat * src[0];
		dst.SetNormal( mat * src[1] );
		dst.SetTangent( mat * src[2] );
	}
}
#if SD_SUPPORT_UNSMOOTHEDTANGENTS
/*
============
idSIMD_Generic::DeriveUnsmoothedTangents
Derives the normal and orthogonal tangent vectors for the triangle vertices.
For each vertex the normal and tangent vectors are derived from a single dominant triangle.
============
*/
#define DERIVE_UNSMOOTHED_BITANGENT
void VPCALL idSIMD_Generic::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
// For every vertex i the triangle ( i, dominantTris[i].v2, dominantTris[i].v3 ) is used to
// compute a normal, tangent and bitangent which are stored back into vertex i.
int i;
for ( i = 0; i < numVerts; i++ ) {
idDrawVert *a, *b, *c;
#if defined( SD_USE_DRAWVERT_SIZE_32 )
// in this configuration the texture coordinates are fetched through GetST()
idVec2 aST, bST, cST;
#endif
// d0-d4: position and texture coordinate deltas from a to b
float d0, d1, d2, d3, d4;
// d5-d9: position and texture coordinate deltas from a to c
float d5, d6, d7, d8, d9;
// scale factors read from the dominant triangle
float s0, s1, s2;
// normal
float n0, n1, n2;
// tangent
float t0, t1, t2;
// bitangent
float t3, t4, t5;
const dominantTri_s &dt = dominantTris[i];
a = verts + i;
b = verts + dt.v2;
c = verts + dt.v3;
#if defined( SD_USE_DRAWVERT_SIZE_32 )
aST = a->GetST();
bST = b->GetST();
cST = c->GetST();
#endif
// edge a -> b
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
#if defined( SD_USE_DRAWVERT_SIZE_32 )
d3 = bST[0] - aST[0];
d4 = bST[1] - aST[1];
#else
d3 = b->_st[0] - a->_st[0];
d4 = b->_st[1] - a->_st[1];
#endif
// edge a -> c
d5 = c->xyz[0] - a->xyz[0];
d6 = c->xyz[1] - a->xyz[1];
d7 = c->xyz[2] - a->xyz[2];
#if defined( SD_USE_DRAWVERT_SIZE_32 )
d8 = cST[0] - aST[0];
d9 = cST[1] - aST[1];
#else
d8 = c->_st[0] - a->_st[0];
d9 = c->_st[1] - a->_st[1];
#endif
s0 = dt.normalizationScale[0];
s1 = dt.normalizationScale[1];
s2 = dt.normalizationScale[2];
// normal: cross product of the two edges, scaled by s2
n0 = s2 * ( d6 * d2 - d7 * d1 );
n1 = s2 * ( d7 * d0 - d5 * d2 );
n2 = s2 * ( d5 * d1 - d6 * d0 );
// tangent: combination of the two edges weighted by the second texture coordinate deltas, scaled by s0
t0 = s0 * ( d0 * d9 - d4 * d5 );
t1 = s0 * ( d1 * d9 - d4 * d6 );
t2 = s0 * ( d2 * d9 - d4 * d7 );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
// bitangent: combination of the two edges weighted by the first texture coordinate deltas, scaled by s1
t3 = s1 * ( d3 * d5 - d0 * d8 );
t4 = s1 * ( d3 * d6 - d1 * d8 );
t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
// bitangent: cross product of the normal and tangent computed above, scaled by s1
t3 = s1 * ( n2 * t1 - n1 * t2 );
t4 = s1 * ( n0 * t2 - n2 * t0 );
t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif
a->SetNormal( n0, n1, n2 );
a->SetTangent( t0, t1, t2 );
a->SetBiTangent( t3, t4, t5 );
}
}
#endif // SD_SUPPORT_UNSMOOTHEDTANGENTS
/*
============
idSIMD_Generic::TracePointCull
============
*/
void VPCALL idSIMD_Generic::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	byte accumulated = 0;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const idVec3 &point = verts[vertNum].xyz;
		float dist[4];
		int p;

		// distance of the vertex to each of the four planes
		for ( p = 0; p < 4; p++ ) {
			dist[p] = planes[p].Distance( point );
		}

		// bits 0-3: sign of ( distance + radius ), bits 4-7: sign of ( distance - radius )
		byte bits = 0;
		for ( p = 0; p < 4; p++ ) {
			float outer = dist[p] + radius;
			float inner = dist[p] - radius;
			bits |= FLOATSIGNBITSET( outer ) << p;
			bits |= FLOATSIGNBITSET( inner ) << ( p + 4 );
		}

		bits ^= 0x0F;		// flip lower four bits

		accumulated |= bits;
		cullBits[vertNum] = bits;
	}

	totalOr = accumulated;
}
/*
============
idSIMD_Generic::TracePointCullShadowVerts
============
*/
void VPCALL idSIMD_Generic::TracePointCullShadowVerts( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const shadowCache_t *verts, const int numVerts ) {
	byte accumulated = 0;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const idVec3 &point = verts[vertNum].xyz.ToVec3();
		float dist[4];
		int p;

		// distance of the vertex to each of the four planes
		for ( p = 0; p < 4; p++ ) {
			dist[p] = planes[p].Distance( point );
		}

		// bits 0-3: sign of ( distance + radius ), bits 4-7: sign of ( distance - radius )
		byte bits = 0;
		for ( p = 0; p < 4; p++ ) {
			float outer = dist[p] + radius;
			float inner = dist[p] - radius;
			bits |= FLOATSIGNBITSET( outer ) << p;
			bits |= FLOATSIGNBITSET( inner ) << ( p + 4 );
		}

		bits ^= 0x0F;		// flip lower four bits

		accumulated |= bits;
		cullBits[vertNum] = bits;
	}

	totalOr = accumulated;
}
/*
============
idSIMD_Generic::DecalPointCull
============
*/
void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const idVec3 &point = verts[vertNum].xyz;
		byte bits = 0;

		// one sign bit per plane for the six planes
		for ( int p = 0; p < 6; p++ ) {
			float dist = planes[p].Distance( point );
			bits |= FLOATSIGNBITSET( dist ) << p;
		}

		cullBits[vertNum] = bits ^ 0x3F;		// flip lower 6 bits
	}
}
void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts, int *indexes, int numIndexes ) {
	// only the vertices referenced by 'indexes' are tested; the result is stored at the vertex number
	for ( int n = 0; n < numIndexes; n++ ) {
		const int vertNum = indexes[n];
		const idVec3 &point = verts[vertNum].xyz;
		byte bits = 0;

		// one sign bit per plane for the six planes
		for ( int p = 0; p < 6; p++ ) {
			float dist = planes[p].Distance( point );
			bits |= FLOATSIGNBITSET( dist ) << p;
		}

		cullBits[vertNum] = bits ^ 0x3F;		// flip lower 6 bits
	}
}
void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts, unsigned short *indexes, int numIndexes ) {
	// only the vertices referenced by 'indexes' are tested; the result is stored at the vertex number
	for ( int n = 0; n < numIndexes; n++ ) {
		const int vertNum = indexes[n];
		const idVec3 &point = verts[vertNum].xyz;
		byte bits = 0;

		// one sign bit per plane for the six planes
		for ( int p = 0; p < 6; p++ ) {
			float dist = planes[p].Distance( point );
			bits |= FLOATSIGNBITSET( dist ) << p;
		}

		cullBits[vertNum] = bits ^ 0x3F;		// flip lower 6 bits
	}
}
/*
============
idSIMD_Generic::OverlayPointCull
============
*/
void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const idVec3 &point = verts[vertNum].xyz;
		idVec2 &st = texCoords[vertNum];

		// the plane distances double as the texture coordinates
		float s = planes[0].Distance( point );
		st[0] = s;
		float t = planes[1].Distance( point );
		st[1] = t;

		float oneMinusS = 1.0f - s;
		float oneMinusT = 1.0f - t;

		// bits 0-1: coordinate is negative, bits 2-3: coordinate is larger than one
		byte bits = FLOATSIGNBITSET( s ) << 0;
		bits |= FLOATSIGNBITSET( t ) << 1;
		bits |= FLOATSIGNBITSET( oneMinusS ) << 2;
		bits |= FLOATSIGNBITSET( oneMinusT ) << 3;

		cullBits[vertNum] = bits;
	}
}
/*
============
idSIMD_Generic::OverlayPointCull
============
*/
void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const shadowCache_t *verts, const int numVerts ) {
	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const idVec3 &point = verts[vertNum].xyz.ToVec3();
		idVec2 &st = texCoords[vertNum];

		// the plane distances double as the texture coordinates
		float s = planes[0].Distance( point );
		st[0] = s;
		float t = planes[1].Distance( point );
		st[1] = t;

		float oneMinusS = 1.0f - s;
		float oneMinusT = 1.0f - t;

		// bits 0-1: coordinate is negative, bits 2-3: coordinate is larger than one
		byte bits = FLOATSIGNBITSET( s ) << 0;
		bits |= FLOATSIGNBITSET( t ) << 1;
		bits |= FLOATSIGNBITSET( oneMinusS ) << 2;
		bits |= FLOATSIGNBITSET( oneMinusT ) << 3;

		cullBits[vertNum] = bits;
	}
}
/*
============
idSIMD_Generic::DeriveTriPlanes
Derives a plane equation for each triangle.
============
*/
void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const vertIndex_t *indexes, const int numIndexes ) {
	idPlane *out = planes;

	// three indexes per triangle, one plane per triangle
	for ( int first = 0; first < numIndexes; first += 3, out++ ) {
		const idDrawVert &v0 = verts[indexes[first + 0]];
		const idDrawVert &v1 = verts[indexes[first + 1]];
		const idDrawVert &v2 = verts[indexes[first + 2]];

		// the two triangle edges that start at the first vertex
		const float e0x = v1.xyz[0] - v0.xyz[0];
		const float e0y = v1.xyz[1] - v0.xyz[1];
		const float e0z = v1.xyz[2] - v0.xyz[2];

		const float e1x = v2.xyz[0] - v0.xyz[0];
		const float e1y = v2.xyz[1] - v0.xyz[1];
		const float e1z = v2.xyz[2] - v0.xyz[2];

		// cross product of the second edge with the first
		idVec3 normal;
		normal[0] = e1y * e0z - e1z * e0y;
		normal[1] = e1z * e0x - e1x * e0z;
		normal[2] = e1x * e0y - e1y * e0x;

		float invLength;
#if defined( OPTIMIZED_TRI_PLANE_CODE )
		invLength = idMath::RSqrt( normal.x * normal.x + normal.y * normal.y + normal.z * normal.z );
#else
		invLength = 1.0f / sqrt( normal.x * normal.x + normal.y * normal.y + normal.z * normal.z );
#endif

		normal.x *= invLength;
		normal.y *= invLength;
		normal.z *= invLength;

		out->SetNormal( normal );
		out->FitThroughPoint( v0.xyz );
	}
}
/*
============
idSIMD_Generic::DeriveTriPlanes
Derives a plane equation for each triangle.
============
*/
void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const shadowCache_t *verts, const int numVerts, const vertIndex_t *indexes, const int numIndexes ) {
	idPlane *out = planes;

	// three indexes per triangle, one plane per triangle
	for ( int first = 0; first < numIndexes; first += 3, out++ ) {
		const shadowCache_t &v0 = verts[indexes[first + 0]];
		const shadowCache_t &v1 = verts[indexes[first + 1]];
		const shadowCache_t &v2 = verts[indexes[first + 2]];

		// the two triangle edges that start at the first vertex
		const float e0x = v1.xyz[0] - v0.xyz[0];
		const float e0y = v1.xyz[1] - v0.xyz[1];
		const float e0z = v1.xyz[2] - v0.xyz[2];

		const float e1x = v2.xyz[0] - v0.xyz[0];
		const float e1y = v2.xyz[1] - v0.xyz[1];
		const float e1z = v2.xyz[2] - v0.xyz[2];

		// cross product of the second edge with the first
		idVec3 normal;
		normal[0] = e1y * e0z - e1z * e0y;
		normal[1] = e1z * e0x - e1x * e0z;
		normal[2] = e1x * e0y - e1y * e0x;

		float invLength;
#if defined( OPTIMIZED_TRI_PLANE_CODE )
		invLength = idMath::RSqrt( normal.x * normal.x + normal.y * normal.y + normal.z * normal.z );
#else
		invLength = 1.0f / sqrt( normal.x * normal.x + normal.y * normal.y + normal.z * normal.z );
#endif

		normal.x *= invLength;
		normal.y *= invLength;
		normal.z *= invLength;

		out->SetNormal( normal );
		out->FitThroughPoint( v0.xyz.ToVec3() );
	}
}
/*
============
idSIMD_Generic::CalculateFacing
============
*/
void VPCALL idSIMD_Generic::CalculateFacing( byte *facing, const idPlane *planes, const int numTriangles, const idVec4 &light ) {
	// a triangle faces the light when the light is on the positive side of its plane
	for ( int tri = 0; tri < numTriangles; tri++ ) {
		const idPlane &plane = planes[tri];
		facing[tri] = ( plane[0] * light.x +
						plane[1] * light.y +
						plane[2] * light.z +
						plane[3] * light.w ) > 0.0f;
	}

	// one extra entry past the last triangle
	facing[numTriangles] = 1;	// for dangling edges to reference
}
/*
============
idSIMD_Generic::CalculateCullBits
============
*/
void VPCALL idSIMD_Generic::CalculateCullBits( byte *cullBits, const idDrawVert *verts, const int numVerts, const int frontBits, const idPlane lightPlanes[NUM_LIGHT_PLANES] ) {
	// one bit per light plane has to fit in a single cull byte
	assert( NUM_LIGHT_PLANES <= sizeof( cullBits[0] ) * 8 );

	memset( cullBits, 0, numVerts * sizeof( cullBits[0] ) );

	for ( int planeNum = 0; planeNum < NUM_LIGHT_PLANES; planeNum++ ) {
		// planes flagged in frontBits are skipped, their bit stays zero for every vertex
		if ( ( frontBits & ( 1 << planeNum ) ) == 0 ) {
			const idPlane &plane = lightPlanes[planeNum];

			for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
				const idVec3 &point = verts[vertNum].xyz;
				int behind = plane[0] * point.x +
							 plane[1] * point.y +
							 plane[2] * point.z +
							 plane[3] < 0.0f;
				cullBits[vertNum] |= behind << planeNum;
			}
		}
	}
}
/*
============
idSIMD_Generic::CreateShadowCache
============
*/
int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
	// every vertex is written twice: once with w = 1 and once with w = 0
	idVec4 *out = vertexCache;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++, out += 2 ) {
		const float *pos = verts[vertNum].xyz.ToFloatPtr();

		for ( int c = 0; c < 3; c++ ) {
			out[0][c] = pos[c];
			out[1][c] = pos[c];
		}
		out[0][3] = 1.0f;
		out[1][3] = 0.0f;
	}

	return numVerts * 2;
}
/*
============
idSIMD_Generic::CreateShadowCache
============
*/
int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, const struct shadowCache_s *verts, const int numVerts ) {
	// every vertex is written twice: once with w = 1 and once with w = 0
	idVec4 *out = vertexCache;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++, out += 2 ) {
		const float *pos = verts[vertNum].xyz.ToFloatPtr();

		for ( int c = 0; c < 3; c++ ) {
			out[0][c] = pos[c];
			out[1][c] = pos[c];
		}
		out[0][3] = 1.0f;
		out[1][3] = 0.0f;
	}

	return numVerts * 2;
}
/*
============
idSIMD_Generic::ShadowVolume_CountFacing
============
*/
int VPCALL idSIMD_Generic::ShadowVolume_CountFacing( const byte *facing, const int numFaces ) {
	int total = 0;

#if defined( OPTIMIZED_SHADOW_VOLUME_CODE )
	// branch-free: the facing bytes are summed directly
	for ( int face = 0; face < numFaces; face++ ) {
		total += facing[face];
	}
#else
	// count the non-zero facing bytes
	for ( int face = 0; face < numFaces; face++ ) {
		total += ( facing[face] ) ? 1 : 0;
	}
#endif

	return total;
}
/*
============
idSIMD_Generic::ShadowVolume_CountFacingCull
============
*/
int VPCALL idSIMD_Generic::ShadowVolume_CountFacingCull( byte *facing, const int numFaces, const vertIndex_t *indexes, const byte *cull ) {
	int total = 0;
	const vertIndex_t *tri = indexes;

#if defined( OPTIMIZED_SHADOW_VOLUME_CODE )
	for ( int face = 0; face < numFaces; face++, tri += 3 ) {
		// cull bits shared by all three vertices of the triangle
		const int shared = cull[tri[0]] & cull[tri[1]] & cull[tri[2]];

		// ( -shared ) >> 31 extracts the sign of -shared: the face is marked facing when any bit is shared
		facing[face] |= ( ( -shared ) >> 31 ) & 1;
		total += facing[face];
	}
#else
	for ( int face = 0; face < numFaces; face++, tri += 3 ) {
		if ( facing[face] ) {
			total++;
			continue;
		}

		const int i1 = tri[0];
		const int i2 = tri[1];
		const int i3 = tri[2];

		// mark the face as facing when all three vertices share a cull bit
		if ( cull[i1] & cull[i2] & cull[i3] ) {
			facing[face] = 1;
			total++;
		}
	}
#endif

	return total;
}
/*
============
idSIMD_Generic::ShadowVolume_CreateSilTriangles
============
*/
int VPCALL idSIMD_Generic::ShadowVolume_CreateSilTriangles( vertIndex_t *shadowIndexes, const byte *facing, const silEdge_t *silEdges, const int numSilEdges ) {
	vertIndex_t *out = shadowIndexes;

#if defined( OPTIMIZED_SHADOW_VOLUME_CODE )
	for ( int edgeNum = 0; edgeNum < numSilEdges; edgeNum++ ) {
		const silEdge_t &edge = silEdges[edgeNum];
		const int facing1 = facing[edge.p1];
		const int facing2 = facing[edge.p2];

		// only edges between faces with different facing produce triangles
		if ( facing1 != facing2 ) {
			const int v1 = edge.v1;
			const int v2 = edge.v2;

			// set the two triangle winding orders based on facing
			// without using a poorly-predictable branch
			out[0] = v1;
			out[1] = v2 ^ facing2;
			out[2] = v2 ^ facing1;
			out[3] = v1 ^ facing2;
			out[4] = v2 ^ 1;
			out[5] = v1 ^ facing1;
			out += 6;
		}
	}
#else
	for ( int edgeNum = 0; edgeNum < numSilEdges; edgeNum++ ) {
		const silEdge_t &edge = silEdges[edgeNum];
		const byte facing1 = facing[edge.p1];
		const byte facing2 = facing[edge.p2];

		// only edges between faces with different facing produce triangles
		if ( facing1 == facing2 ) {
			continue;
		}

		const int v1 = edge.v1;
		const int v2 = edge.v2;

		// the winding order depends on which side of the edge is facing
		out[0] = v1;
		if ( facing1 ) {
			out[1] = v2 + 1;
			out[2] = v2;
			out[3] = v1;
			out[4] = v1 + 1;
			out[5] = v2 + 1;
		} else {
			out[1] = v2;
			out[2] = v2 + 1;
			out[3] = v1 + 1;
			out[4] = v1;
			out[5] = v2 + 1;
		}
		out += 6;
	}
#endif

	// number of indexes written
	return out - shadowIndexes;
}
/*
============
idSIMD_Generic::ShadowVolume_CreateSilTrianglesParallel
============
*/
int VPCALL idSIMD_Generic::ShadowVolume_CreateSilTrianglesParallel( vertIndex_t *shadowIndexes, const byte *facing, const silEdge_t *silEdges, const int numSilEdges ) {
	vertIndex_t *out = shadowIndexes;

#if defined( OPTIMIZED_SHADOW_VOLUME_CODE )
	for ( int edgeNum = 0; edgeNum < numSilEdges; edgeNum++ ) {
		const silEdge_t &edge = silEdges[edgeNum];
		const int facing1 = facing[edge.p1];
		const int facing2 = facing[edge.p2];

		// only edges between faces with different facing produce a triangle
		if ( facing1 != facing2 ) {
			const int v1 = edge.v1;
			const int v2 = edge.v2;

			// set the triangle winding order based on facing
			// without using a poorly-predictable branch
			out[0] = v1;
			out[1] = ( v2 & -facing1 ) + ( facing1 ^ 1 );
			out[2] = ( v2 & -facing2 ) + ( facing2 ^ 1 );
			out += 3;
		}
	}
#else
	for ( int edgeNum = 0; edgeNum < numSilEdges; edgeNum++ ) {
		const silEdge_t &edge = silEdges[edgeNum];
		const byte facing1 = facing[edge.p1];
		const byte facing2 = facing[edge.p2];

		// only edges between faces with different facing produce a triangle
		if ( facing1 == facing2 ) {
			continue;
		}

		const int v1 = edge.v1;
		const int v2 = edge.v2;

		// the third corner is always index 1; the winding order depends on the facing
		out[0] = v1;
		if ( facing1 ) {
			out[1] = 1;
			out[2] = v2;
		} else {
			out[1] = v2;
			out[2] = 1;
		}
		out += 3;
	}
#endif

	// number of indexes written
	return out - shadowIndexes;
}
/*
============
idSIMD_Generic::ShadowVolume_CreateCapTriangles
============
*/
int VPCALL idSIMD_Generic::ShadowVolume_CreateCapTriangles( vertIndex_t *shadowIndexes, const byte *facing, const vertIndex_t *indexes, const int numIndexes ) {
	vertIndex_t *out = shadowIndexes;
	int triNum = 0;

	for ( int first = 0; first < numIndexes; first += 3, triNum++ ) {
		// caps are only created for triangles that are not facing
		if ( !facing[triNum] ) {
			const int a = indexes[first + 0] * 2;
			const int b = indexes[first + 1] * 2;
			const int c = indexes[first + 2] * 2;

			// first cap uses the even vertices
			out[0] = a;
			out[1] = b;
			out[2] = c;

			// second cap uses the odd vertices in reversed order
			out[3] = c + 1;
			out[4] = b + 1;
			out[5] = a + 1;

			out += 6;
		}
	}

	// number of indexes written
	return out - shadowIndexes;
}
/*
============
idSIMD_Generic::ShadowVolume_CreateCapTrianglesParallel
============
*/
int VPCALL idSIMD_Generic::ShadowVolume_CreateCapTrianglesParallel( vertIndex_t *shadowIndexes, const byte *facing, const vertIndex_t *indexes, const int numIndexes ) {
	vertIndex_t *out = shadowIndexes;
	int triNum = 0;

	for ( int first = 0; first < numIndexes; first += 3, triNum++ ) {
		// a single cap triangle is created for every triangle that is not facing
		if ( !facing[triNum] ) {
			const vertIndex_t *tri = indexes + first;

			out[0] = tri[0] * 2;
			out[1] = tri[1] * 2;
			out[2] = tri[2] * 2;
			out += 3;
		}
	}

	// number of indexes written
	return out - shadowIndexes;
}
/*
============
idSIMD_Generic::UpSamplePCMTo44kHz
Duplicate samples for 44kHz output.
============
*/
void idSIMD_Generic::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
	// Converts short PCM samples to floats while repeating them to reach 44100 Hz:
	// 11025 Hz input is written 4 times, 22050 Hz input 2 times and 44100 Hz input once.
	// With numChannels == 1 every sample is repeated on its own; with any other channel
	// count the input is consumed as interleaved pairs and each pair is repeated as a unit.
	// numSamples counts individual input samples. Any other rate triggers the assert.
	float *out = dest;
	int i;

	switch ( kHz ) {
		case 11025: {
			if ( numChannels == 1 ) {
				for ( i = 0; i < numSamples; i++, out += 4 ) {
					const float sample = (float) src[i+0];
					out[0] = out[1] = out[2] = out[3] = sample;
				}
			} else {
				for ( i = 0; i < numSamples; i += 2, out += 8 ) {
					const float first = (float) src[i+0];
					const float second = (float) src[i+1];
					out[0] = out[2] = out[4] = out[6] = first;
					out[1] = out[3] = out[5] = out[7] = second;
				}
			}
			break;
		}
		case 22050: {
			if ( numChannels == 1 ) {
				for ( i = 0; i < numSamples; i++, out += 2 ) {
					const float sample = (float) src[i+0];
					out[0] = out[1] = sample;
				}
			} else {
				for ( i = 0; i < numSamples; i += 2, out += 4 ) {
					const float first = (float) src[i+0];
					const float second = (float) src[i+1];
					out[0] = out[2] = first;
					out[1] = out[3] = second;
				}
			}
			break;
		}
		case 44100: {
			// already at the output rate: plain conversion, independent of the channel count
			for ( i = 0; i < numSamples; i++ ) {
				out[i] = (float) src[i];
			}
			break;
		}
		default: {
			assert( 0 );
			break;
		}
	}
}
/*
============
idSIMD_Generic::UpSampleOGGTo44kHz
Duplicate samples for 44kHz output.
============
*/
void idSIMD_Generic::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
	// Scales the per-channel float arrays in ogg by 32768 and repeats the values to reach
	// 44100 Hz: 11025 Hz input is written 4 times, 22050 Hz input 2 times and 44100 Hz
	// input once. With numChannels == 1 only ogg[0] is read, numSamples values of it.
	// Otherwise numSamples >> 1 values are read from each of ogg[0] and ogg[1] and
	// written interleaved. Any other rate triggers the assert.
	const float scale = 32768.0f;
	const int numPairs = numSamples >> 1;	// only used by the two channel paths
	float *out = dest;
	int i;

	switch ( kHz ) {
		case 11025: {
			if ( numChannels == 1 ) {
				for ( i = 0; i < numSamples; i++, out += 4 ) {
					out[0] = out[1] = out[2] = out[3] = ogg[0][i] * scale;
				}
			} else {
				for ( i = 0; i < numPairs; i++, out += 8 ) {
					out[0] = out[2] = out[4] = out[6] = ogg[0][i] * scale;
					out[1] = out[3] = out[5] = out[7] = ogg[1][i] * scale;
				}
			}
			break;
		}
		case 22050: {
			if ( numChannels == 1 ) {
				for ( i = 0; i < numSamples; i++, out += 2 ) {
					out[0] = out[1] = ogg[0][i] * scale;
				}
			} else {
				for ( i = 0; i < numPairs; i++, out += 4 ) {
					out[0] = out[2] = ogg[0][i] * scale;
					out[1] = out[3] = ogg[1][i] * scale;
				}
			}
			break;
		}
		case 44100: {
			if ( numChannels == 1 ) {
				for ( i = 0; i < numSamples; i++ ) {
					out[i] = ogg[0][i] * scale;
				}
			} else {
				for ( i = 0; i < numPairs; i++, out += 2 ) {
					out[0] = ogg[0][i] * scale;
					out[1] = ogg[1][i] * scale;
				}
			}
			break;
		}
		default: {
			assert( 0 );
			break;
		}
	}
}
/*
============
idSIMD_Generic::MixSoundTwoSpeakerMono
============
*/
void VPCALL idSIMD_Generic::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
	// Adds a mono source into an interleaved two channel mix buffer. Each channel's
	// volume starts at lastV[] and moves towards currentV[] in MIXBUFFER_SAMPLES equal
	// steps, one step per frame. numSamples is only checked by the assert; the loop
	// always processes MIXBUFFER_SAMPLES frames.
	float leftVolume = lastV[0];
	float rightVolume = lastV[1];
	const float leftStep = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	const float rightStep = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
		// the same input sample feeds both output channels
		frame[0] += samples[i] * leftVolume;
		frame[1] += samples[i] * rightVolume;

		leftVolume += leftStep;
		rightVolume += rightStep;
		frame += 2;
	}
}
/*
============
idSIMD_Generic::MixSoundTwoSpeakerStereo
============
*/
void VPCALL idSIMD_Generic::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
	// Adds an interleaved two channel source into an interleaved two channel mix buffer.
	// Each channel's volume starts at lastV[] and moves towards currentV[] in
	// MIXBUFFER_SAMPLES equal steps, one step per frame. numSamples is only checked by
	// the assert; the loop always processes MIXBUFFER_SAMPLES frames.
	float leftVolume = lastV[0];
	float rightVolume = lastV[1];
	const float leftStep = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	const float rightStep = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	const float *in = samples;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) {
		frame[0] += in[0] * leftVolume;
		frame[1] += in[1] * rightVolume;

		leftVolume += leftStep;
		rightVolume += rightStep;
		frame += 2;
		in += 2;
	}
}
/*
============
idSIMD_Generic::MixSoundFourSpeakerMono
============
*/
void VPCALL idSIMD_Generic::MixSoundFourSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
	// Adds a mono source into an interleaved four channel mix buffer. Mix buffer channels
	// 0..3 take their volumes from entries 0, 1, 4 and 5 of lastV[] / currentV[]; entries
	// 2 and 3 are not used. Each volume starts at lastV[] and moves towards currentV[] in
	// MIXBUFFER_SAMPLES equal steps. numSamples is only checked by the assert.
	const int NUM_SPEAKERS = 4;
	static const int volumeIndex[NUM_SPEAKERS] = { 0, 1, 4, 5 };

	float volume[NUM_SPEAKERS];
	float volumeStep[NUM_SPEAKERS];
	int speaker;

	for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
		const int v = volumeIndex[speaker];
		volume[speaker] = lastV[v];
		volumeStep[speaker] = ( currentV[v] - lastV[v] ) / MIXBUFFER_SAMPLES;
	}

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++, frame += NUM_SPEAKERS ) {
		for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
			frame[speaker] += samples[i] * volume[speaker];
			volume[speaker] += volumeStep[speaker];
		}
	}
}
/*
============
idSIMD_Generic::MixSoundFourSpeakerStereo
============
*/
void VPCALL idSIMD_Generic::MixSoundFourSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
	// Adds an interleaved two channel source into an interleaved four channel mix buffer.
	// Mix buffer channels 0 and 2 read source channel 0, channels 1 and 3 read source
	// channel 1. The volumes come from entries 0, 1, 4 and 5 of lastV[] / currentV[] and
	// move from lastV[] towards currentV[] in MIXBUFFER_SAMPLES equal steps.
	// numSamples is only checked by the assert.
	const int NUM_SPEAKERS = 4;
	static const int volumeIndex[NUM_SPEAKERS] = { 0, 1, 4, 5 };
	static const int sourceChannel[NUM_SPEAKERS] = { 0, 1, 0, 1 };

	float volume[NUM_SPEAKERS];
	float volumeStep[NUM_SPEAKERS];
	int speaker;

	for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
		const int v = volumeIndex[speaker];
		volume[speaker] = lastV[v];
		volumeStep[speaker] = ( currentV[v] - lastV[v] ) / MIXBUFFER_SAMPLES;
	}

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++, frame += NUM_SPEAKERS ) {
		for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
			frame[speaker] += samples[i*2+sourceChannel[speaker]] * volume[speaker];
			volume[speaker] += volumeStep[speaker];
		}
	}
}
/*
============
idSIMD_Generic::MixSoundSixSpeakerMono
============
*/
void VPCALL idSIMD_Generic::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
	// Adds a mono source into an interleaved six channel mix buffer. The volume of
	// channel n starts at lastV[n] and moves towards currentV[n] in MIXBUFFER_SAMPLES
	// equal steps, one step per frame. numSamples is only checked by the assert; the
	// loop always processes MIXBUFFER_SAMPLES frames.
	const int NUM_SPEAKERS = 6;

	float volume[NUM_SPEAKERS];
	float volumeStep[NUM_SPEAKERS];
	int speaker;

	for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
		volume[speaker] = lastV[speaker];
		volumeStep[speaker] = ( currentV[speaker] - lastV[speaker] ) / MIXBUFFER_SAMPLES;
	}

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++, frame += NUM_SPEAKERS ) {
		for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
			frame[speaker] += samples[i] * volume[speaker];
			volume[speaker] += volumeStep[speaker];
		}
	}
}
/*
============
idSIMD_Generic::MixSoundSixSpeakerStereo
============
*/
void VPCALL idSIMD_Generic::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
	// Adds an interleaved two channel source into an interleaved six channel mix buffer.
	// Mix buffer channels 1 and 5 read source channel 1; channels 0, 2, 3 and 4 read
	// source channel 0. The volume of channel n starts at lastV[n] and moves towards
	// currentV[n] in MIXBUFFER_SAMPLES equal steps. numSamples is only checked by the assert.
	const int NUM_SPEAKERS = 6;
	static const int sourceChannel[NUM_SPEAKERS] = { 0, 1, 0, 0, 0, 1 };

	float volume[NUM_SPEAKERS];
	float volumeStep[NUM_SPEAKERS];
	int speaker;

	for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
		volume[speaker] = lastV[speaker];
		volumeStep[speaker] = ( currentV[speaker] - lastV[speaker] ) / MIXBUFFER_SAMPLES;
	}

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++, frame += NUM_SPEAKERS ) {
		for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
			frame[speaker] += samples[i*2+sourceChannel[speaker]] * volume[speaker];
			volume[speaker] += volumeStep[speaker];
		}
	}
}
/*
============
idSIMD_Generic::MixSoundEightSpeakerMono
============
*/
void VPCALL idSIMD_Generic::MixSoundEightSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[8], const float currentV[8] ) {
	// Adds a mono source into an interleaved eight channel mix buffer. The volume of
	// channel n starts at lastV[n] and moves towards currentV[n] in MIXBUFFER_SAMPLES
	// equal steps, one step per frame. numSamples is only checked by the assert; the
	// loop always processes MIXBUFFER_SAMPLES frames.
	const int NUM_SPEAKERS = 8;

	float volume[NUM_SPEAKERS];
	float volumeStep[NUM_SPEAKERS];
	int speaker;

	for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
		volume[speaker] = lastV[speaker];
		volumeStep[speaker] = ( currentV[speaker] - lastV[speaker] ) / MIXBUFFER_SAMPLES;
	}

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++, frame += NUM_SPEAKERS ) {
		for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
			frame[speaker] += samples[i] * volume[speaker];
			volume[speaker] += volumeStep[speaker];
		}
	}
}
/*
============
idSIMD_Generic::MixSoundEightSpeakerStereo
============
*/
void VPCALL idSIMD_Generic::MixSoundEightSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[8], const float currentV[8] ) {
	// Adds an interleaved two channel source into an interleaved eight channel mix buffer.
	// Mix buffer channels 1, 5 and 7 read source channel 1; channels 0, 2, 3, 4 and 6 read
	// source channel 0. The volume of channel n starts at lastV[n] and moves towards
	// currentV[n] in MIXBUFFER_SAMPLES equal steps. numSamples is only checked by the assert.
	const int NUM_SPEAKERS = 8;
	static const int sourceChannel[NUM_SPEAKERS] = { 0, 1, 0, 0, 0, 1, 0, 1 };

	float volume[NUM_SPEAKERS];
	float volumeStep[NUM_SPEAKERS];
	int speaker;

	for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
		volume[speaker] = lastV[speaker];
		volumeStep[speaker] = ( currentV[speaker] - lastV[speaker] ) / MIXBUFFER_SAMPLES;
	}

	assert( numSamples == MIXBUFFER_SAMPLES );

	float *frame = mixBuffer;
	for ( int i = 0; i < MIXBUFFER_SAMPLES; i++, frame += NUM_SPEAKERS ) {
		for ( speaker = 0; speaker < NUM_SPEAKERS; speaker++ ) {
			frame[speaker] += samples[i*2+sourceChannel[speaker]] * volume[speaker];
			volume[speaker] += volumeStep[speaker];
		}
	}
}
/*
============
idSIMD_Generic::MixedSoundToSamples
============
*/
void VPCALL idSIMD_Generic::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
	// Converts numSamples floats from mixBuffer to shorts. Values at or above 32767
	// become 32767, values at or below -32768 become -32768, and everything in between
	// is converted with a plain float to short cast.
	for ( int i = 0; i < numSamples; i++ ) {
		const float value = mixBuffer[i];
		short clamped;

		if ( value >= 32767.0f ) {
			clamped = 32767;
		} else if ( value <= -32768.0f ) {
			clamped = -32768;
		} else {
			clamped = (short) value;
		}
		samples[i] = clamped;
	}
}
#pragma warning( default : 4244 )