// Copyright (C) 2007 Id Software, Inc. // #include "../precompiled.h" #pragma hdrstop #include "Simd_Generic.h" #pragma warning( disable : 4244 ) //=============================================================== // // Generic implementation of idSIMDProcessor // //=============================================================== #ifdef _DEBUG #define NODEFAULT default: assert( 0 ) #elif _WIN32 #define NODEFAULT default: __assume( 0 ) #else #define NODEFAULT #endif #define UNROLL1(Y) { int _IX; for (_IX=0;_IX constant; ============ */ void VPCALL idSIMD_Generic::CmpGT( byte *dst, const float *src0, const float constant, const int count ) { #define OPER(X) dst[(X)] = src0[(X)] > constant; UNROLL4(OPER) #undef OPER } /* ============ idSIMD_Generic::CmpGT dst[i] |= ( src0[i] > constant ) << bitNum; ============ */ void VPCALL idSIMD_Generic::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { #define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum; UNROLL4(OPER) #undef OPER } /* ============ idSIMD_Generic::CmpGE dst[i] = src0[i] >= constant; ============ */ void VPCALL idSIMD_Generic::CmpGE( byte *dst, const float *src0, const float constant, const int count ) { #define OPER(X) dst[(X)] = src0[(X)] >= constant; UNROLL4(OPER) #undef OPER } /* ============ idSIMD_Generic::CmpGE dst[i] |= ( src0[i] >= constant ) << bitNum; ============ */ void VPCALL idSIMD_Generic::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { #define OPER(X) dst[(X)] |= ( src0[(X)] >= constant ) << bitNum; UNROLL4(OPER) #undef OPER } /* ============ idSIMD_Generic::CmpLT dst[i] = src0[i] < constant; ============ */ void VPCALL idSIMD_Generic::CmpLT( byte *dst, const float *src0, const float constant, const int count ) { #define OPER(X) dst[(X)] = src0[(X)] < constant; UNROLL4(OPER) #undef OPER } /* ============ idSIMD_Generic::CmpLT dst[i] |= ( src0[i] < constant ) << bitNum; ============ */ void 
VPCALL idSIMD_Generic::SetCmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// NOTE(review): the banner comment in front of this function is labelled CmpLT and
	// shows "|=", but the code here is SetCmpLT and uses plain "=": each dst element is
	// overwritten with the comparison result shifted to bit position bitNum.
	// UNROLL4 is defined outside this excerpt; presumably it expands OPER once for every
	// index in [0, count) -- TODO confirm against the macro definition.
#define OPER(X) dst[(X)] = ( src0[(X)] < constant ) << bitNum;
	UNROLL4(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;

  ORs the comparison bit into dst[i]; the other bits of dst[i] are kept.
============
*/
void VPCALL idSIMD_Generic::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
	UNROLL4(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::CmpLE

  dst[i] = src0[i] <= constant;

  Stores the boolean result (0 or 1) of the comparison in dst[i].
============
*/
void VPCALL idSIMD_Generic::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
#define OPER(X) dst[(X)] = src0[(X)] <= constant;
	UNROLL4(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::CmpLE

  dst[i] |= ( src0[i] <= constant ) << bitNum;

  ORs the comparison bit into dst[i]; the other bits of dst[i] are kept.
============
*/
void VPCALL idSIMD_Generic::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
	UNROLL4(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::MinMax

  Scalar version: min and max are reset to +INFINITY / -INFINITY and then
  tightened against every src element that UNROLL1 visits.
  (UNROLL1 is defined outside this excerpt; presumably it visits indices
  0 .. count-1 -- TODO confirm.)
============
*/
void VPCALL idSIMD_Generic::MinMax( float &min, float &max, const float *src, const int count ) {
	min = idMath::INFINITY; max = -idMath::INFINITY;
#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::MinMax

  idVec2 version: the minimum and maximum are tracked per component,
  starting from +INFINITY / -INFINITY.
============
*/
void VPCALL idSIMD_Generic::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
	min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::MinMax
============
*/
void VPCALL idSIMD_Generic::MinMax(
idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) { min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY; #define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; } UNROLL1(OPER) #undef OPER } /* ============ idSIMD_Generic::MinMax ============ */ void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) { min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY; #define OPER(X) const idVec3 &v = src[(X)].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; } UNROLL1(OPER) #undef OPER } /* ============ idSIMD_Generic::MinMax ============ */ void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const vertIndex_t *indexes, const int count ) { min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY; #define OPER(X) const idVec3 &v = src[indexes[(X)]].xyz; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; } UNROLL1(OPER) #undef OPER } /* ============ idSIMD_Generic::MinMax ============ */ void VPCALL idSIMD_Generic::MinMax( idVec3 &min, idVec3 &max, const struct shadowCache_s *src, const int count ) { min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY; #define OPER(X) const idVec3 &v = src[(X)].xyz.ToVec3(); if ( v[0] < min[0] ) { min[0] = v[0]; } if 
( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; } UNROLL1(OPER) #undef OPER } /* ============ idSIMD_Generic::Clamp ============ */ void VPCALL idSIMD_Generic::Clamp( float *dst, const float *src, const float min, const float max, const int count ) { #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)]; UNROLL1(OPER) #undef OPER } /* ============ idSIMD_Generic::ClampMin ============ */ void VPCALL idSIMD_Generic::ClampMin( float *dst, const float *src, const float min, const int count ) { #define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)]; UNROLL1(OPER) #undef OPER } /* ============ idSIMD_Generic::ClampMax ============ */ void VPCALL idSIMD_Generic::ClampMax( float *dst, const float *src, const float max, const int count ) { #define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)]; UNROLL1(OPER) #undef OPER } /* ================ idSIMD_Generic::Memcpy ================ */ void VPCALL idSIMD_Generic::Memcpy( void *dst, const void *src, const int count ) { memcpy( dst, src, count ); } /* ================ idSIMD_Generic::Memset ================ */ void VPCALL idSIMD_Generic::Memset( void *dst, const int val, const int count ) { memset( dst, val, count ); } /* ============ idSIMD_Generic::Zero16 ============ */ void VPCALL idSIMD_Generic::Zero16( float *dst, const int count ) { memset( dst, 0, count * sizeof( float ) ); } /* ============ idSIMD_Generic::Negate16 ============ */ void VPCALL idSIMD_Generic::Negate16( float *dst, const int count ) { unsigned int *ptr = reinterpret_cast(dst); #define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit UNROLL1(OPER) #undef OPER } /* ============ idSIMD_Generic::Copy16 ============ */ void VPCALL idSIMD_Generic::Copy16( float *dst, const float *src, const int count ) { #define OPER(X) dst[(X)] = src[(X)] UNROLL1(OPER) #undef OPER 
}

/*
============
idSIMD_Generic::Add16

  dst[i] = src1[i] + src2[i];
============
*/
void VPCALL idSIMD_Generic::Add16( float *dst, const float *src1, const float *src2, const int count ) {
	// OPER has no trailing semicolon here; presumably UNROLL1 (defined outside this
	// excerpt) supplies it -- TODO confirm.
#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::Sub16

  dst[i] = src1[i] - src2[i];
============
*/
void VPCALL idSIMD_Generic::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::Mul16

  dst[i] = src1[i] * constant;
============
*/
void VPCALL idSIMD_Generic::Mul16( float *dst, const float *src1, const float constant, const int count ) {
#define OPER(X) dst[(X)] = src1[(X)] * constant
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::AddAssign16

  dst[i] += src[i];
============
*/
void VPCALL idSIMD_Generic::AddAssign16( float *dst, const float *src, const int count ) {
#define OPER(X) dst[(X)] += src[(X)]
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::SubAssign16

  dst[i] -= src[i];
============
*/
void VPCALL idSIMD_Generic::SubAssign16( float *dst, const float *src, const int count ) {
#define OPER(X) dst[(X)] -= src[(X)]
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::MulAssign16

  dst[i] *= constant;
============
*/
void VPCALL idSIMD_Generic::MulAssign16( float *dst, const float constant, const int count ) {
#define OPER(X) dst[(X)] *= constant
	UNROLL1(OPER)
#undef OPER
}

/*
============
idSIMD_Generic::MatX_MultiplyVecX

  dst = mat * vec

  One dot product per matrix row. Column counts 1 to 6 are written out
  explicitly; every other column count takes the generic inner loop.
  The matrix storage is walked row by row (mPtr advances by the column
  count after each row).
============
*/
void VPCALL idSIMD_Generic::MatX_MultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	int i, j, numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	// vec must supply one element per column, dst must hold one element per row
	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numRows = mat.GetNumRows();
	switch( mat.GetNumColumns() ) {
		case 1:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] = mPtr[0] * vPtr[0];
				mPtr++;
			}
			break;
		case 2:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
				mPtr += 2;
			}
			break;
		case 3:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
				mPtr += 3;
			}
			break;
		case 4:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3];
				mPtr += 4;
			}
			break;
		case 5:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
				mPtr += 5;
			}
			break;
		case 6:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] = mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
				mPtr += 6;
			}
			break;
		default:
			// generic path for any other column count
			// NOTE(review): a column count of 0 also lands here and would still read
			// mPtr[0] and vPtr[0]; presumably callers never pass such a matrix -- verify.
			int numColumns = mat.GetNumColumns();
			for ( i = 0; i < numRows; i++ ) {
				float sum = mPtr[0] * vPtr[0];
				for ( j = 1; j < numColumns; j++ ) {
					sum += mPtr[j] * vPtr[j];
				}
				dstPtr[i] = sum;
				mPtr += numColumns;
			}
			break;
	}
}

/*
============
idSIMD_Generic::MatX_MultiplyAddVecX

  dst += mat * vec

  Same structure as MatX_MultiplyVecX, but each row's dot product is
  added to the existing dst element.
============
*/
void VPCALL idSIMD_Generic::MatX_MultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	int i, j, numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numRows = mat.GetNumRows();
	switch( mat.GetNumColumns() ) {
		case 1:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] += mPtr[0] * vPtr[0];
				mPtr++;
			}
			break;
		case 2:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
				mPtr += 2;
			}
			break;
		case 3:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
				mPtr += 3;
			}
			break;
		case 4:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3];
				mPtr += 4;
			}
			break;
		case 5:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
				mPtr += 5;
			}
			break;
		case 6:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] += mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
				mPtr += 6;
			}
			break;
		default:
			// generic path for any other column count
			int numColumns = mat.GetNumColumns();
			for ( i = 0; i < numRows; i++ ) {
				float sum = mPtr[0] * vPtr[0];
				for ( j = 1; j < numColumns; j++ ) {
					sum += mPtr[j] * vPtr[j];
				}
				dstPtr[i] += sum;
				mPtr += numColumns;
			}
			break;
	}
}

/*
============
idSIMD_Generic::MatX_MultiplySubVecX

  dst -= mat * vec

  Same structure as MatX_MultiplyVecX, but each row's dot product is
  subtracted from the existing dst element.
============
*/
void VPCALL idSIMD_Generic::MatX_MultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	int i, j, numRows;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumColumns() );
	assert( dst.GetSize() >= mat.GetNumRows() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numRows = mat.GetNumRows();
	switch( mat.GetNumColumns() ) {
		case 1:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] -= mPtr[0] * vPtr[0];
				mPtr++;
			}
			break;
		case 2:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1];
				mPtr += 2;
			}
			break;
		case 3:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2];
				mPtr += 3;
			}
			break;
		case 4:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3];
				mPtr += 4;
			}
			break;
		case 5:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4];
				mPtr += 5;
			}
			break;
		case 6:
			for ( i = 0; i < numRows; i++ ) {
				dstPtr[i] -= mPtr[0] * vPtr[0] + mPtr[1] * vPtr[1] + mPtr[2] * vPtr[2] +
							mPtr[3] * vPtr[3] + mPtr[4] * vPtr[4] + mPtr[5] * vPtr[5];
				mPtr += 6;
			}
			break;
		default:
			// generic path for any other column count
			int numColumns = mat.GetNumColumns();
			for ( i = 0; i < numRows; i++ ) {
				float sum = mPtr[0] * vPtr[0];
				for ( j = 1; j < numColumns; j++ ) {
					sum += mPtr[j] * vPtr[j];
				}
				dstPtr[i] -= sum;
				mPtr += numColumns;
} break; } }

/*
============
idSIMD_Generic::MatX_TransposeMultiplyVecX

  dst = transpose( mat ) * vec

  One output element per matrix column: column i of mat is combined with
  vec. Row counts 1 to 6 are written out explicitly; every other row
  count takes the generic inner loop. Consecutive elements of a column
  are numColumns floats apart in the matrix storage.
============
*/
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	int i, j, numColumns;
	const float *mPtr, *vPtr;
	float *dstPtr;

	// vec must supply one element per row, dst must hold one element per column
	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numColumns = mat.GetNumColumns();
	switch( mat.GetNumRows() ) {
		case 1:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] = *(mPtr) * vPtr[0];
				mPtr++;
			}
			break;
		case 2:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
				mPtr++;
			}
			break;
		case 3:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
				mPtr++;
			}
			break;
		case 4:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3];
				mPtr++;
			}
			break;
		case 5:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
				mPtr++;
			}
			break;
		case 6:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] = *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
				mPtr++;
			}
			break;
		default:
			// generic path: restart at the top of column i and step down one row at a time
			int numRows = mat.GetNumRows();
			for ( i = 0; i < numColumns; i++ ) {
				mPtr = mat.ToFloatPtr() + i;
				float sum = mPtr[0] * vPtr[0];
				for ( j = 1; j < numRows; j++ ) {
					mPtr += numColumns;
					sum += mPtr[0] * vPtr[j];
				}
				dstPtr[i] = sum;
			}
			break;
	}
}

/*
============
idSIMD_Generic::MatX_TransposeMultiplyAddVecX

  dst += transpose( mat ) * vec

  Same structure as MatX_TransposeMultiplyVecX, but each result is added
  to the existing dst element.
============
*/
void VPCALL idSIMD_Generic::MatX_TransposeMultiplyAddVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	int i, j, numColumns;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numColumns = mat.GetNumColumns();
	switch( mat.GetNumRows() ) {
		case 1:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] += *(mPtr) * vPtr[0];
				mPtr++;
			}
			break;
		case 2:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
				mPtr++;
			}
			break;
		case 3:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
				mPtr++;
			}
			break;
		case 4:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3];
				mPtr++;
			}
			break;
		case 5:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
				mPtr++;
			}
			break;
		case 6:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] += *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
				mPtr++;
			}
			break;
		default:
			// generic path: restart at the top of column i and step down one row at a time
			int numRows = mat.GetNumRows();
			for ( i = 0; i < numColumns; i++ ) {
				mPtr = mat.ToFloatPtr() + i;
				float sum = mPtr[0] * vPtr[0];
				for ( j = 1; j < numRows; j++ ) {
					mPtr += numColumns;
					sum += mPtr[0] * vPtr[j];
				}
				dstPtr[i] += sum;
			}
			break;
	}
}

/*
============
idSIMD_Generic::MatX_TransposeMultiplySubVecX

  dst -= transpose( mat ) * vec

  Same structure as MatX_TransposeMultiplyVecX, but each result is
  subtracted from the existing dst element.
============
*/
void VPCALL idSIMD_Generic::MatX_TransposeMultiplySubVecX( idVecX &dst, const idMatX &mat, const idVecX &vec ) {
	int i, numColumns;
	const float *mPtr, *vPtr;
	float *dstPtr;

	assert( vec.GetSize() >= mat.GetNumRows() );
	assert( dst.GetSize() >= mat.GetNumColumns() );

	mPtr = mat.ToFloatPtr();
	vPtr = vec.ToFloatPtr();
	dstPtr = dst.ToFloatPtr();
	numColumns = mat.GetNumColumns();
	switch( mat.GetNumRows() ) {
		case 1:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] -= *(mPtr) * vPtr[0];
				mPtr++;
			}
			break;
		case 2:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1];
				mPtr++;
			}
			break;
		case 3:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2];
				mPtr++;
			}
			break;
		case 4:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3];
				mPtr++;
			}
			break;
		case 5:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4];
				mPtr++;
			}
			break;
		case 6:
			for ( i = 0; i < numColumns; i++ ) {
				dstPtr[i] -= *(mPtr) * vPtr[0] + *(mPtr+numColumns) * vPtr[1] + *(mPtr+2*numColumns) * vPtr[2] +
						*(mPtr+3*numColumns) * vPtr[3] + *(mPtr+4*numColumns) * vPtr[4] + *(mPtr+5*numColumns) * vPtr[5];
				mPtr++;
			}
			break;
		default:
			// generic path: restart at the top of column i and step down one row at a time
			int numRows = mat.GetNumRows();
			for ( i = 0; i < numColumns; i++ ) {
				mPtr = mat.ToFloatPtr() + i;
				float sum = mPtr[0] * vPtr[0];
				for ( int j = 1; j < numRows; j++ ) {
					mPtr += numColumns;
					sum += mPtr[0] * vPtr[j];
				}
				dstPtr[i] -= sum;
			}
			break;
	}
}

/*
============
idSIMD_Generic::MatX_MultiplyMatX

  optimizes the following matrix multiplications:

  NxN * Nx6
  6xN * Nx6
  Nx6 * 6xN
  6x6 * 6xN

  with N in the range [1-6].
============ */ void VPCALL idSIMD_Generic::MatX_MultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) { int i, j, k, l, n; float *dstPtr; const float *m1Ptr, *m2Ptr; double sum; assert( m1.GetNumColumns() == m2.GetNumRows() ); dstPtr = dst.ToFloatPtr(); m1Ptr = m1.ToFloatPtr(); m2Ptr = m2.ToFloatPtr(); k = m1.GetNumRows(); l = m2.GetNumColumns(); switch( m1.GetNumColumns() ) { case 1: { if ( l == 6 ) { for ( i = 0; i < k; i++ ) { // Nx1 * 1x6 *dstPtr++ = m1Ptr[i] * m2Ptr[0]; *dstPtr++ = m1Ptr[i] * m2Ptr[1]; *dstPtr++ = m1Ptr[i] * m2Ptr[2]; *dstPtr++ = m1Ptr[i] * m2Ptr[3]; *dstPtr++ = m1Ptr[i] * m2Ptr[4]; *dstPtr++ = m1Ptr[i] * m2Ptr[5]; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0]; m2Ptr++; } m1Ptr++; } break; } case 2: { if ( l == 6 ) { for ( i = 0; i < k; i++ ) { // Nx2 * 2x6 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6]; *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7]; *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8]; *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9]; *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10]; *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11]; m1Ptr += 2; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l]; m2Ptr++; } m1Ptr += 2; } break; } case 3: { if ( l == 6 ) { for ( i = 0; i < k; i++ ) { // Nx3 * 3x6 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12]; *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13]; *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14]; *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15]; *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16]; *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17]; m1Ptr += 3; } return; } for ( i = 0; i < k; i++ ) { 
m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l]; m2Ptr++; } m1Ptr += 3; } break; } case 4: { if ( l == 6 ) { for ( i = 0; i < k; i++ ) { // Nx4 * 4x6 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18]; *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19]; *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20]; *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21]; *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22]; *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23]; m1Ptr += 4; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l]; m2Ptr++; } m1Ptr += 4; } break; } case 5: { if ( l == 6 ) { for ( i = 0; i < k; i++ ) { // Nx5 * 5x6 *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[6] + m1Ptr[2] * m2Ptr[12] + m1Ptr[3] * m2Ptr[18] + m1Ptr[4] * m2Ptr[24]; *dstPtr++ = m1Ptr[0] * m2Ptr[1] + m1Ptr[1] * m2Ptr[7] + m1Ptr[2] * m2Ptr[13] + m1Ptr[3] * m2Ptr[19] + m1Ptr[4] * m2Ptr[25]; *dstPtr++ = m1Ptr[0] * m2Ptr[2] + m1Ptr[1] * m2Ptr[8] + m1Ptr[2] * m2Ptr[14] + m1Ptr[3] * m2Ptr[20] + m1Ptr[4] * m2Ptr[26]; *dstPtr++ = m1Ptr[0] * m2Ptr[3] + m1Ptr[1] * m2Ptr[9] + m1Ptr[2] * m2Ptr[15] + m1Ptr[3] * m2Ptr[21] + m1Ptr[4] * m2Ptr[27]; *dstPtr++ = m1Ptr[0] * m2Ptr[4] + m1Ptr[1] * m2Ptr[10] + m1Ptr[2] * m2Ptr[16] + m1Ptr[3] * m2Ptr[22] + m1Ptr[4] * m2Ptr[28]; *dstPtr++ = m1Ptr[0] * m2Ptr[5] + m1Ptr[1] * m2Ptr[11] + m1Ptr[2] * m2Ptr[17] + m1Ptr[3] * m2Ptr[23] + m1Ptr[4] * m2Ptr[29]; m1Ptr += 5; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) 
{ *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l]; m2Ptr++; } m1Ptr += 5; } break; } case 6: { switch( k ) { case 1: { if ( l == 1 ) { // 1x6 * 6x1 dstPtr[0] = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[1] + m1Ptr[2] * m2Ptr[2] + m1Ptr[3] * m2Ptr[3] + m1Ptr[4] * m2Ptr[4] + m1Ptr[5] * m2Ptr[5]; return; } break; } case 2: { if ( l == 2 ) { // 2x6 * 6x2 for ( i = 0; i < 2; i++ ) { for ( j = 0; j < 2; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ] + m1Ptr[1] * m2Ptr[ 1 * 2 + j ] + m1Ptr[2] * m2Ptr[ 2 * 2 + j ] + m1Ptr[3] * m2Ptr[ 3 * 2 + j ] + m1Ptr[4] * m2Ptr[ 4 * 2 + j ] + m1Ptr[5] * m2Ptr[ 5 * 2 + j ]; dstPtr++; } m1Ptr += 6; } return; } break; } case 3: { if ( l == 3 ) { // 3x6 * 6x3 for ( i = 0; i < 3; i++ ) { for ( j = 0; j < 3; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ] + m1Ptr[1] * m2Ptr[ 1 * 3 + j ] + m1Ptr[2] * m2Ptr[ 2 * 3 + j ] + m1Ptr[3] * m2Ptr[ 3 * 3 + j ] + m1Ptr[4] * m2Ptr[ 4 * 3 + j ] + m1Ptr[5] * m2Ptr[ 5 * 3 + j ]; dstPtr++; } m1Ptr += 6; } return; } break; } case 4: { if ( l == 4 ) { // 4x6 * 6x4 for ( i = 0; i < 4; i++ ) { for ( j = 0; j < 4; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ] + m1Ptr[1] * m2Ptr[ 1 * 4 + j ] + m1Ptr[2] * m2Ptr[ 2 * 4 + j ] + m1Ptr[3] * m2Ptr[ 3 * 4 + j ] + m1Ptr[4] * m2Ptr[ 4 * 4 + j ] + m1Ptr[5] * m2Ptr[ 5 * 4 + j ]; dstPtr++; } m1Ptr += 6; } return; } } case 5: { if ( l == 5 ) { // 5x6 * 6x5 for ( i = 0; i < 5; i++ ) { for ( j = 0; j < 5; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ] + m1Ptr[1] * m2Ptr[ 1 * 5 + j ] + m1Ptr[2] * m2Ptr[ 2 * 5 + j ] + m1Ptr[3] * m2Ptr[ 3 * 5 + j ] + m1Ptr[4] * m2Ptr[ 4 * 5 + j ] + m1Ptr[5] * m2Ptr[ 5 * 5 + j ]; dstPtr++; } m1Ptr += 6; } return; } } case 6: { switch( l ) { case 1: { // 6x6 * 6x1 for ( i = 0; i < 6; i++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 1 ] + m1Ptr[1] * m2Ptr[ 1 * 1 ] + m1Ptr[2] * m2Ptr[ 2 * 1 ] + m1Ptr[3] * m2Ptr[ 3 * 1 ] + m1Ptr[4] * m2Ptr[ 4 * 1 ] + m1Ptr[5] * m2Ptr[ 5 * 1 ]; dstPtr++; 
m1Ptr += 6; } return; } case 2: { // 6x6 * 6x2 for ( i = 0; i < 6; i++ ) { for ( j = 0; j < 2; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 2 + j ] + m1Ptr[1] * m2Ptr[ 1 * 2 + j ] + m1Ptr[2] * m2Ptr[ 2 * 2 + j ] + m1Ptr[3] * m2Ptr[ 3 * 2 + j ] + m1Ptr[4] * m2Ptr[ 4 * 2 + j ] + m1Ptr[5] * m2Ptr[ 5 * 2 + j ]; dstPtr++; } m1Ptr += 6; } return; } case 3: { // 6x6 * 6x3 for ( i = 0; i < 6; i++ ) { for ( j = 0; j < 3; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 3 + j ] + m1Ptr[1] * m2Ptr[ 1 * 3 + j ] + m1Ptr[2] * m2Ptr[ 2 * 3 + j ] + m1Ptr[3] * m2Ptr[ 3 * 3 + j ] + m1Ptr[4] * m2Ptr[ 4 * 3 + j ] + m1Ptr[5] * m2Ptr[ 5 * 3 + j ]; dstPtr++; } m1Ptr += 6; } return; } case 4: { // 6x6 * 6x4 for ( i = 0; i < 6; i++ ) { for ( j = 0; j < 4; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 4 + j ] + m1Ptr[1] * m2Ptr[ 1 * 4 + j ] + m1Ptr[2] * m2Ptr[ 2 * 4 + j ] + m1Ptr[3] * m2Ptr[ 3 * 4 + j ] + m1Ptr[4] * m2Ptr[ 4 * 4 + j ] + m1Ptr[5] * m2Ptr[ 5 * 4 + j ]; dstPtr++; } m1Ptr += 6; } return; } case 5: { // 6x6 * 6x5 for ( i = 0; i < 6; i++ ) { for ( j = 0; j < 5; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 5 + j ] + m1Ptr[1] * m2Ptr[ 1 * 5 + j ] + m1Ptr[2] * m2Ptr[ 2 * 5 + j ] + m1Ptr[3] * m2Ptr[ 3 * 5 + j ] + m1Ptr[4] * m2Ptr[ 4 * 5 + j ] + m1Ptr[5] * m2Ptr[ 5 * 5 + j ]; dstPtr++; } m1Ptr += 6; } return; } case 6: { // 6x6 * 6x6 for ( i = 0; i < 6; i++ ) { for ( j = 0; j < 6; j++ ) { *dstPtr = m1Ptr[0] * m2Ptr[ 0 * 6 + j ] + m1Ptr[1] * m2Ptr[ 1 * 6 + j ] + m1Ptr[2] * m2Ptr[ 2 * 6 + j ] + m1Ptr[3] * m2Ptr[ 3 * 6 + j ] + m1Ptr[4] * m2Ptr[ 4 * 6 + j ] + m1Ptr[5] * m2Ptr[ 5 * 6 + j ]; dstPtr++; } m1Ptr += 6; } return; } } } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[1] * m2Ptr[l] + m1Ptr[2] * m2Ptr[2*l] + m1Ptr[3] * m2Ptr[3*l] + m1Ptr[4] * m2Ptr[4*l] + m1Ptr[5] * m2Ptr[5*l]; m2Ptr++; } m1Ptr += 6; } break; } default: { for ( i = 0; i < k; i++ ) { for ( j = 0; j < l; j++ ) { m2Ptr = m2.ToFloatPtr() + j; sum = m1Ptr[0] * m2Ptr[0]; 
for ( n = 1; n < m1.GetNumColumns(); n++ ) { m2Ptr += l; sum += m1Ptr[n] * m2Ptr[0]; } *dstPtr++ = sum; } m1Ptr += m1.GetNumColumns(); } break; } } } /* ============ idSIMD_Generic::MatX_TransposeMultiplyMatX optimizes the following tranpose matrix multiplications: Nx6 * NxN 6xN * 6x6 with N in the range [1-6]. ============ */ void VPCALL idSIMD_Generic::MatX_TransposeMultiplyMatX( idMatX &dst, const idMatX &m1, const idMatX &m2 ) { int i, j, k, l, n; float *dstPtr; const float *m1Ptr, *m2Ptr; double sum; assert( m1.GetNumRows() == m2.GetNumRows() ); m1Ptr = m1.ToFloatPtr(); m2Ptr = m2.ToFloatPtr(); dstPtr = dst.ToFloatPtr(); k = m1.GetNumColumns(); l = m2.GetNumColumns(); switch( m1.GetNumRows() ) { case 1: if ( k == 6 && l == 1 ) { // 1x6 * 1x1 for ( i = 0; i < 6; i++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0]; m1Ptr++; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0]; m2Ptr++; } m1Ptr++; } break; case 2: if ( k == 6 && l == 2 ) { // 2x6 * 2x2 for ( i = 0; i < 6; i++ ) { *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+0] + m1Ptr[1*6] * m2Ptr[1*2+0]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*2+1] + m1Ptr[1*6] * m2Ptr[1*2+1]; m1Ptr++; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l]; m2Ptr++; } m1Ptr++; } break; case 3: if ( k == 6 && l == 3 ) { // 3x6 * 3x3 for ( i = 0; i < 6; i++ ) { *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+0] + m1Ptr[1*6] * m2Ptr[1*3+0] + m1Ptr[2*6] * m2Ptr[2*3+0]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+1] + m1Ptr[1*6] * m2Ptr[1*3+1] + m1Ptr[2*6] * m2Ptr[2*3+1]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*3+2] + m1Ptr[1*6] * m2Ptr[1*3+2] + m1Ptr[2*6] * m2Ptr[2*3+2]; m1Ptr++; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l]; m2Ptr++; } m1Ptr++; } break; case 4: if ( k == 6 && l == 4 ) { // 4x6 * 4x4 for 
( i = 0; i < 6; i++ ) { *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+0] + m1Ptr[1*6] * m2Ptr[1*4+0] + m1Ptr[2*6] * m2Ptr[2*4+0] + m1Ptr[3*6] * m2Ptr[3*4+0]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+1] + m1Ptr[1*6] * m2Ptr[1*4+1] + m1Ptr[2*6] * m2Ptr[2*4+1] + m1Ptr[3*6] * m2Ptr[3*4+1]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+2] + m1Ptr[1*6] * m2Ptr[1*4+2] + m1Ptr[2*6] * m2Ptr[2*4+2] + m1Ptr[3*6] * m2Ptr[3*4+2]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*4+3] + m1Ptr[1*6] * m2Ptr[1*4+3] + m1Ptr[2*6] * m2Ptr[2*4+3] + m1Ptr[3*6] * m2Ptr[3*4+3]; m1Ptr++; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l]; m2Ptr++; } m1Ptr++; } break; case 5: if ( k == 6 && l == 5 ) { // 5x6 * 5x5 for ( i = 0; i < 6; i++ ) { *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+0] + m1Ptr[1*6] * m2Ptr[1*5+0] + m1Ptr[2*6] * m2Ptr[2*5+0] + m1Ptr[3*6] * m2Ptr[3*5+0] + m1Ptr[4*6] * m2Ptr[4*5+0]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+1] + m1Ptr[1*6] * m2Ptr[1*5+1] + m1Ptr[2*6] * m2Ptr[2*5+1] + m1Ptr[3*6] * m2Ptr[3*5+1] + m1Ptr[4*6] * m2Ptr[4*5+1]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+2] + m1Ptr[1*6] * m2Ptr[1*5+2] + m1Ptr[2*6] * m2Ptr[2*5+2] + m1Ptr[3*6] * m2Ptr[3*5+2] + m1Ptr[4*6] * m2Ptr[4*5+2]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+3] + m1Ptr[1*6] * m2Ptr[1*5+3] + m1Ptr[2*6] * m2Ptr[2*5+3] + m1Ptr[3*6] * m2Ptr[3*5+3] + m1Ptr[4*6] * m2Ptr[4*5+3]; *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*5+4] + m1Ptr[1*6] * m2Ptr[1*5+4] + m1Ptr[2*6] * m2Ptr[2*5+4] + m1Ptr[3*6] * m2Ptr[3*5+4] + m1Ptr[4*6] * m2Ptr[4*5+4]; m1Ptr++; } return; } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l]; m2Ptr++; } m1Ptr++; } break; case 6: if ( l == 6 ) { switch( k ) { case 1: // 6x1 * 6x6 m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < 6; j++ ) { *dstPtr++ = m1Ptr[0*1] * m2Ptr[0*6] + 
m1Ptr[1*1] * m2Ptr[1*6] + m1Ptr[2*1] * m2Ptr[2*6] + m1Ptr[3*1] * m2Ptr[3*6] + m1Ptr[4*1] * m2Ptr[4*6] + m1Ptr[5*1] * m2Ptr[5*6]; m2Ptr++; } return; case 2: // 6x2 * 6x6 for ( i = 0; i < 2; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < 6; j++ ) { *dstPtr++ = m1Ptr[0*2] * m2Ptr[0*6] + m1Ptr[1*2] * m2Ptr[1*6] + m1Ptr[2*2] * m2Ptr[2*6] + m1Ptr[3*2] * m2Ptr[3*6] + m1Ptr[4*2] * m2Ptr[4*6] + m1Ptr[5*2] * m2Ptr[5*6]; m2Ptr++; } m1Ptr++; } return; case 3: // 6x3 * 6x6 for ( i = 0; i < 3; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < 6; j++ ) { *dstPtr++ = m1Ptr[0*3] * m2Ptr[0*6] + m1Ptr[1*3] * m2Ptr[1*6] + m1Ptr[2*3] * m2Ptr[2*6] + m1Ptr[3*3] * m2Ptr[3*6] + m1Ptr[4*3] * m2Ptr[4*6] + m1Ptr[5*3] * m2Ptr[5*6]; m2Ptr++; } m1Ptr++; } return; case 4: // 6x4 * 6x6 for ( i = 0; i < 4; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < 6; j++ ) { *dstPtr++ = m1Ptr[0*4] * m2Ptr[0*6] + m1Ptr[1*4] * m2Ptr[1*6] + m1Ptr[2*4] * m2Ptr[2*6] + m1Ptr[3*4] * m2Ptr[3*6] + m1Ptr[4*4] * m2Ptr[4*6] + m1Ptr[5*4] * m2Ptr[5*6]; m2Ptr++; } m1Ptr++; } return; case 5: // 6x5 * 6x6 for ( i = 0; i < 5; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < 6; j++ ) { *dstPtr++ = m1Ptr[0*5] * m2Ptr[0*6] + m1Ptr[1*5] * m2Ptr[1*6] + m1Ptr[2*5] * m2Ptr[2*6] + m1Ptr[3*5] * m2Ptr[3*6] + m1Ptr[4*5] * m2Ptr[4*6] + m1Ptr[5*5] * m2Ptr[5*6]; m2Ptr++; } m1Ptr++; } return; case 6: // 6x6 * 6x6 for ( i = 0; i < 6; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < 6; j++ ) { *dstPtr++ = m1Ptr[0*6] * m2Ptr[0*6] + m1Ptr[1*6] * m2Ptr[1*6] + m1Ptr[2*6] * m2Ptr[2*6] + m1Ptr[3*6] * m2Ptr[3*6] + m1Ptr[4*6] * m2Ptr[4*6] + m1Ptr[5*6] * m2Ptr[5*6]; m2Ptr++; } m1Ptr++; } return; } } for ( i = 0; i < k; i++ ) { m2Ptr = m2.ToFloatPtr(); for ( j = 0; j < l; j++ ) { *dstPtr++ = m1Ptr[0] * m2Ptr[0] + m1Ptr[k] * m2Ptr[l] + m1Ptr[2*k] * m2Ptr[2*l] + m1Ptr[3*k] * m2Ptr[3*l] + m1Ptr[4*k] * m2Ptr[4*l] + m1Ptr[5*k] * m2Ptr[5*l]; m2Ptr++; } m1Ptr++; } break; default: for ( i = 0; i < k; i++ ) { for ( j = 0; j < l; j++ ) { m1Ptr = 
m1.ToFloatPtr() + i;
				m2Ptr = m2.ToFloatPtr() + j;
				sum = m1Ptr[0] * m2Ptr[0];
				for ( n = 1; n < m1.GetNumRows(); n++ ) {
					m1Ptr += k;
					m2Ptr += l;
					sum += m1Ptr[0] * m2Ptr[0];
				}
				*dstPtr++ = sum;
			}
		}
		break;
	}
}

/*
============
idSIMD_Generic::MatX_LowerTriangularSolve

  solves x in Lx = b for the n * n sub-matrix of L
  if skip > 0 the first skip elements of x are assumed to be valid already
  L has to be a lower triangular matrix with (implicit) ones on the diagonal
  x == b is allowed

  L		matrix whose strictly-lower part is read; row stride is L.GetNumColumns()
  x		output vector, entries [skip, n) are written
  b		right hand side, entries [skip, n) are read
  n		size of the sub-matrix to solve for
  skip	number of leading entries of x that are taken as already solved
============
*/
void VPCALL idSIMD_Generic::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
#if 1

	int nc;
	const float *lptr;

	// nothing left to solve
	if ( skip >= n ) {
		return;
	}

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		// NSKIP packs ( n, skip ) into one switch key: n above the low three bits, skip in the low three bits.
		// Within one n the cases deliberately fall through, so entry at ( n, skip ) computes rows skip .. n-1.
		#define NSKIP( n, s )	((n<<3)|(s&7))
		switch( NSKIP( n, skip ) ) {
			case NSKIP( 1, 0 ): x[0] = b[0];
				return;
			case NSKIP( 2, 0 ): x[0] = b[0];
			case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
				return;
			case NSKIP( 3, 0 ): x[0] = b[0];
			case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
				return;
			case NSKIP( 4, 0 ): x[0] = b[0];
			case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				return;
			case NSKIP( 5, 0 ): x[0] = b[0];
			case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
				return;
			case NSKIP( 6, 0 ): x[0] = b[0];
			case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
				return;
			case NSKIP( 7, 0 ): x[0] = b[0];
			case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
			case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
			case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
			case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
			case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
			case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
				return;
		}
		return;
	}

	// process first 4 rows
	// (falls through from the entry row; when skip >= 4 no case matches and skip is left unchanged)
	switch( skip ) {
		case 0: x[0] = b[0];
		case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
		case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
		case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
				skip = 4;
	}

	lptr = L[skip];

	int i, j;
	register double s0, s1, s2, s3;

	// remaining rows: the dot product of row i with x[0..i) is spread over four accumulators
	for ( i = skip; i < n; i++ ) {
		s0 = lptr[0] * x[0];
		s1 = lptr[1] * x[1];
		s2 = lptr[2] * x[2];
		s3 = lptr[3] * x[3];
		// eight columns per iteration
		for ( j = 4; j < i-7; j += 8 ) {
			s0 += lptr[j+0] * x[j+0];
			s1 += lptr[j+1] * x[j+1];
			s2 += lptr[j+2] * x[j+2];
			s3 += lptr[j+3] * x[j+3];
			s0 += lptr[j+4] * x[j+4];
			s1 += lptr[j+5] * x[j+5];
			s2 += lptr[j+6] * x[j+6];
			s3 += lptr[j+7] * x[j+7];
		}
		// the 0..7 columns left before the diagonal, again by fall-through
		switch( i - j ) {
			NODEFAULT;
			case 7: s0 += lptr[j+6] * x[j+6];
			case 6: s1 += lptr[j+5] * x[j+5];
			case 5: s2 += lptr[j+4] * x[j+4];
			case 4: s3 += lptr[j+3] * x[j+3];
			case 3: s0 += lptr[j+2] * x[j+2];
			case 2: s1 += lptr[j+1] * x[j+1];
			case 1: s2 += lptr[j+0] * x[j+0];
			case 0: break;
		}
		// x[i] = b[i] - ( s0 + s1 + s2 + s3 ), computed as the negation of ( sum - b[i] )
		double sum;
		sum = s3;
		sum += s2;
		sum += s1;
		sum += s0;
		sum -= b[i];
		x[i] = -sum;
		lptr += nc;
	}

#else

	// straightforward (disabled) version of the same forward substitution
	int i, j;
	const float *lptr;
	double sum;

	for ( i = skip; i < n; i++ ) {
		sum = b[i];
		lptr = L[i];
		for ( j = 0; j < i; j++ ) {
			sum -= lptr[j] * x[j];
		}
		x[i] = sum;
	}

#endif
}

/*
============
idSIMD_Generic::MatX_LowerTriangularSolveTranspose

  solves x in L'x = b for the n * n sub-matrix of L
  L has to be a lower triangular matrix with (implicit) ones on the diagonal
  x == b is allowed
============
*/
void VPCALL idSIMD_Generic::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
#if 1

	int nc;
	const float *lptr;

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	// unrolled cases for n < 8
	// (each case solves from the last row backwards; L is read column-wise because it is used transposed)
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[1] = b[1];
				x[0] = b[0] - lptr[1*nc+0] * x[1];
				return;
			case 3:
				x[2] = b[2];
				x[1] = b[1] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 4:
				x[3] = b[3];
				x[2] = b[2] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 5:
				x[4] = b[4];
				x[3] = b[3] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 6:
				x[5] = b[5];
				x[4] = b[4] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
			case 7:
				x[6] = b[6];
				x[5] = b[5] - lptr[6*nc+5]
* x[6];
				x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
				x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
				x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
				x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
				x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
				return;
		}
		return;
	}

	int i, j;
	register double s0, s1, s2, s3;
	float *xptr;

	// lptr addresses column n-4 one row past the n * n sub-matrix, xptr one past the end of x;
	// both are walked backwards four rows at a time
	lptr = L.ToFloatPtr() + n * nc + n - 4;
	xptr = x + n;

	// process 4 rows at a time
	for ( i = n; i >= 4; i -= 4 ) {
		s0 = b[i-4];
		s1 = b[i-3];
		s2 = b[i-2];
		s3 = b[i-1];
		// process 4x4 blocks
		// (subtracts the contribution of the n-i entries of x that are already solved)
		for ( j = 0; j < n-i; j += 4 ) {
			s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
			s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
			s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
			s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
			s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
			s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
			s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
			s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
			s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
			s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
			s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
			s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
			s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
			s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
			s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
			s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
		}
		// process left over of the 4 rows
		// (the triangular 4x4 block on the diagonal, resolved inside the accumulators)
		s0 -= lptr[0-1*nc] * s3;
		s1 -= lptr[1-1*nc] * s3;
		s2 -= lptr[2-1*nc] * s3;
		s0 -= lptr[0-2*nc] * s2;
		s1 -= lptr[1-2*nc] * s2;
		s0 -= lptr[0-3*nc] * s1;
		// store result
		xptr[-4] = s0;
		xptr[-3] = s1;
		xptr[-2] = s2;
		xptr[-1] = s3;
		// update pointers for next four rows
		lptr -= 4 + 4 * nc;
		xptr -= 4;
	}
	// process left over rows
	// (i is 0..3 here; rows i-1 down to 0 are solved one at a time)
	for ( i--; i >= 0; i-- ) {
		s0 = b[i];
		lptr = L[0] + i;
		for ( j = i + 1; j < n; j++ ) {
			s0 -= lptr[j*nc] * x[j];
		}
		x[i] = s0;
	}

#else

	// straightforward (disabled) version of the same back substitution
	int i, j, nc;
	const float *ptr;
	double sum;

	nc = L.GetNumColumns();
	for ( i = n - 1; i >= 0; i-- ) {
		sum = b[i];
		ptr = L[0] + i;
		for ( j = i + 1; j < n; j++ ) {
			sum -= ptr[j*nc] * x[j];
		}
		x[i] = sum;
	}

#endif
}

/*
============
idSIMD_Generic::MatX_UpperTriangularSolve

  solves x in Ux = b for the n * n sub-matrix of U
  U has to be an upper triangular matrix with (implicit) ones on the diagonal
  x == b is allowed

  U		matrix whose strictly-upper part is read; row stride is U.GetNumColumns()
  x		output vector, entries [0, n) are written from the last to the first
  b		right hand side, entries [0, n) are read
  n		size of the sub-matrix to solve for
============
*/
void VPCALL idSIMD_Generic::MatX_UpperTriangularSolve( const idMatX &U, float *x, const float *b, const int n ) {
#if 1

	int nc;
	const float *uptr;

	uptr = U.ToFloatPtr();
	nc = U.GetNumColumns();

	// unrolled cases for n < 8
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[1] = b[1];
				x[0] = b[0] - uptr[0*nc+1] * x[1];
				return;
			case 3:
				x[2] = b[2];
				x[1] = b[1] - uptr[1*nc+2] * x[2];
				x[0] = b[0] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
				return;
			case 4:
				x[3] = b[3];
				x[2] = b[2] - uptr[2*nc+3] * x[3];
				x[1] = b[1] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
				x[0] = b[0] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
				return;
			case 5:
				x[4] = b[4];
				x[3] = b[3] - uptr[3*nc+4] * x[4];
				x[2] = b[2] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3];
				x[1] = b[1] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
				x[0] = b[0] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
				return;
			case 6:
				x[5] = b[5];
				x[4] = b[4] - uptr[4*nc+5] * x[5];
				x[3] = b[3] - uptr[3*nc+5] * x[5] - uptr[3*nc+4] * x[4];
				x[2] = b[2] - uptr[2*nc+5] * x[5] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3];
				x[1] = b[1] - uptr[1*nc+5] * x[5] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
				x[0] = b[0] - uptr[0*nc+5] * x[5] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
				return;
			case 7:
				x[6] = b[6];
				x[5] = b[5] - uptr[5*nc+6] * x[6];
				x[4] = b[4] - uptr[4*nc+6] * x[6] - uptr[4*nc+5] * x[5];
				x[3] = b[3] - uptr[3*nc+6] * x[6] - uptr[3*nc+5] * x[5] - uptr[3*nc+4] * x[4];
				x[2] = b[2] - uptr[2*nc+6] * x[6] - uptr[2*nc+5] * x[5] - uptr[2*nc+4] * x[4] - uptr[2*nc+3] * x[3];
				x[1] = b[1] - uptr[1*nc+6] * x[6] - uptr[1*nc+5] * x[5] - uptr[1*nc+4] * x[4] - uptr[1*nc+3] * x[3] - uptr[1*nc+2] * x[2];
				x[0] = b[0] - uptr[0*nc+6] * x[6] - uptr[0*nc+5] * x[5] - uptr[0*nc+4] * x[4] - uptr[0*nc+3] * x[3] - uptr[0*nc+2] * x[2] - uptr[0*nc+1] * x[1];
				return;
		}
		return;
	}

	int i, j;
	register double s0, s1, s2, s3;

	// process the last four rows
	x[n-1] = b[n-1];
	x[n-2] = b[n-2] - uptr[(n-2)*nc+(n-1)] * x[n-1];
	x[n-3] = b[n-3] - uptr[(n-3)*nc+(n-1)] * x[n-1] - uptr[(n-3)*nc+(n-2)] * x[n-2];
	x[n-4] = b[n-4] - uptr[(n-4)*nc+(n-1)] * x[n-1] - uptr[(n-4)*nc+(n-2)] * x[n-2] - uptr[(n-4)*nc+(n-3)] * x[n-3];

	uptr = U[n - 5];

	// remaining rows from n-5 up to 0: the dot product of row i with x(i, n) uses four accumulators
	for ( i = n - 5; i >= 0; i-- ) {
		s0 = uptr[i+1] * x[i+1];
		s1 = uptr[i+2] * x[i+2];
		s2 = uptr[i+3] * x[i+3];
		s3 = uptr[i+4] * x[i+4];
		// eight columns per iteration
		for ( j = i + 5; j < n-7; j += 8 ) {
			s0 += uptr[j+0] * x[j+0];
			s1 += uptr[j+1] * x[j+1];
			s2 += uptr[j+2] * x[j+2];
			s3 += uptr[j+3] * x[j+3];
			s0 += uptr[j+4] * x[j+4];
			s1 += uptr[j+5] * x[j+5];
			s2 += uptr[j+6] * x[j+6];
			s3 += uptr[j+7] * x[j+7];
		}
		// the 0..7 columns left up to n, by fall-through
		switch( n - j ) {
			NODEFAULT;
			case 7: s0 += uptr[j+6] * x[j+6];
			case 6: s1 += uptr[j+5] * x[j+5];
			case 5: s2 += uptr[j+4] * x[j+4];
			case 4: s3 += uptr[j+3] * x[j+3];
			case 3: s0 += uptr[j+2] * x[j+2];
			case 2: s1 += uptr[j+1] * x[j+1];
			case 1: s2 += uptr[j+0] * x[j+0];
			case 0: break;
		}
		// x[i] = b[i] - ( s0 + s1 + s2 + s3 ), computed as the negation of ( sum - b[i] )
		double sum;
		sum = s3;
		sum += s2;
		sum += s1;
		sum += s0;
		sum -= b[i];
		x[i] = -sum;
		uptr -= nc;
	}

#else

	// straightforward (disabled) version of the same back substitution
	int i, j;
	const float *ptr;
	double sum;

	for ( i = n - 1; i >= 0; i-- ) {
		sum = b[i];
		ptr = U[i];
		for ( j = i + 1; j < n; j++ ) {
			sum -= ptr[j] * x[j];
		}
		x[i] = sum;
	}

#endif
}

/*
============
idSIMD_Generic::MatX_UpperTriangularSolveTranspose

  solves x in U'x = b for the n * n sub-matrix of U
  U has to be an upper triangular matrix with (implicit) ones on the diagonal
  x == b is allowed
============
*/
void VPCALL idSIMD_Generic::MatX_UpperTriangularSolveTranspose( const idMatX &U, float *x, const float *b, const int n ) {
#if 1

	int nc;
	const float *uptr;

	uptr = U.ToFloatPtr();
	nc =
U.GetNumColumns();

	// unrolled cases for n < 8
	// (each case solves from the first row forwards; U is read column-wise because it is used transposed)
	if ( n < 8 ) {
		switch( n ) {
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x[0] = b[0];
				x[1] = b[1] - uptr[0*nc+1] * x[0];
				return;
			case 3:
				x[0] = b[0];
				x[1] = b[1] - uptr[0*nc+1] * x[0];
				x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
				return;
			case 4:
				x[0] = b[0];
				x[1] = b[1] - uptr[0*nc+1] * x[0];
				x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
				x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
				return;
			case 5:
				x[0] = b[0];
				x[1] = b[1] - uptr[0*nc+1] * x[0];
				x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
				x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
				x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3];
				return;
			case 6:
				x[0] = b[0];
				x[1] = b[1] - uptr[0*nc+1] * x[0];
				x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
				x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
				x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3];
				x[5] = b[5] - uptr[0*nc+5] * x[0] - uptr[1*nc+5] * x[1] - uptr[2*nc+5] * x[2] - uptr[3*nc+5] * x[3] - uptr[4*nc+5] * x[4];
				return;
			case 7:
				x[0] = b[0];
				x[1] = b[1] - uptr[0*nc+1] * x[0];
				x[2] = b[2] - uptr[0*nc+2] * x[0] - uptr[1*nc+2] * x[1];
				x[3] = b[3] - uptr[0*nc+3] * x[0] - uptr[1*nc+3] * x[1] - uptr[2*nc+3] * x[2];
				x[4] = b[4] - uptr[0*nc+4] * x[0] - uptr[1*nc+4] * x[1] - uptr[2*nc+4] * x[2] - uptr[3*nc+4] * x[3];
				x[5] = b[5] - uptr[0*nc+5] * x[0] - uptr[1*nc+5] * x[1] - uptr[2*nc+5] * x[2] - uptr[3*nc+5] * x[3] - uptr[4*nc+5] * x[4];
				x[6] = b[6] - uptr[0*nc+6] * x[0] - uptr[1*nc+6] * x[1] - uptr[2*nc+6] * x[2] - uptr[3*nc+6] * x[3] - uptr[4*nc+6] * x[4] - uptr[5*nc+6] * x[5];
				return;
		}
		return;
	}

	int i, j;
	register double s0, s1, s2, s3;

	uptr = U.ToFloatPtr();

	// process 4 columns at a time
	for ( i = 0; i < n - 3; i += 4 ) {
		s0 = b[i+0];
		s1 = b[i+1];
		s2 = b[i+2];
		s3 = b[i+3];
		// process 4x4 blocks
		// (subtracts the contribution of the entries of x that are already solved)
		for ( j = 0; j < i-3; j += 4 ) {
			s0 -= uptr[(j+0)*nc+0] * x[j+0];
			s1 -= uptr[(j+0)*nc+1] * x[j+0];
			s2 -= uptr[(j+0)*nc+2] * x[j+0];
			s3 -= uptr[(j+0)*nc+3] * x[j+0];
			s0 -= uptr[(j+1)*nc+0] * x[j+1];
			s1 -= uptr[(j+1)*nc+1] * x[j+1];
			s2 -= uptr[(j+1)*nc+2] * x[j+1];
			s3 -= uptr[(j+1)*nc+3] * x[j+1];
			s0 -= uptr[(j+2)*nc+0] * x[j+2];
			s1 -= uptr[(j+2)*nc+1] * x[j+2];
			s2 -= uptr[(j+2)*nc+2] * x[j+2];
			s3 -= uptr[(j+2)*nc+3] * x[j+2];
			s0 -= uptr[(j+3)*nc+0] * x[j+3];
			s1 -= uptr[(j+3)*nc+1] * x[j+3];
			s2 -= uptr[(j+3)*nc+2] * x[j+3];
			s3 -= uptr[(j+3)*nc+3] * x[j+3];
		}
		// process left over of the 4 columns
		// (the triangular 4x4 block on the diagonal, resolved inside the accumulators)
		s1 -= uptr[(j+0)*nc+1] * s0;
		s2 -= uptr[(j+0)*nc+2] * s0;
		s2 -= uptr[(j+1)*nc+2] * s1;
		s3 -= uptr[(j+0)*nc+3] * s0;
		s3 -= uptr[(j+1)*nc+3] * s1;
		s3 -= uptr[(j+2)*nc+3] * s2;
		// store result
		x[i+0] = s0;
		x[i+1] = s1;
		x[i+2] = s2;
		x[i+3] = s3;
		// update pointer for next four columns
		uptr += 4;
	}
	// process left over columns
	for ( ; i < n; i++ ) {
		s0 = b[i];
		uptr = U[0] + i;
		for ( j = 0; j < i; j++ ) {
			s0 -= uptr[j*nc] * x[j];
		}
		x[i] = s0;
	}

#else

	// straightforward (disabled) version of the same forward substitution
	int i, j, nc;
	const float *uptr;
	double sum;

	nc = U.GetNumColumns();
	for ( i = 0; i < n; i++ ) {
		sum = b[i];
		uptr = U.ToFloatPtr() + i;
		for ( j = 0; j < i; j++ ) {
			sum -= uptr[j*nc] * x[j];
		}
		x[i] = sum;
	}

#endif
}

/*
============
idSIMD_Generic::MatX_LU_Factor

  in-place factorization LU of the n * n sub-matrix of mat
  the reciprocal of the diagonal elements of U are stored in invDiag
  no pivoting is used

  returns false as soon as a zero is found on the diagonal, true otherwise
============
*/
bool VPCALL idSIMD_Generic::MatX_LU_Factor( idMatX &mat, idVecX &invDiag, const int n ) {
#if 1

	int i, j, k;
	float d1, d2, *ptr1, *ptr2;

	for ( i = 0; i < n; i++ ) {

		d1 = mat[i][i];

		// without pivoting a zero pivot cannot be divided by
		if ( d1 == 0.0f ) {
			return false;
		}

		invDiag[i] = d1 = 1.0f / d1;

		ptr1 = mat[i];

		for ( j = i + 1; j < n; j++ ) {

			ptr2 = mat[j];
			// multiplier for row j, stored in place below the diagonal
			ptr2[i] = d2 = ptr2[i] * d1;

			// row j -= d2 * row i for the columns right of the diagonal, sixteen at a time
			for ( k = i + 1; k < n - 15; k += 16 ) {
				ptr2[k+0] -= d2 * ptr1[k+0];
				ptr2[k+1] -= d2 * ptr1[k+1];
				ptr2[k+2] -= d2 * ptr1[k+2];
				ptr2[k+3] -= d2 * ptr1[k+3];
				ptr2[k+4] -= d2 * ptr1[k+4];
				ptr2[k+5] -= d2 * ptr1[k+5];
				ptr2[k+6] -= d2 * ptr1[k+6];
				ptr2[k+7] -= d2 * ptr1[k+7];
				ptr2[k+8] -= d2 * ptr1[k+8];
				ptr2[k+9] -= d2 * ptr1[k+9];
				ptr2[k+10] -= d2 * ptr1[k+10];
				ptr2[k+11] -= d2 * ptr1[k+11];
				ptr2[k+12] -= d2 * ptr1[k+12];
				ptr2[k+13] -= d2 * ptr1[k+13];
				ptr2[k+14] -= d2 * ptr1[k+14];
				ptr2[k+15] -= d2 * ptr1[k+15];
			}
			// the 0..15 columns left over, by fall-through
			switch( n - k ) {
				NODEFAULT;
				case 15: ptr2[k+14] -= d2 * ptr1[k+14];
				case 14: ptr2[k+13] -= d2 * ptr1[k+13];
				case 13: ptr2[k+12] -= d2 * ptr1[k+12];
				case 12: ptr2[k+11] -= d2 * ptr1[k+11];
				case 11: ptr2[k+10] -= d2 * ptr1[k+10];
				case 10: ptr2[k+9] -= d2 * ptr1[k+9];
				case 9: ptr2[k+8] -= d2 * ptr1[k+8];
				case 8: ptr2[k+7] -= d2 * ptr1[k+7];
				case 7: ptr2[k+6] -= d2 * ptr1[k+6];
				case 6: ptr2[k+5] -= d2 * ptr1[k+5];
				case 5: ptr2[k+4] -= d2 * ptr1[k+4];
				case 4: ptr2[k+3] -= d2 * ptr1[k+3];
				case 3: ptr2[k+2] -= d2 * ptr1[k+2];
				case 2: ptr2[k+1] -= d2 * ptr1[k+1];
				case 1: ptr2[k+0] -= d2 * ptr1[k+0];
				case 0: break;
			}
		}
	}

	return true;

#else

	// straightforward (disabled) version of the same factorization
	int i, j, k;
	float d;

	for ( i = 0; i < n; i++ ) {

		if ( mat[i][i] == 0.0f ) {
			return false;
		}
		invDiag[i] = d = 1.0f / mat[i][i];
		for ( j = i + 1; j < n; j++ ) {
			mat[j][i] *= d;
		}
		for ( j = i + 1; j < n; j++ ) {
			d = mat[j][i];
			for ( k = i + 1; k < n; k++ ) {
				mat[j][k] -= d * mat[i][k];
			}
		}
	}

	return true;

#endif
}

/*
============
idSIMD_Generic::MatX_LDLT_Factor

  in-place factorization LDL' of the n * n sub-matrix of mat
  the reciprocal of the diagonal elements are stored in invDiag

  returns false as soon as a diagonal element of D evaluates to zero, true otherwise
============
*/
bool VPCALL idSIMD_Generic::MatX_LDLT_Factor( idMatX &mat, idVecX &invDiag, const int n ) {
#if 1

	int i, j, k, nc;
	float *v, *diag, *mptr;
	double s0, s1, s2, s3, sum, d;

	// scratch: v holds diag[k] * L[i][k] for the current row, diag the diagonal of D found so far
	// NOTE(review): both allocations are sized from n before the n <= 0 check below - confirm callers never pass a negative n
	v = (float *) _alloca16( n * sizeof( float ) );
	diag = (float *) _alloca16( n * sizeof( float ) );

	nc = mat.GetNumColumns();

	if ( n <= 0 ) {
		return true;
	}

	// row 0
	mptr = mat[0];

	sum = mptr[0];

	if ( sum == 0.0f ) {
		return false;
	}

	diag[0] = sum;
	invDiag[0] = d = 1.0f / sum;

	if ( n <= 1 ) {
		return true;
	}

	// scale column 0 below the diagonal
	mptr = mat[0];
	for ( j = 1; j < n; j++ ) {
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
	}

	// row 1
	mptr = mat[1];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	sum = mptr[1] - s0;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[1][1] = sum;
	diag[1] = sum;
	invDiag[1] = d = 1.0f / sum;

	if ( n <= 2 ) {
		return true;
	}

	// column 1 below the diagonal
	mptr = mat[0];
	for ( j = 2; j < n; j++ ) {
		mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
	}

	// row 2
	mptr = mat[2];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	sum = mptr[2] - s0 - s1;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[2][2] = sum;
	diag[2] = sum;
	invDiag[2] = d = 1.0f / sum;

	if ( n <= 3 ) {
		return true;
	}

	// column 2 below the diagonal
	mptr = mat[0];
	for ( j = 3; j < n; j++ ) {
		mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
	}

	// row 3
	mptr = mat[3];

	v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
	v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
	v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
	sum = mptr[3] - s0 - s1 - s2;

	if ( sum == 0.0f ) {
		return false;
	}

	mat[3][3] = sum;
	diag[3] = sum;
	invDiag[3] = d = 1.0f / sum;

	if ( n <= 4 ) {
		return true;
	}

	// column 3 below the diagonal
	mptr = mat[0];
	for ( j = 4; j < n; j++ ) {
		mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
	}

	// general case for rows 4 .. n-1
	for ( i = 4; i < n; i++ ) {

		mptr = mat[i];

		// build v[0..i) and accumulate sum( v[k] * L[i][k] ) in four partial sums
		v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
		v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
		v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
		v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
		for ( k = 4; k < i-3; k += 4 ) {
			v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
			v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
			v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
			v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
		}
		// the 0..3 columns left before the diagonal, by fall-through
		switch( i - k ) {
			NODEFAULT;
			case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
			case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
			case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
			case 0: break;
		}
		sum = s3;
		sum += s2;
		sum += s1;
		sum += s0;
		// diagonal element i of D
		sum = mptr[i] - sum;

		if ( sum == 0.0f ) {
			return false;
		}

		mat[i][i] = sum;
		diag[i] = sum;
		invDiag[i] = d = 1.0f / sum;

		if ( i + 1 >= n ) {
			return true;
		}

		// column i below the diagonal: L[j][i] = ( mat[j][i] - dot( L[j][0..i), v ) ) * d
		mptr = mat[i+1];
		for ( j = i+1; j < n; j++ ) {
			s0 = mptr[0] * v[0];
			s1 = mptr[1] * v[1];
			s2 = mptr[2] * v[2];
			s3 = mptr[3] * v[3];
			for ( k = 4; k < i-7; k += 8 ) {
				s0 += mptr[k+0] * v[k+0];
				s1 += mptr[k+1] * v[k+1];
				s2 += mptr[k+2] * v[k+2];
				s3 += mptr[k+3] * v[k+3];
				s0 += mptr[k+4] * v[k+4];
				s1 += mptr[k+5] * v[k+5];
				s2 += mptr[k+6] * v[k+6];
				s3 += mptr[k+7] * v[k+7];
			}
			// the 0..7 columns left before column i, by fall-through
			switch( i - k ) {
				NODEFAULT;
				case 7: s0 += mptr[k+6] * v[k+6];
				case 6: s1 += mptr[k+5] * v[k+5];
				case 5: s2 += mptr[k+4] * v[k+4];
				case 4: s3 += mptr[k+3] * v[k+3];
				case 3: s0 += mptr[k+2] * v[k+2];
				case 2: s1 += mptr[k+1] * v[k+1];
				case 1: s2 += mptr[k+0] * v[k+0];
				case 0: break;
			}
			sum = s3;
			sum += s2;
			sum += s1;
			sum += s0;
			mptr[i] = ( mptr[i] - sum ) * d;
			mptr += nc;
		}
	}

	return true;

#else

	// straightforward (disabled) version of the same factorization
	int i, j, k, nc;
	float *v, *ptr, *diagPtr;
	double d, sum;

	v = (float *) _alloca16( n * sizeof( float ) );
	nc = mat.GetNumColumns();

	for ( i = 0; i < n; i++ ) {

		ptr = mat[i];
		diagPtr = mat[0];
		sum = ptr[i];
		for ( j = 0; j < i; j++ ) {
			d = ptr[j];
			v[j] = diagPtr[0] * d;
			sum -= v[j] * d;
			// step to the next diagonal element
			diagPtr += nc + 1;
		}

		if ( sum == 0.0f ) {
			return false;
		}

		diagPtr[0] = sum;
		invDiag[i] = d = 1.0f / sum;

		if ( i + 1 >= n ) {
			continue;
		}

		ptr = mat[i+1];
		for ( j = i + 1; j < n; j++ ) {
			sum = ptr[i];
			for ( k = 0; k < i; k++ ) {
				sum -= ptr[k] * v[k];
			}
			ptr[i] = sum * d;
			ptr += nc;
		}
	}

	return true;

#endif
}

/*
============
idSIMD_Generic::DecompressJoints

  for each of the numJoints entries of index, expands the compressed joint with that
  number into joints: q from ToQuat(), t from ToOffset(), w set to zero
============
*/
void VPCALL idSIMD_Generic::DecompressJoints( idJointQuat *joints, const idCompressedJointQuat *compressedJoints, const int *index, const int numJoints ) {
	for ( int i = 0; i < numJoints; i++ ) {
		// index holds the joint numbers to process; source and destination use the same number
		int j = index[i];
		joints[j].q = compressedJoints[j].ToQuat();
		joints[j].t = compressedJoints[j].ToOffset();
		joints[j].w = 0.0f;
	}
}

/*
============
SlerpUnoptimized
============
*/
void SlerpUnoptimized( const idQuat &from, const
idQuat &to, float t, idQuat &result ) { float cosom, absCosom, sinom, omega, scale0, scale1; cosom = from.x * to.x + from.y * to.y + from.z * to.z + from.w * to.w; absCosom = fabs( cosom ); if ( ( 1.0f - absCosom ) > 1e-6f ) { omega = acos( absCosom ); sinom = 1.0f / sin( omega ); scale0 = sin( ( 1.0f - t ) * omega ) * sinom; scale1 = sin( t * omega ) * sinom; } else { scale0 = 1.0f - t; scale1 = t; } scale1 = ( cosom >= 0.0f ) ? scale1 : -scale1; result.x = scale0 * from.x + scale1 * to.x; result.y = scale0 * from.y + scale1 * to.y; result.z = scale0 * from.z + scale1 * to.z; result.w = scale0 * from.w + scale1 * to.w; } /* ============ SlerpOptimized ============ */ void SlerpOptimized( const idQuat &from, const idQuat &to, float t, idQuat &result ) { float cosom, absCosom, sinom, sinSqr, omega, scale0, scale1; cosom = from.x * to.x + from.y * to.y + from.z * to.z + from.w * to.w; absCosom = fabs( cosom ); if ( ( 1.0f - absCosom ) > 1e-6f ) { sinSqr = 1.0f - cosom * cosom; //sinom = 1.0f / sqrt( sinSqr ); { long i; float y, r; y = sinSqr * 0.5f; i = *reinterpret_cast( &sinSqr ); i = 0x5f3759df - ( i >> 1 ); r = *reinterpret_cast( &i ); sinom = r * ( 1.5f - r * r * y ); } //omega = atan2( sinSqr * sinom, absCosom ); { float y, a, d, s; y = sinSqr * sinom; if ( y > absCosom ) { a = -absCosom / y; d = idMath::HALF_PI; } else { a = y / absCosom; d = 0.0f; } s = a * a; omega = ( ( ( ( ( ( ( ( ( 0.0028662257f * s - 0.0161657367f ) * s + 0.0429096138f ) * s - 0.0752896400f ) * s + 0.1065626393f ) * s - 0.1420889944f ) * s + 0.1999355085f ) * s - 0.3333314528f ) * s ) + 1.0f ) * a + d; } // scale0 = sin( ( 1.0f - t ) * omega ) * sinom; { float a = ( 1.0f - t ) * omega; float s = a * a; scale0 = sinom * a * ( ( ( ( ( -2.39e-08f * s + 2.7526e-06f ) * s - 1.98409e-04f ) * s + 8.3333315e-03f ) * s - 1.666666664e-01f ) * s + 1.0f ); } // scale1 = sin( t * omega ) * sinom; { float a = t * omega; float s = a * a; scale1 = sinom * a * ( ( ( ( ( -2.39e-08f * s + 2.7526e-06f ) * s 
- 1.98409e-04f ) * s + 8.3333315e-03f ) * s - 1.666666664e-01f ) * s + 1.0f ); } } else { scale0 = 1.0f - t; scale1 = t; } scale1 = ( cosom >= 0.0f ) ? scale1 : -scale1; result.x = scale0 * from.x + scale1 * to.x; result.y = scale0 * from.y + scale1 * to.y; result.z = scale0 * from.z + scale1 * to.z; result.w = scale0 * from.w + scale1 * to.w; } /* ============ LerpUnoptimized ============ */ void LerpUnoptimized( const idQuat &from, const idQuat &to, float t, idQuat &result ) { float cosom, scale0, scale1, s; cosom = from.x * to.x + from.y * to.y + from.z * to.z + from.w * to.w; scale0 = 1.0f - t; scale1 = ( cosom >= 0.0f ) ? t : -t; result.x = scale0 * from.x + scale1 * to.x; result.y = scale0 * from.y + scale1 * to.y; result.z = scale0 * from.z + scale1 * to.z; result.w = scale0 * from.w + scale1 * to.w; s = 1.0f / sqrt( result.x * result.x + result.y * result.y + result.z * result.z + result.w * result.w ); result.x *= s; result.y *= s; result.z *= s; result.w *= s; } /* ============ idSIMD_Generic::BlendJoints ============ */ void VPCALL idSIMD_Generic::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) { int i; for ( i = 0; i < numJoints; i++ ) { int j = index[i]; joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp ); joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp ); } } /* ============ idSIMD_Generic::BlendJointsFast ============ */ void VPCALL idSIMD_Generic::BlendJointsFast( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) { int i; for ( i = 0; i < numJoints; i++ ) { int j = index[i]; joints[j].q.SlerpFast( joints[j].q, blendJoints[j].q, lerp ); joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp ); } } /* ============ idSIMD_Generic::ConvertJointQuatsToJointMats ============ */ void VPCALL idSIMD_Generic::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) { 
#if 1 compile_time_assert( (UINT_PTR)(&((idJointQuat *)0)->t) == (UINT_PTR)(&((idJointQuat *)0)->q) + (UINT_PTR)sizeof( ((idJointQuat *)0)->q ) ); for ( int i = 0; i < numJoints; i++ ) { const float *q = jointQuats[i].q.ToFloatPtr(); float *m = jointMats[i].ToFloatPtr(); m[0*4+3] = q[4]; m[1*4+3] = q[5]; m[2*4+3] = q[6]; float x2 = q[0] + q[0]; float y2 = q[1] + q[1]; float z2 = q[2] + q[2]; { float xx2 = q[0] * x2; float yy2 = q[1] * y2; float zz2 = q[2] * z2; m[0*4+0] = 1.0f - yy2 - zz2; m[1*4+1] = 1.0f - xx2 - zz2; m[2*4+2] = 1.0f - xx2 - yy2; } { float yz2 = q[1] * z2; float wx2 = q[3] * x2; m[2*4+1] = yz2 - wx2; m[1*4+2] = yz2 + wx2; } { float xy2 = q[0] * y2; float wz2 = q[3] * z2; m[1*4+0] = xy2 - wz2; m[0*4+1] = xy2 + wz2; } { float xz2 = q[0] * z2; float wy2 = q[3] * y2; m[0*4+2] = xz2 - wy2; m[2*4+0] = xz2 + wy2; } } #else int i; for ( i = 0; i < numJoints; i++ ) { jointMats[i].SetRotation( jointQuats[i].q.ToMat3() ); jointMats[i].SetTranslation( jointQuats[i].t ); } #endif } /* ============ idSIMD_Generic::ConvertJointMatsToJointQuats ============ */ void VPCALL idSIMD_Generic::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) { #if 1 compile_time_assert( (UINT_PTR)(&((idJointQuat *)0)->t) == (UINT_PTR)(&((idJointQuat *)0)->q) + (UINT_PTR)sizeof( ((idJointQuat *)0)->q ) ); for ( int i = 0; i < numJoints; i++ ) { float *q = jointQuats[i].q.ToFloatPtr(); const float *m = jointMats[i].ToFloatPtr(); if ( m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] > 0.0f ) { float t = + m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[3] = s * t; q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s; q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s; q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s; } else if ( m[0 * 4 + 0] > m[1 * 4 + 1] && m[0 * 4 + 0] > m[2 * 4 + 2] ) { float t = + m[0 * 4 + 0] - m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[0] = s * t; q[1] = ( m[0 * 
4 + 1] + m[1 * 4 + 0] ) * s; q[2] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s; q[3] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s; } else if ( m[1 * 4 + 1] > m[2 * 4 + 2] ) { float t = - m[0 * 4 + 0] + m[1 * 4 + 1] - m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[1] = s * t; q[0] = ( m[0 * 4 + 1] + m[1 * 4 + 0] ) * s; q[3] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s; q[2] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s; } else { float t = - m[0 * 4 + 0] - m[1 * 4 + 1] + m[2 * 4 + 2] + 1.0f; float s = idMath::InvSqrt( t ) * 0.5f; q[2] = s * t; q[3] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s; q[0] = ( m[2 * 4 + 0] + m[0 * 4 + 2] ) * s; q[1] = ( m[1 * 4 + 2] + m[2 * 4 + 1] ) * s; } q[4] = m[0 * 4 + 3]; q[5] = m[1 * 4 + 3]; q[6] = m[2 * 4 + 3]; q[7] = 0.0f; } #else int i; for ( i = 0; i < numJoints; i++ ) { jointQuats[i] = jointMats[i].ToJointQuat(); } #endif } /* ============ idSIMD_Generic::TransformJoints ============ */ void VPCALL idSIMD_Generic::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) { int i; for( i = firstJoint; i <= lastJoint; i++ ) { assert( parents[i] < i ); jointMats[i] *= jointMats[parents[i]]; } } /* ============ idSIMD_Generic::UntransformJoints ============ */ void VPCALL idSIMD_Generic::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) { int i; for( i = lastJoint; i >= firstJoint; i-- ) { assert( parents[i] < i ); jointMats[i] /= jointMats[parents[i]]; } } /* ============ idSIMD_Generic::MultiplyJoints ============ */ void VPCALL idSIMD_Generic::MultiplyJoints( idJointMat *result, const idJointMat *joints1, const idJointMat *joints2, const int numJoints ) { int i; for ( i = 0; i < numJoints; i++ ) { idJointMat::Multiply( result[i], joints1[i], joints2[i] ); } } /* ============ idSIMD_Generic::TransformVerts ============ */ void VPCALL idSIMD_Generic::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, 
const jointWeight_t *weights, int numWeights ) { int i, j; const byte *jointsPtr = (byte *)joints; for( j = 0, i = 0; i < numVerts; i++, j++ ) { idVec3 v; v = ( *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ) ) * base[j]; while( weights[j].nextVertexOffset != JOINTWEIGHT_SIZE ) { j++; v += ( *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ) ) * base[j]; } verts[i].xyz = v; } } /* ============ idSIMD_Generic::TransformShadowVerts ============ */ void VPCALL idSIMD_Generic::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const jointWeight_t *weights, const int numWeights ) { int i; const byte *jointsPtr = (byte *)joints; const byte *weightsPtr = (byte *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset ); weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset; mat.Mul( verts[i].xyz, base[i].xyz ); } } /* ============ idSIMD_Generic::TransformShadowVerts ============ */ void VPCALL idSIMD_Generic::TransformShadowVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int numWeights ) { int i; const byte *jointsPtr = (byte *)joints; const short *weightsPtr = (short *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + (*weightsPtr)); weightsPtr++; mat.Mul( verts[i].xyz, base[i].xyz ); } } /* ============ idSIMD_Generic::TransformShadowVerts ============ */ void VPCALL idSIMD_Generic::TransformShadowVerts( shadowCache_t *verts, const int numVerts, const idJointMat *joints, const idDrawVert *base, const short *weights, const int numWeights ) { int i; const byte *jointsPtr = (byte *)joints; const short *weightsPtr = (short *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + (*weightsPtr)); weightsPtr++; mat.Mul( verts[i].xyz.ToVec3(), 
base[i].xyz ); } } /* ============ idSIMD_Generic::TransformVertsAndTangents ============ */ void VPCALL idSIMD_Generic::TransformVertsAndTangents( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) { int i, j; const byte *jointsPtr = (byte *)joints; for( j = i = 0; i < numVerts; i++, j++ ) { idJointMat mat; idJointMat::Mul( mat, *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ), weights[j].weight ); while( weights[j].nextVertexOffset != JOINTWEIGHT_SIZE ) { j++; idJointMat::Mad( mat, *(idJointMat *) ( jointsPtr + weights[j].jointMatOffset ), weights[j].weight ); } verts[i].xyz = mat * base[i*3+0]; verts[i].SetNormal( mat * base[i*3+1] ); verts[i].SetTangent( mat * base[i*3+2] ); } } /* ============ idSIMD_Generic::TransformVertsAndTangentsFast ============ */ void VPCALL idSIMD_Generic::TransformVertsAndTangentsFast( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *base, const jointWeight_t *weights, const int numWeights ) { int i; const byte *jointsPtr = (byte *)joints; const byte *weightsPtr = (byte *)weights; for( i = 0; i < numVerts; i++ ) { const idJointMat &mat = *(idJointMat *) ( jointsPtr + ((jointWeight_t *)weightsPtr)->jointMatOffset ); weightsPtr += ((jointWeight_t *)weightsPtr)->nextVertexOffset; verts[i].xyz = mat * base[i*3+0]; verts[i].SetNormal( mat * base[i*3+1] ); verts[i].SetTangent( mat * base[i*3+2] ); } } #if SD_SUPPORT_UNSMOOTHEDTANGENTS /* ============ idSIMD_Generic::DeriveUnsmoothedTangents Derives the normal and orthogonal tangent vectors for the triangle vertices. For each vertex the normal and tangent vectors are derived from a single dominant triangle. 
============ */ #define DERIVE_UNSMOOTHED_BITANGENT void VPCALL idSIMD_Generic::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) { int i; for ( i = 0; i < numVerts; i++ ) { idDrawVert *a, *b, *c; #if defined( SD_USE_DRAWVERT_SIZE_32 ) idVec2 aST, bST, cST; #endif float d0, d1, d2, d3, d4; float d5, d6, d7, d8, d9; float s0, s1, s2; float n0, n1, n2; float t0, t1, t2; float t3, t4, t5; const dominantTri_s &dt = dominantTris[i]; a = verts + i; b = verts + dt.v2; c = verts + dt.v3; #if defined( SD_USE_DRAWVERT_SIZE_32 ) aST = a->GetST(); bST = b->GetST(); cST = c->GetST(); #endif d0 = b->xyz[0] - a->xyz[0]; d1 = b->xyz[1] - a->xyz[1]; d2 = b->xyz[2] - a->xyz[2]; #if defined( SD_USE_DRAWVERT_SIZE_32 ) d3 = bST[0] - aST[0]; d4 = bST[1] - aST[1]; #else d3 = b->_st[0] - a->_st[0]; d4 = b->_st[1] - a->_st[1]; #endif d5 = c->xyz[0] - a->xyz[0]; d6 = c->xyz[1] - a->xyz[1]; d7 = c->xyz[2] - a->xyz[2]; #if defined( SD_USE_DRAWVERT_SIZE_32 ) d8 = cST[0] - aST[0]; d9 = cST[1] - aST[1]; #else d8 = c->_st[0] - a->_st[0]; d9 = c->_st[1] - a->_st[1]; #endif s0 = dt.normalizationScale[0]; s1 = dt.normalizationScale[1]; s2 = dt.normalizationScale[2]; n0 = s2 * ( d6 * d2 - d7 * d1 ); n1 = s2 * ( d7 * d0 - d5 * d2 ); n2 = s2 * ( d5 * d1 - d6 * d0 ); t0 = s0 * ( d0 * d9 - d4 * d5 ); t1 = s0 * ( d1 * d9 - d4 * d6 ); t2 = s0 * ( d2 * d9 - d4 * d7 ); #ifndef DERIVE_UNSMOOTHED_BITANGENT t3 = s1 * ( d3 * d5 - d0 * d8 ); t4 = s1 * ( d3 * d6 - d1 * d8 ); t5 = s1 * ( d3 * d7 - d2 * d8 ); #else t3 = s1 * ( n2 * t1 - n1 * t2 ); t4 = s1 * ( n0 * t2 - n2 * t0 ); t5 = s1 * ( n1 * t0 - n0 * t1 ); #endif a->SetNormal( n0, n1, n2 ); a->SetTangent( t0, t1, t2 ); a->SetBiTangent( t3, t4, t5 ); } } #endif // SD_SUPPORT_UNSMOOTHEDTANGENTS /* ============ idSIMD_Generic::TracePointCull ============ */ void VPCALL idSIMD_Generic::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int 
numVerts ) { int i; byte tOr; tOr = 0; for ( i = 0; i < numVerts; i++ ) { byte bits; float d0, d1, d2, d3, t; const idVec3 &v = verts[i].xyz; d0 = planes[0].Distance( v ); d1 = planes[1].Distance( v ); d2 = planes[2].Distance( v ); d3 = planes[3].Distance( v ); t = d0 + radius; bits = FLOATSIGNBITSET( t ) << 0; t = d1 + radius; bits |= FLOATSIGNBITSET( t ) << 1; t = d2 + radius; bits |= FLOATSIGNBITSET( t ) << 2; t = d3 + radius; bits |= FLOATSIGNBITSET( t ) << 3; t = d0 - radius; bits |= FLOATSIGNBITSET( t ) << 4; t = d1 - radius; bits |= FLOATSIGNBITSET( t ) << 5; t = d2 - radius; bits |= FLOATSIGNBITSET( t ) << 6; t = d3 - radius; bits |= FLOATSIGNBITSET( t ) << 7; bits ^= 0x0F; // flip lower four bits tOr |= bits; cullBits[i] = bits; } totalOr = tOr; } /* ============ idSIMD_Generic::TracePointCull ============ */ void VPCALL idSIMD_Generic::TracePointCullShadowVerts( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const shadowCache_t *verts, const int numVerts ) { int i; byte tOr; tOr = 0; for ( i = 0; i < numVerts; i++ ) { byte bits; float d0, d1, d2, d3, t; const idVec3 &v = verts[i].xyz.ToVec3(); d0 = planes[0].Distance( v ); d1 = planes[1].Distance( v ); d2 = planes[2].Distance( v ); d3 = planes[3].Distance( v ); t = d0 + radius; bits = FLOATSIGNBITSET( t ) << 0; t = d1 + radius; bits |= FLOATSIGNBITSET( t ) << 1; t = d2 + radius; bits |= FLOATSIGNBITSET( t ) << 2; t = d3 + radius; bits |= FLOATSIGNBITSET( t ) << 3; t = d0 - radius; bits |= FLOATSIGNBITSET( t ) << 4; t = d1 - radius; bits |= FLOATSIGNBITSET( t ) << 5; t = d2 - radius; bits |= FLOATSIGNBITSET( t ) << 6; t = d3 - radius; bits |= FLOATSIGNBITSET( t ) << 7; bits ^= 0x0F; // flip lower four bits tOr |= bits; cullBits[i] = bits; } totalOr = tOr; } /* ============ idSIMD_Generic::DecalPointCull ============ */ void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) { int i; for ( i = 0; i < 
numVerts; i++ ) { byte bits; float d0, d1, d2, d3, d4, d5; const idVec3 &v = verts[i].xyz; d0 = planes[0].Distance( v ); d1 = planes[1].Distance( v ); d2 = planes[2].Distance( v ); d3 = planes[3].Distance( v ); d4 = planes[4].Distance( v ); d5 = planes[5].Distance( v ); bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 2; bits |= FLOATSIGNBITSET( d3 ) << 3; bits |= FLOATSIGNBITSET( d4 ) << 4; bits |= FLOATSIGNBITSET( d5 ) << 5; cullBits[i] = bits ^ 0x3F; // flip lower 6 bits } } void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts, int *indexes, int numIndexes ) { int i; for ( i = 0; i < numIndexes; i++ ) { byte bits; float d0, d1, d2, d3, d4, d5; int idx = indexes[i]; const idVec3 &v = verts[idx].xyz; d0 = planes[0].Distance( v ); d1 = planes[1].Distance( v ); d2 = planes[2].Distance( v ); d3 = planes[3].Distance( v ); d4 = planes[4].Distance( v ); d5 = planes[5].Distance( v ); bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 2; bits |= FLOATSIGNBITSET( d3 ) << 3; bits |= FLOATSIGNBITSET( d4 ) << 4; bits |= FLOATSIGNBITSET( d5 ) << 5; cullBits[idx] = bits ^ 0x3F; // flip lower 6 bits } } void VPCALL idSIMD_Generic::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts, unsigned short *indexes, int numIndexes ) { int i; for ( i = 0; i < numIndexes; i++ ) { byte bits; float d0, d1, d2, d3, d4, d5; int idx = indexes[i]; const idVec3 &v = verts[idx].xyz; d0 = planes[0].Distance( v ); d1 = planes[1].Distance( v ); d2 = planes[2].Distance( v ); d3 = planes[3].Distance( v ); d4 = planes[4].Distance( v ); d5 = planes[5].Distance( v ); bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; bits |= FLOATSIGNBITSET( d2 ) << 2; bits |= FLOATSIGNBITSET( d3 ) << 3; bits |= FLOATSIGNBITSET( d4 ) << 4; bits |= FLOATSIGNBITSET( d5 ) << 5; 
cullBits[idx] = bits ^ 0x3F; // flip lower 6 bits } } /* ============ idSIMD_Generic::OverlayPointCull ============ */ void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) { int i; for ( i = 0; i < numVerts; i++ ) { byte bits; float d0, d1; const idVec3 &v = verts[i].xyz; texCoords[i][0] = d0 = planes[0].Distance( v ); texCoords[i][1] = d1 = planes[1].Distance( v ); bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; d0 = 1.0f - d0; d1 = 1.0f - d1; bits |= FLOATSIGNBITSET( d0 ) << 2; bits |= FLOATSIGNBITSET( d1 ) << 3; cullBits[i] = bits; } } /* ============ idSIMD_Generic::OverlayPointCull ============ */ void VPCALL idSIMD_Generic::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const shadowCache_t *verts, const int numVerts ) { int i; for ( i = 0; i < numVerts; i++ ) { byte bits; float d0, d1; const idVec3 &v = verts[i].xyz.ToVec3(); texCoords[i][0] = d0 = planes[0].Distance( v ); texCoords[i][1] = d1 = planes[1].Distance( v ); bits = FLOATSIGNBITSET( d0 ) << 0; bits |= FLOATSIGNBITSET( d1 ) << 1; d0 = 1.0f - d0; d1 = 1.0f - d1; bits |= FLOATSIGNBITSET( d0 ) << 2; bits |= FLOATSIGNBITSET( d1 ) << 3; cullBits[i] = bits; } } /* ============ idSIMD_Generic::DeriveTriPlanes Derives a plane equation for each triangle. 
============
*/
// Consumes indexes three at a time and writes one plane per triangle to planes, in order.
// numVerts is not used by this implementation.
void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const vertIndex_t *indexes, const int numIndexes ) {
	idPlane *outPlane = planes;

	for ( int first = 0; first < numIndexes; first += 3, outPlane++ ) {
		const idDrawVert *a = verts + indexes[first + 0];
		const idDrawVert *b = verts + indexes[first + 1];
		const idDrawVert *c = verts + indexes[first + 2];

		// edge vectors a -> b and a -> c
		float ab[3], ac[3];
		for ( int axis = 0; axis < 3; axis++ ) {
			ab[axis] = b->xyz[axis] - a->xyz[axis];
		}
		for ( int axis = 0; axis < 3; axis++ ) {
			ac[axis] = c->xyz[axis] - a->xyz[axis];
		}

		// unnormalized normal: ac x ab
		idVec3 n;
		n[0] = ac[1] * ab[2] - ac[2] * ab[1];
		n[1] = ac[2] * ab[0] - ac[0] * ab[2];
		n[2] = ac[0] * ab[1] - ac[1] * ab[0];

		// a zero-area triangle is not special-cased, the reciprocal length is used as computed
#if defined( OPTIMIZED_TRI_PLANE_CODE )
		const float invLength = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
#else
		const float invLength = 1.0f / sqrt( n.x * n.x + n.y * n.y + n.z * n.z );
#endif

		n.x *= invLength;
		n.y *= invLength;
		n.z *= invLength;

		outPlane->SetNormal( n );
		outPlane->FitThroughPoint( a->xyz );
	}
}

/*
============
idSIMD_Generic::DeriveTriPlanes

	Derives a plane equation for each triangle.
============ */ void VPCALL idSIMD_Generic::DeriveTriPlanes( idPlane *planes, const shadowCache_t *verts, const int numVerts, const vertIndex_t *indexes, const int numIndexes ) { int i; for ( i = 0; i < numIndexes; i += 3 ) { const shadowCache_t *a, *b, *c; float d0[3], d1[3], f; idVec3 n; a = verts + indexes[i + 0]; b = verts + indexes[i + 1]; c = verts + indexes[i + 2]; d0[0] = b->xyz[0] - a->xyz[0]; d0[1] = b->xyz[1] - a->xyz[1]; d0[2] = b->xyz[2] - a->xyz[2]; d1[0] = c->xyz[0] - a->xyz[0]; d1[1] = c->xyz[1] - a->xyz[1]; d1[2] = c->xyz[2] - a->xyz[2]; n[0] = d1[1] * d0[2] - d1[2] * d0[1]; n[1] = d1[2] * d0[0] - d1[0] * d0[2]; n[2] = d1[0] * d0[1] - d1[1] * d0[0]; #if defined( OPTIMIZED_TRI_PLANE_CODE ) f = idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z ); #else f = 1.0f / sqrt( n.x * n.x + n.y * n.y + n.z * n.z ); #endif n.x *= f; n.y *= f; n.z *= f; planes->SetNormal( n ); planes->FitThroughPoint( a->xyz.ToVec3() ); planes++; } } /* ============ idSIMD_Generic::CalculateFacing ============ */ void VPCALL idSIMD_Generic::CalculateFacing( byte *facing, const idPlane *planes, const int numTriangles, const idVec4 &light ) { int i; for ( i = 0; i < numTriangles; i++ ) { facing[i] = planes[i][0] * light.x + planes[i][1] * light.y + planes[i][2] * light.z + planes[i][3] * light.w > 0.0f; } facing[numTriangles] = 1; // for dangling edges to reference } /* ============ idSIMD_Generic::CalculateCullBits ============ */ void VPCALL idSIMD_Generic::CalculateCullBits( byte *cullBits, const idDrawVert *verts, const int numVerts, const int frontBits, const idPlane lightPlanes[NUM_LIGHT_PLANES] ) { int i, j; assert( NUM_LIGHT_PLANES <= sizeof( cullBits[0] ) * 8 ); memset( cullBits, 0, numVerts * sizeof( cullBits[0] ) ); for ( i = 0; i < NUM_LIGHT_PLANES; i++ ) { // if completely infront of this clipping plane if ( frontBits & ( 1 << i ) ) { continue; } const idPlane &plane = lightPlanes[i]; for ( j = 0; j < numVerts; j++ ) { int bit = plane[0] * verts[j].xyz.x + plane[1] * 
verts[j].xyz.y + plane[2] * verts[j].xyz.z + plane[3] < 0.0f; cullBits[j] |= bit << i; } } } /* ============ idSIMD_Generic::CreateShadowCache ============ */ int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) { for ( int i = 0; i < numVerts; i++ ) { const float *v = verts[i].xyz.ToFloatPtr(); vertexCache[i*2+0][0] = v[0]; vertexCache[i*2+1][0] = v[0]; vertexCache[i*2+0][1] = v[1]; vertexCache[i*2+1][1] = v[1]; vertexCache[i*2+0][2] = v[2]; vertexCache[i*2+1][2] = v[2]; vertexCache[i*2+0][3] = 1.0f; vertexCache[i*2+1][3] = 0.0f; } return numVerts * 2; } /* ============ idSIMD_Generic::CreateShadowCache ============ */ int VPCALL idSIMD_Generic::CreateShadowCache( idVec4 *vertexCache, const struct shadowCache_s *verts, const int numVerts ) { for ( int i = 0; i < numVerts; i++ ) { const float *v = verts[i].xyz.ToFloatPtr(); vertexCache[i*2+0][0] = v[0]; vertexCache[i*2+1][0] = v[0]; vertexCache[i*2+0][1] = v[1]; vertexCache[i*2+1][1] = v[1]; vertexCache[i*2+0][2] = v[2]; vertexCache[i*2+1][2] = v[2]; vertexCache[i*2+0][3] = 1.0f; vertexCache[i*2+1][3] = 0.0f; } return numVerts * 2; } /* ============ idSIMD_Generic::ShadowVolume_CountFacing ============ */ int VPCALL idSIMD_Generic::ShadowVolume_CountFacing( const byte *facing, const int numFaces ) { #if defined( OPTIMIZED_SHADOW_VOLUME_CODE ) int i, n; n = 0; for ( i = 0; i < numFaces; i++ ) { n += facing[i]; } return n; #else int i, n; n = 0; for ( i = 0; i < numFaces; i++ ) { if ( facing[i] ) { n++; } } return n; #endif } /* ============ idSIMD_Generic::ShadowVolume_CountFacingCull ============ */ int VPCALL idSIMD_Generic::ShadowVolume_CountFacingCull( byte *facing, const int numFaces, const vertIndex_t *indexes, const byte *cull ) { #if defined( OPTIMIZED_SHADOW_VOLUME_CODE ) int i, n; n = 0; for ( i = 0; i < numFaces; i++ ) { int c = cull[indexes[0]] & cull[indexes[1]] & cull[indexes[2]]; facing[i] |= ( (-c) >> 31 ) & 1; n += facing[i]; indexes += 3; 
} return n; #else int i, n; n = 0; for ( i = 0; i < numFaces; i++ ) { if ( !facing[i] ) { int i1 = indexes[0]; int i2 = indexes[1]; int i3 = indexes[2]; if ( cull[i1] & cull[i2] & cull[i3] ) { facing[i] = 1; n++; } } else { n++; } indexes += 3; } return n; #endif } /* ============ idSIMD_Generic::ShadowVolume_CreateSilTriangles ============ */ int VPCALL idSIMD_Generic::ShadowVolume_CreateSilTriangles( vertIndex_t *shadowIndexes, const byte *facing, const silEdge_t *silEdges, const int numSilEdges ) { #if defined( OPTIMIZED_SHADOW_VOLUME_CODE ) int i; const silEdge_t *sil; vertIndex_t *si; si = shadowIndexes; for ( sil = silEdges, i = numSilEdges; i > 0; i--, sil++ ) { int f1 = facing[sil->p1]; int f2 = facing[sil->p2]; if ( !( f1 ^ f2 ) ) { continue; } int v1 = sil->v1; int v2 = sil->v2; // set the two triangle winding orders based on facing // without using a poorly-predictable branch si[0] = v1; si[1] = v2 ^ f2; si[2] = v2 ^ f1; si[3] = v1 ^ f2; si[4] = v2 ^ 1; si[5] = v1 ^ f1; si += 6; } return si - shadowIndexes; #else int i; const silEdge_t *sil; vertIndex_t *si; si = shadowIndexes; for ( sil = silEdges, i = numSilEdges; i > 0; i--, sil++ ) { byte f1 = facing[sil->p1]; byte f2 = facing[sil->p2]; if ( f1 != f2 ) { int v1 = sil->v1; int v2 = sil->v2; if ( f1 ) { si[0] = v1; si[1] = v2 + 1; si[2] = v2; si[3] = v1; si[4] = v1 + 1; si[5] = v2 + 1; } else { si[0] = v1; si[1] = v2; si[2] = v2 + 1; si[3] = v1 + 1; si[4] = v1; si[5] = v2 + 1; } si += 6; } } return si - shadowIndexes; #endif } /* ============ idSIMD_Generic::ShadowVolume_CreateSilTrianglesParallel ============ */ int VPCALL idSIMD_Generic::ShadowVolume_CreateSilTrianglesParallel( vertIndex_t *shadowIndexes, const byte *facing, const silEdge_t *silEdges, const int numSilEdges ) { #if defined( OPTIMIZED_SHADOW_VOLUME_CODE ) int i; const silEdge_t *sil; vertIndex_t *si; si = shadowIndexes; for ( sil = silEdges, i = numSilEdges; i > 0; i--, sil++ ) { int f1 = facing[sil->p1]; int f2 = facing[sil->p2]; if ( 
!( f1 ^ f2 ) ) { continue; } int v1 = sil->v1; int v2 = sil->v2; // set the triangle winding order based on facing // without using a poorly-predictable branch si[0] = v1; si[1] = ( v2 & -f1 ) + ( f1 ^ 1 ); si[2] = ( v2 & -f2 ) + ( f2 ^ 1 ); si += 3; } return si - shadowIndexes; #else int i; const silEdge_t *sil; vertIndex_t *si; si = shadowIndexes; for ( sil = silEdges, i = numSilEdges; i > 0; i--, sil++ ) { byte f1 = facing[sil->p1]; byte f2 = facing[sil->p2]; if ( f1 != f2 ) { int v1 = sil->v1; int v2 = sil->v2; if ( f1 ) { si[0] = v1; si[1] = 1; si[2] = v2; } else { si[0] = v1; si[1] = v2; si[2] = 1; } si += 3; } } return si - shadowIndexes; #endif } /* ============ idSIMD_Generic::ShadowVolume_CreateCapTriangles ============ */ int VPCALL idSIMD_Generic::ShadowVolume_CreateCapTriangles( vertIndex_t *shadowIndexes, const byte *facing, const vertIndex_t *indexes, const int numIndexes ) { int i, j; vertIndex_t *si; si = shadowIndexes; for ( i = 0, j = 0; i < numIndexes; i += 3, j++ ) { if ( facing[j] ) { continue; } int i0 = indexes[i+0] * 2; int i1 = indexes[i+1] * 2; int i2 = indexes[i+2] * 2; si[0] = i0; si[1] = i1; si[2] = i2; si[3] = i2 + 1; si[4] = i1 + 1; si[5] = i0 + 1; si += 6; } return si - shadowIndexes; } /* ============ idSIMD_Generic::ShadowVolume_CreateCapTrianglesParallel ============ */ int VPCALL idSIMD_Generic::ShadowVolume_CreateCapTrianglesParallel( vertIndex_t *shadowIndexes, const byte *facing, const vertIndex_t *indexes, const int numIndexes ) { int i, j; vertIndex_t *si; si = shadowIndexes; for ( i = 0, j = 0; i < numIndexes; i += 3, j++ ) { if ( facing[j] ) { continue; } si[0] = indexes[i+0] * 2; si[1] = indexes[i+1] * 2; si[2] = indexes[i+2] * 2; si += 3; } return si - shadowIndexes; } /* ============ idSIMD_Generic::UpSamplePCMTo44kHz Duplicate samples for 44kHz output. 
============
*/
// Converts 16 bit PCM to float, writing every input frame 4 times for 11025 Hz input,
// twice for 22050 Hz input and once for 44100 Hz input. Any numChannels other than 1
// is handled as interleaved two channel data. Other rates only trigger the assert.
void idSIMD_Generic::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
	const bool mono = ( numChannels == 1 );

	switch ( kHz ) {
		case 11025: {
			if ( mono ) {
				for ( int i = 0; i < numSamples; i++ ) {
					float *out = dest + i * 4;
					out[0] = out[1] = out[2] = out[3] = (float) src[i+0];
				}
			} else {
				// i advances one interleaved frame (two samples) at a time
				for ( int i = 0; i < numSamples; i += 2 ) {
					float *out = dest + i * 4;
					out[0] = out[2] = out[4] = out[6] = (float) src[i+0];
					out[1] = out[3] = out[5] = out[7] = (float) src[i+1];
				}
			}
			break;
		}
		case 22050: {
			if ( mono ) {
				for ( int i = 0; i < numSamples; i++ ) {
					float *out = dest + i * 2;
					out[0] = out[1] = (float) src[i+0];
				}
			} else {
				// i advances one interleaved frame (two samples) at a time
				for ( int i = 0; i < numSamples; i += 2 ) {
					float *out = dest + i * 2;
					out[0] = out[2] = (float) src[i+0];
					out[1] = out[3] = (float) src[i+1];
				}
			}
			break;
		}
		case 44100: {
			// already at the output rate, plain conversion for any channel count
			for ( int i = 0; i < numSamples; i++ ) {
				dest[i] = (float) src[i];
			}
			break;
		}
		default: {
			assert( 0 );
			break;
		}
	}
}

/*
============
idSIMD_Generic::UpSampleOGGTo44kHz

	Duplicate samples for 44kHz output.
============ */ void idSIMD_Generic::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) { if ( kHz == 11025 ) { if ( numChannels == 1 ) { for ( int i = 0; i < numSamples; i++ ) { dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f; } } else { for ( int i = 0; i < numSamples >> 1; i++ ) { dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f; dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f; } } } else if ( kHz == 22050 ) { if ( numChannels == 1 ) { for ( int i = 0; i < numSamples; i++ ) { dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f; } } else { for ( int i = 0; i < numSamples >> 1; i++ ) { dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f; dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f; } } } else if ( kHz == 44100 ) { if ( numChannels == 1 ) { for ( int i = 0; i < numSamples; i++ ) { dest[i*1+0] = ogg[0][i] * 32768.0f; } } else { for ( int i = 0; i < numSamples >> 1; i++ ) { dest[i*2+0] = ogg[0][i] * 32768.0f; dest[i*2+1] = ogg[1][i] * 32768.0f; } } } else { assert( 0 ); } } /* ============ idSIMD_Generic::MixSoundTwoSpeakerMono ============ */ void VPCALL idSIMD_Generic::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) { float sL = lastV[0]; float sR = lastV[1]; float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; assert( numSamples == MIXBUFFER_SAMPLES ); for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) { mixBuffer[j*2+0] += samples[j] * sL; mixBuffer[j*2+1] += samples[j] * sR; sL += incL; sR += incR; } } /* ============ idSIMD_Generic::MixSoundTwoSpeakerStereo ============ */ void VPCALL idSIMD_Generic::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) { float sL = lastV[0]; float 
sR = lastV[1]; float incL = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incR = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; assert( numSamples == MIXBUFFER_SAMPLES ); for( int j = 0; j < MIXBUFFER_SAMPLES; j++ ) { mixBuffer[j*2+0] += samples[j*2+0] * sL; mixBuffer[j*2+1] += samples[j*2+1] * sR; sL += incL; sR += incR; } } /* ============ idSIMD_Generic::MixSoundFourSpeakerMono ============ */ void VPCALL idSIMD_Generic::MixSoundFourSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) { float sL0 = lastV[0]; float sL1 = lastV[1]; float sL2 = lastV[4]; float sL3 = lastV[5]; float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; float incL2 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES; float incL3 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES; assert( numSamples == MIXBUFFER_SAMPLES ); for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) { mixBuffer[i*4+0] += samples[i] * sL0; mixBuffer[i*4+1] += samples[i] * sL1; mixBuffer[i*4+2] += samples[i] * sL2; mixBuffer[i*4+3] += samples[i] * sL3; sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; } } /* ============ idSIMD_Generic::MixSoundFourSpeakerStereo ============ */ void VPCALL idSIMD_Generic::MixSoundFourSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) { float sL0 = lastV[0]; float sL1 = lastV[1]; float sL2 = lastV[4]; float sL3 = lastV[5]; float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; float incL2 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES; float incL3 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES; assert( numSamples == MIXBUFFER_SAMPLES ); for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) { mixBuffer[i*4+0] += samples[i*2+0] * sL0; mixBuffer[i*4+1] += samples[i*2+1] * sL1; mixBuffer[i*4+2] += samples[i*2+0] * 
sL2; mixBuffer[i*4+3] += samples[i*2+1] * sL3; sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; } } /* ============ idSIMD_Generic::MixSoundSixSpeakerMono ============ */ void VPCALL idSIMD_Generic::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) { float sL0 = lastV[0]; float sL1 = lastV[1]; float sL2 = lastV[2]; float sL3 = lastV[3]; float sL4 = lastV[4]; float sL5 = lastV[5]; float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES; float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES; float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES; float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES; assert( numSamples == MIXBUFFER_SAMPLES ); for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) { mixBuffer[i*6+0] += samples[i] * sL0; mixBuffer[i*6+1] += samples[i] * sL1; mixBuffer[i*6+2] += samples[i] * sL2; mixBuffer[i*6+3] += samples[i] * sL3; mixBuffer[i*6+4] += samples[i] * sL4; mixBuffer[i*6+5] += samples[i] * sL5; sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; sL4 += incL4; sL5 += incL5; } } /* ============ idSIMD_Generic::MixSoundSixSpeakerStereo ============ */ void VPCALL idSIMD_Generic::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) { float sL0 = lastV[0]; float sL1 = lastV[1]; float sL2 = lastV[2]; float sL3 = lastV[3]; float sL4 = lastV[4]; float sL5 = lastV[5]; float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES; float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES; float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES; float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES; 
assert( numSamples == MIXBUFFER_SAMPLES ); for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) { mixBuffer[i*6+0] += samples[i*2+0] * sL0; mixBuffer[i*6+1] += samples[i*2+1] * sL1; mixBuffer[i*6+2] += samples[i*2+0] * sL2; mixBuffer[i*6+3] += samples[i*2+0] * sL3; mixBuffer[i*6+4] += samples[i*2+0] * sL4; mixBuffer[i*6+5] += samples[i*2+1] * sL5; sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; sL4 += incL4; sL5 += incL5; } } /* ============ idSIMD_Generic::MixSoundEightSpeakerMono ============ */ void VPCALL idSIMD_Generic::MixSoundEightSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[8], const float currentV[8] ) { float sL0 = lastV[0]; float sL1 = lastV[1]; float sL2 = lastV[2]; float sL3 = lastV[3]; float sL4 = lastV[4]; float sL5 = lastV[5]; float sL6 = lastV[6]; float sL7 = lastV[7]; float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES; float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES; float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES; float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES; float incL6 = ( currentV[6] - lastV[6] ) / MIXBUFFER_SAMPLES; float incL7 = ( currentV[7] - lastV[7] ) / MIXBUFFER_SAMPLES; assert( numSamples == MIXBUFFER_SAMPLES ); for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) { mixBuffer[i*8+0] += samples[i] * sL0; mixBuffer[i*8+1] += samples[i] * sL1; mixBuffer[i*8+2] += samples[i] * sL2; mixBuffer[i*8+3] += samples[i] * sL3; mixBuffer[i*8+4] += samples[i] * sL4; mixBuffer[i*8+5] += samples[i] * sL5; mixBuffer[i*8+6] += samples[i] * sL6; mixBuffer[i*8+7] += samples[i] * sL7; sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7; } } /* ============ idSIMD_Generic::MixSoundEightSpeakerStereo ============ */ void VPCALL idSIMD_Generic::MixSoundEightSpeakerStereo( float *mixBuffer, 
const float *samples, const int numSamples, const float lastV[8], const float currentV[8] ) { float sL0 = lastV[0]; float sL1 = lastV[1]; float sL2 = lastV[2]; float sL3 = lastV[3]; float sL4 = lastV[4]; float sL5 = lastV[5]; float sL6 = lastV[6]; float sL7 = lastV[7]; float incL0 = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES; float incL1 = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES; float incL2 = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES; float incL3 = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES; float incL4 = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES; float incL5 = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES; float incL6 = ( currentV[6] - lastV[6] ) / MIXBUFFER_SAMPLES; float incL7 = ( currentV[7] - lastV[7] ) / MIXBUFFER_SAMPLES; assert( numSamples == MIXBUFFER_SAMPLES ); for( int i = 0; i < MIXBUFFER_SAMPLES; i++ ) { mixBuffer[i*8+0] += samples[i*2+0] * sL0; mixBuffer[i*8+1] += samples[i*2+1] * sL1; mixBuffer[i*8+2] += samples[i*2+0] * sL2; mixBuffer[i*8+3] += samples[i*2+0] * sL3; mixBuffer[i*8+4] += samples[i*2+0] * sL4; mixBuffer[i*8+5] += samples[i*2+1] * sL5; mixBuffer[i*8+6] += samples[i*2+0] * sL6; mixBuffer[i*8+7] += samples[i*2+1] * sL7; sL0 += incL0; sL1 += incL1; sL2 += incL2; sL3 += incL3; sL4 += incL4; sL5 += incL5; sL6 += incL6; sL7 += incL7; } } /* ============ idSIMD_Generic::MixedSoundToSamples ============ */ void VPCALL idSIMD_Generic::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) { for ( int i = 0; i < numSamples; i++ ) { if ( mixBuffer[i] <= -32768.0f ) { samples[i] = -32768; } else if ( mixBuffer[i] >= 32767.0f ) { samples[i] = 32767; } else { samples[i] = (short) mixBuffer[i]; } } } #pragma warning( default : 4244 )