dhewm3/neo/idlib/math/Simd_AltiVec.cpp
dhewg 8023bc5d56 Always compile all SIMD code
Protect all SIMD implementations with the corresponding defines and
let the compiler decide if it supports the instructions.

Linux will still use Simd_Generic because CPU feature runtime
detection is missing.
2011-12-13 18:56:38 +01:00

11240 lines
364 KiB
C++

/*
===========================================================================
Doom 3 GPL Source Code
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
Doom 3 Source Code is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Doom 3 Source Code is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
===========================================================================
*/
#include "../precompiled.h"
#pragma hdrstop

#include "Simd_Generic.h"
#include "Simd_AltiVec.h"

#include <math.h>
#include <float.h>
#include <stdint.h>
// Doom3 SIMD Library version 0.5
// Patrick Flanagan (pflanagan@apple.com)
// Sanjay Patel (spatel@apple.com)
// Architecture & Performance Group, Apple Computer
//===============================================================
//
// AltiVec implementation of idSIMDProcessor
//
//===============================================================
#if defined(__GNUC__) && defined(__ALTIVEC__)
#ifdef PPC_INTRINSICS
// for square root estimate instruction
#include <ppc_intrinsics.h>
#endif
// Data struct sizes
#ifndef DRAWVERT_PADDED
	// 60 bytes, 15 floats at 4 bytes each
	#define DRAWVERT_OFFSET 15
#else
	// 64 bytes, 16 floats
	#define DRAWVERT_OFFSET 16
#endif
// 16 bytes each, 4 floats
#define PLANE_OFFSET 4
// 16 bytes each, 4 floats
#define IDVEC4_OFFSET 4

// Alignment tests.
// Cast through uintptr_t instead of unsigned int: a pointer does not fit in
// an unsigned int on 64-bit targets (warning/error under -Werror). Only the
// low four bits of the address matter for the 16-byte alignment test.
#define IS_16BYTE_ALIGNED( x ) ( ( (uintptr_t)&x & 0x0F ) == 0 )
#define NOT_16BYTE_ALIGNED( x ) ( ( (uintptr_t)&x & 0x0F ) != 0 )
// Aligned storing of floats. ADDR must be 16-byte aligned.
// Each macro is wrapped in do { } while ( 0 ) so it expands to exactly one
// statement; the original bare statement lists broke when used inside an
// unbraced if/else body.
#define ALIGNED_STORE2( ADDR, V0, V1 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
	} while ( 0 )

#define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
	} while ( 0 )

#define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
		vec_st( V3, 48, ADDR ); \
	} while ( 0 )

#define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
		vec_st( V3, 48, ADDR ); \
		vec_st( V4, 64, ADDR ); \
		vec_st( V5, 80, ADDR ); \
	} while ( 0 )

#define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
		vec_st( V3, 48, ADDR ); \
		vec_st( V4, 64, ADDR ); \
		vec_st( V5, 80, ADDR ); \
		vec_st( V6, 96, ADDR ); \
		vec_st( V7, 112, ADDR ); \
	} while ( 0 )
// Unaligned storing of floats. These assume that we can trash the input
// vectors (V0..Vn are permuted in place).
// Technique: load the aligned quadwords at both ends of the destination,
// right-rotate the data into store position with vec_perm, blend the edge
// vectors with vec_sel so bytes outside the destination range are preserved,
// then write back with vec_st (which ignores the low four bits of the
// effective address, so the 15/31/47/... offsets hit successive quadwords).
// Each macro is wrapped in do { } while ( 0 ) so it expands to exactly one
// statement; the original { } block plus the caller's semicolon broke
// unbraced if/else bodies.
#define UNALIGNED_STORE1( ADDR, V0 ) \
	do { \
		/* use store element */ \
		vector unsigned char ULStoreMacroPerm = vec_lvsr( 0, ADDR ); \
		V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
		vec_ste( V0, 0, ADDR ); \
		vec_ste( V0, 4, ADDR ); \
		vec_ste( V0, 8, ADDR ); \
		vec_ste( V0, 12, ADDR ); \
	} while ( 0 )

#define UNALIGNED_STORE2( ADDR, V0, V1 ) \
	do { \
		/* load up the values that are there now */ \
		vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
		vector float ULStoreMacro2 = vec_ld( 31, ADDR ); \
		/* generate permute vector and mask */ \
		vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
		vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
		/* right rotate input data */ \
		V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
		V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
		/* setup the output vectors */ \
		vector float ULStoreVal1, ULStoreVal2, ULStoreVal3; \
		ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
		ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
		ULStoreVal3 = vec_sel( V1, ULStoreMacro2, ULStoreMacroMask ); \
		/* store results */ \
		vec_st( ULStoreVal1, 0, ADDR ); \
		vec_st( ULStoreVal2, 15, ADDR ); \
		vec_st( ULStoreVal3, 31, ADDR ); \
	} while ( 0 )

#define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) \
	do { \
		/* load up the values that are there now */ \
		vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
		vector float ULStoreMacro2 = vec_ld( 47, ADDR ); \
		/* generate permute vector and mask */ \
		vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
		vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
		/* right rotate input data */ \
		V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
		V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
		V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
		/* setup the output vectors */ \
		vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4; \
		ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
		ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
		ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
		ULStoreVal4 = vec_sel( V2, ULStoreMacro2, ULStoreMacroMask ); \
		/* store results */ \
		vec_st( ULStoreVal1, 0, ADDR ); \
		vec_st( ULStoreVal2, 15, ADDR ); \
		vec_st( ULStoreVal3, 31, ADDR ); \
		vec_st( ULStoreVal4, 47, ADDR ); \
	} while ( 0 )

#define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
	do { \
		/* load up the values that are there now */ \
		vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
		vector float ULStoreMacro2 = vec_ld( 63, ADDR ); \
		/* generate permute vector and mask */ \
		vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
		vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
		/* right rotate input data */ \
		V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
		V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
		V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
		V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
		/* setup the output vectors */ \
		vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5; \
		ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
		ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
		ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
		ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
		ULStoreVal5 = vec_sel( V3, ULStoreMacro2, ULStoreMacroMask ); \
		/* store results */ \
		vec_st( ULStoreVal1, 0, ADDR ); \
		vec_st( ULStoreVal2, 15, ADDR ); \
		vec_st( ULStoreVal3, 31, ADDR ); \
		vec_st( ULStoreVal4, 47, ADDR ); \
		vec_st( ULStoreVal5, 63, ADDR ); \
	} while ( 0 )

#define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
	do { \
		/* load up the values that are there now */ \
		vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
		vector float ULStoreMacro2 = vec_ld( 95, ADDR ); \
		/* generate permute vector and mask */ \
		vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
		vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
		/* right rotate input data */ \
		V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
		V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
		V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
		V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
		V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
		V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
		/* setup the output vectors */ \
		vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
		ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
		ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
		ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
		ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
		ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
		ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
		ULStoreVal7 = vec_sel( V5, ULStoreMacro2, ULStoreMacroMask ); \
		/* store results */ \
		vec_st( ULStoreVal1, 0, ADDR ); \
		vec_st( ULStoreVal2, 15, ADDR ); \
		vec_st( ULStoreVal3, 31, ADDR ); \
		vec_st( ULStoreVal4, 47, ADDR ); \
		vec_st( ULStoreVal5, 63, ADDR ); \
		vec_st( ULStoreVal6, 79, ADDR ); \
		vec_st( ULStoreVal7, 95, ADDR ); \
	} while ( 0 )

#define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) \
	do { \
		/* load up the values that are there now */ \
		vector float ULStoreMacro1 = vec_ld( 0, ADDR ); \
		vector float ULStoreMacro2 = vec_ld( 143, ADDR ); \
		/* generate permute vector and mask */ \
		vector unsigned char ULStoreMacroPerm = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
		vector unsigned int ULStoreMacroMask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreMacroPerm ); \
		/* right rotate input data */ \
		V0 = vec_perm( V0, V0, ULStoreMacroPerm ); \
		V1 = vec_perm( V1, V1, ULStoreMacroPerm ); \
		V2 = vec_perm( V2, V2, ULStoreMacroPerm ); \
		V3 = vec_perm( V3, V3, ULStoreMacroPerm ); \
		V4 = vec_perm( V4, V4, ULStoreMacroPerm ); \
		V5 = vec_perm( V5, V5, ULStoreMacroPerm ); \
		V6 = vec_perm( V6, V6, ULStoreMacroPerm ); \
		V7 = vec_perm( V7, V7, ULStoreMacroPerm ); \
		V8 = vec_perm( V8, V8, ULStoreMacroPerm ); \
		/* setup the output vectors */ \
		vector float ULStoreVal1, ULStoreVal2, ULStoreVal3, ULStoreVal4, ULStoreVal5, ULStoreVal6, ULStoreVal7; \
		vector float ULStoreVal8, ULStoreVal9, ULStoreVal10; \
		ULStoreVal1 = vec_sel( ULStoreMacro1, V0, ULStoreMacroMask ); \
		ULStoreVal2 = vec_sel( V0, V1, ULStoreMacroMask ); \
		ULStoreVal3 = vec_sel( V1, V2, ULStoreMacroMask ); \
		ULStoreVal4 = vec_sel( V2, V3, ULStoreMacroMask ); \
		ULStoreVal5 = vec_sel( V3, V4, ULStoreMacroMask ); \
		ULStoreVal6 = vec_sel( V4, V5, ULStoreMacroMask ); \
		ULStoreVal7 = vec_sel( V5, V6, ULStoreMacroMask ); \
		ULStoreVal8 = vec_sel( V6, V7, ULStoreMacroMask ); \
		ULStoreVal9 = vec_sel( V7, V8, ULStoreMacroMask ); \
		ULStoreVal10 = vec_sel( V8, ULStoreMacro2, ULStoreMacroMask ); \
		/* store results */ \
		vec_st( ULStoreVal1, 0, ADDR ); \
		vec_st( ULStoreVal2, 15, ADDR ); \
		vec_st( ULStoreVal3, 31, ADDR ); \
		vec_st( ULStoreVal4, 47, ADDR ); \
		vec_st( ULStoreVal5, 63, ADDR ); \
		vec_st( ULStoreVal6, 79, ADDR ); \
		vec_st( ULStoreVal7, 95, ADDR ); \
		vec_st( ULStoreVal8, 111, ADDR ); \
		vec_st( ULStoreVal9, 127, ADDR ); \
		vec_st( ULStoreVal10, 143, ADDR ); \
	} while ( 0 )
/*
============
idSIMD_AltiVec::GetName
============
*/
const char *idSIMD_AltiVec::GetName( void ) const {
	// identifies this implementation to the SIMD dispatcher / console output
	static const char * const simdName = "AltiVec";
	return simdName;
}
/*
Helper Functions
*/
#if 0
// Prints the values of a vector, useful for debugging but
// should never be called in real code
// NOTE: the "%v" printf length modifiers are an Apple GCC extension for
// AltiVec vector types; they are not portable to other compilers/libcs,
// which is one reason this whole section is compiled out.
inline void debugPrintVector( vector float v, char *msg ) {
	printf("%s -- %vf\n", msg, v );
}

inline void debugPrintVector( vector unsigned int v, char *msg ) {
	printf("%s -- %vd\n", msg, v );
}

inline void debugPrintVector( vector bool int v, char *msg ) {
	printf("%s -- %vi\n", msg, v );
}

inline void debugPrintVector( vector unsigned char v, char *msg ) {
	printf("%s -- %vuc\n", msg, v );
}

inline void debugPrintVector( vector unsigned short v, char *msg ) {
	printf("%s -- %vs\n", msg, v );
}
#endif
/*
===============
Reciprocal
For each element in vector:
n = 1 / n
===============
*/
// Use Newton-Raphson to calculate reciprocal of a vector
// Per-element reciprocal: hardware estimate (vec_re, ~12 bits) refined with
// one Newton-Raphson step: r1 = r0 + r0 * ( 1 - v * r0 )
inline vector float Reciprocal( vector float v ) {
	vector float r0 = vec_re( v );
	vector float residual = vec_nmsub( r0, v, (vector float) (1.0) );
	return vec_madd( residual, r0, r0 );
}
/*
===============
ReciprocalSquareRoot
For each element in vector:
n = 1 / sqrt(n)
===============
*/
// Reciprocal square root estimate of a vector
// Per-element reciprocal square root: hardware estimate refined with one
// Newton-Raphson step: r1 = r0 + 0.5 * r0 * ( 1 - v * r0 * r0 )
inline vector float ReciprocalSquareRoot( vector float v ) {
	vector float zero = (vector float)(0);
	vector float oneHalf = (vector float)(0.5);
	vector float one = (vector float)(1.0);
	// clamp the input away from zero so the estimate cannot produce INF
	vector float r0 = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
	vector float r0Squared = vec_madd( r0, r0, zero );
	vector float halfR0 = vec_madd( r0, oneHalf, zero );
	return vec_madd( vec_nmsub( v, r0Squared, one ), halfR0, r0 );
}
/*
===============
Divide
For each element in vectors:
n = a / b
===============
*/
// Use reciprocal estimate and multiply to divide a vector
// AltiVec has no float divide instruction, so a / b is computed as
// a * ( 1 / b ) using the refined reciprocal estimate above.
inline vector float Divide( vector float a, vector float b ) {
	vector float bReciprocal = Reciprocal( b );
	return vec_madd( a, bReciprocal, (vector float)(0) );
}
/*
===============
loadSplatUnalignedScalar
For each element in vector:
n = s
===============
*/
// Loads a single (possibly unaligned) float and replicates it into every
// element of the returned vector.
inline vector float loadSplatUnalignedScalar( const float *s ) {
	// vec_lvsl yields the alignment permute map for s; its first four bytes
	// are exactly the byte indices of *s inside the loaded quadword
	vector unsigned char splatMap = vec_lvsl( 0, s );
	vector float v = vec_ld( 0, s );
	// splat those first four byte-indices across the whole permute map by
	// treating the map as a float vector and splatting element 0
	splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
	// the perm now replicates the float at *s into all four elements
	return vec_perm( v, v, splatMap );
}
/*
===============
VectorATan16
For each element in vector:
n = idMath::ATan16( x, y )
===============
*/
// calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
inline vector float VectorATan16( vector float x, vector float y ) {
	// compute both ratios up front; vec_sel below keeps, per lane, the one
	// with magnitude <= 1 so the polynomial stays inside its valid range
	vector float xDivY = Divide( x, y );
	vector float yDivX = Divide( y, x );
	vector float zeroVector = (vector float)(0);
	vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
	vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
	// sign of the selected ratio, used to pick +/- HALF_PI in the |y|>|x| case
	vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
	// s = a * a
	vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
	// do calculation for S: Horner evaluation of the atan minimax
	// polynomial in s (coefficients match idMath::ATan16)
	vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
	vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
	// get the regular S value: s = poly(a*a) * a
	vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
	// calculate what to return if y > x: +/- HALF_PI - s, sign chosen by vecCmp2
	vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
	vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
	vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
	// lanes with |y| > |x| get the plain polynomial result, others the shifted one
	return vec_sel( modRet, vecS, vecCmp );
}
/*
===============
VectorSin16
For each element in vector:
n = idMath::Sin16( v )
===============
*/
inline vector float VectorSin16( vector float v ) {
	vector float zero = (vector float)(0);
#if 0
	// load up half PI and use it to calculate the rest of the values. This is
	// sometimes cheaper than loading them from memory
	vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
	vector float PI = vec_add( halfPI, halfPI );
	vector float oneandhalfPI = vec_add( PI, halfPI );
	vector float twoPI = vec_add( oneandhalfPI, halfPI );
#else
	vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
	vector float PI = (vector float)(3.14159265358979323846f);
	vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
	vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);
#endif
	vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
	vector float vecMod;
	vector float vecResult;
	// fix the range if needbe: reduce v modulo TWO_PI into [0, TWO_PI)
	vecMod = vec_floor( Divide( v, twoPI ) );
	vecResult = vec_nmsub( vecMod, twoPI, v );
	// candidate remappings into the polynomial's valid range [-HALF_PI, HALF_PI]
	vector float vecPIminusA = vec_sub( PI, vecResult );
	vector float vecAminus2PI = vec_sub( vecResult, twoPI );
	vecCmp1 = vec_cmplt( vecResult, PI );
	vecCmp2 = vec_cmpgt( vecResult, halfPI );
	// these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
	vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );
	// we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
	// NOTE(review): the logical "not" here is built by xor-ing a bool vector
	// with (vector bool int)(1). If that literal splats integer 1 (rather
	// than the all-ones TRUE), the xor flips only bit 0 of each lane and the
	// later bitwise vec_sel mixes bits from both sources instead of selecting
	// whole lanes. Verify the compiler's bool-vector literal semantics here;
	// vec_nor( vecCmpN, vecCmpN ) would be the unambiguous complement.
	vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false
	// these are ones where a < PI and a > HALF_PI so we set a = PI - a
	vecCmp1 = vec_and( vecCmp1, vecCmp2 );
	vecCmp1 = vec_or( vecCmp1, vecCmp4 );
	// put the correct values into place
	vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
	vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );
	// calculate answer: Taylor-style minimax polynomial in a^2 (matches idMath::Sin16)
	vector float vecASquared = vec_madd( vecResult, vecResult, zero );
	vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
	vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
	vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
	vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
	vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
	return vec_madd( vecResult, vecEst, zero );
}
/*
===============
vecSplatWithRunTime
For each element in vector:
n = v(i)
===============
*/
// splats an element across a vector using a runtime variable
// splats an element across a vector using a runtime variable
inline vector float vecSplatWithRunTime( vector float v, int i ) {
	// vec_lvsl with offset i*4 (and a null base) yields the permute map
	// { 4i, 4i+1, ... } that rotates element i down into element 0
	vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
	v = vec_perm( v, v, rotate );
	// element i is now element 0; replicate it across all four lanes
	return vec_splat( v, 0 );
}
/*
===============
FastScalarInvSqrt
n = 1 / sqrt( f )
===============
*/
// Scalar 1 / sqrt( f ) using the PowerPC frsqrte estimate where available,
// falling back to idMath::InvSqrt otherwise.
inline float FastScalarInvSqrt( float f ) {
#ifdef PPC_INTRINSICS
	// adding FLT_MIN keeps an input of 0.0f from producing INF
	const float kSmallestFloat = FLT_MIN;
	// 5-bit hardware starting estimate for the reciprocal sqrt
	float e = __frsqrte ( f + kSmallestFloat );
	// two Newton-Raphson rounds; drop one if less precision is acceptable
	e += 0.5f * e * ( 1.0f - f * e * e );
	e += 0.5f * e * ( 1.0f - f * e * e );
	return e;
#else
	return idMath::InvSqrt( f );
#endif
}
/*
===============
FastScalarInvSqrt_x3
arg1 = 1 / sqrt( arg1 )
arg2 = 1 / sqrt( arg2 )
arg3 = 1 / sqrt( arg3 )
===============
*/
// Three independent reciprocal square roots, interleaved so the FPU
// pipeline stays full (see the x6 variant for the G5 rationale).
inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
#ifdef PPC_INTRINSICS
	const float kSmallestFloat = FLT_MIN;
	// 5-bit hardware starting estimates; +FLT_MIN guards against INF on 0.0f
	float e1 = __frsqrte ( *arg1 + kSmallestFloat );
	float e2 = __frsqrte ( *arg2 + kSmallestFloat );
	float e3 = __frsqrte ( *arg3 + kSmallestFloat );
	// first Newton-Raphson round
	e1 += 0.5f * e1 * ( 1.0f - *arg1 * e1 * e1 );
	e2 += 0.5f * e2 * ( 1.0f - *arg2 * e2 * e2 );
	e3 += 0.5f * e3 * ( 1.0f - *arg3 * e3 * e3 );
	// second Newton-Raphson round
	e1 += 0.5f * e1 * ( 1.0f - *arg1 * e1 * e1 );
	e2 += 0.5f * e2 * ( 1.0f - *arg2 * e2 * e2 );
	e3 += 0.5f * e3 * ( 1.0f - *arg3 * e3 * e3 );
	*arg1 = e1;
	*arg2 = e2;
	*arg3 = e3;
#else
	*arg1 = idMath::InvSqrt( *arg1 );
	*arg2 = idMath::InvSqrt( *arg2 );
	*arg3 = idMath::InvSqrt( *arg3 );
#endif
}
/*
===============
FastScalarInvSqrt_x6
arg1 = 1 / sqrt( arg1 )
arg2 = 1 / sqrt( arg2 )
arg3 = 1 / sqrt( arg3 )
arg4 = 1 / sqrt( arg4 )
arg5 = 1 / sqrt( arg5 )
arg6 = 1 / sqrt( arg6 )
On a G5, you've got 2 pipeline stages to fill. (2 FPU's with 6 stages each)
===============
*/
// Six independent reciprocal square roots, interleaved to fill both G5
// FPUs (2 FPUs x 6 pipeline stages).
inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
#ifdef PPC_INTRINSICS
	const float kSmallestFloat = FLT_MIN;
	// 5-bit hardware starting estimates; +FLT_MIN guards against INF on 0.0f
	float e1 = __frsqrte ( *arg1 + kSmallestFloat );
	float e2 = __frsqrte ( *arg2 + kSmallestFloat );
	float e3 = __frsqrte ( *arg3 + kSmallestFloat );
	float e4 = __frsqrte ( *arg4 + kSmallestFloat );
	float e5 = __frsqrte ( *arg5 + kSmallestFloat );
	float e6 = __frsqrte ( *arg6 + kSmallestFloat );
	// first Newton-Raphson round
	e1 += 0.5f * e1 * ( 1.0f - *arg1 * e1 * e1 );
	e2 += 0.5f * e2 * ( 1.0f - *arg2 * e2 * e2 );
	e3 += 0.5f * e3 * ( 1.0f - *arg3 * e3 * e3 );
	e4 += 0.5f * e4 * ( 1.0f - *arg4 * e4 * e4 );
	e5 += 0.5f * e5 * ( 1.0f - *arg5 * e5 * e5 );
	e6 += 0.5f * e6 * ( 1.0f - *arg6 * e6 * e6 );
	// second Newton-Raphson round
	e1 += 0.5f * e1 * ( 1.0f - *arg1 * e1 * e1 );
	e2 += 0.5f * e2 * ( 1.0f - *arg2 * e2 * e2 );
	e3 += 0.5f * e3 * ( 1.0f - *arg3 * e3 * e3 );
	e4 += 0.5f * e4 * ( 1.0f - *arg4 * e4 * e4 );
	e5 += 0.5f * e5 * ( 1.0f - *arg5 * e5 * e5 );
	e6 += 0.5f * e6 * ( 1.0f - *arg6 * e6 * e6 );
	*arg1 = e1;
	*arg2 = e2;
	*arg3 = e3;
	*arg4 = e4;
	*arg5 = e5;
	*arg6 = e6;
#else
	*arg1 = idMath::InvSqrt( *arg1 );
	*arg2 = idMath::InvSqrt( *arg2 );
	*arg3 = idMath::InvSqrt( *arg3 );
	*arg4 = idMath::InvSqrt( *arg4 );
	*arg5 = idMath::InvSqrt( *arg5 );
	*arg6 = idMath::InvSqrt( *arg6 );
#endif
}
// End Helper Functions
#ifdef ENABLE_SIMPLE_MATH
/*
============
idSIMD_AltiVec::Add
dst[i] = constant + src[i];
============
*/
void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
	vector float v0, v1, v2, v3;
	vector float v0_low, v0_hi, v1_hi;
	vector unsigned char permVec;
	vector float constVec;
	int i;
	// handle unaligned cases at beginning: scalar loop until dst is 16-byte
	// aligned so the vector stores below can be aligned stores
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant + src[i];
	}
	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &constant );
	//calculate permute and do first load
	// src may still be unaligned: vec_ld ignores the low 4 bits of the
	// address, so the loads at offsets 0/15/31 fetch the aligned quadwords
	// covering src[i..i+7] and permVec (from vec_lvsl) rotates them into place
	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
	v1_hi = vec_ld( 0, &src[i] );
	//vectorize! 8 floats (two quadwords) per iteration
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadword as the new leading one
		v0_low = v1_hi;
		v0_hi = vec_ld( 15, &src[i] );
		v1_hi = vec_ld( 31, &src[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec );
		v1 = vec_perm( v0_hi, v1_hi, permVec );
		v2 = vec_add( v0, constVec );
		v3 = vec_add( v1, constVec );
		// store results (dst is aligned here)
		ALIGNED_STORE2( &dst[i], v2, v3 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = constant + src[i];
	}
}
/*
============
idSIMD_AltiVec::Add
dst[i] = src0[i] + src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
	register vector float v0, v1, v2, v3, v4, v5;
	//src0
	register vector float v0_low, v0_hi, v2_low, v2_hi;
	//src1
	register vector float v1_low, v1_hi, v3_low, v3_hi;
	//permute vectors
	register vector unsigned char permVec1, permVec2;
	vector unsigned char oneCharVector = (vector unsigned char)(1);
	int i;
	//unaligned at start: scalar loop until dst is 16-byte aligned
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src0[i] + src1[i];
	}
	//calculate permute and do loads
	// each source may still be unaligned: vec_ld ignores the low 4 bits of
	// the address, so loads at offsets 0/15/31 fetch the aligned quadwords
	// covering the 8 floats and the vec_lvsl permutes rotate them into place
	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
	v2_hi = vec_ld( 0, &src0[i] );
	v3_hi = vec_ld( 0, &src1[i] );
	//vectorize! 8 floats per iteration
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadwords
		v0_low = v2_hi;
		v0_hi = vec_ld( 15, &src0[i] );
		v2_low = v0_hi;
		v2_hi = vec_ld( 31, &src0[i] );
		v1_low = v3_hi;
		v1_hi = vec_ld( 15, &src1[i] );
		v3_low = v1_hi;
		v3_hi = vec_ld( 31, &src1[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec1 );
		v1 = vec_perm( v1_low, v1_hi, permVec2 );
		v2 = vec_perm( v2_low, v2_hi, permVec1 );
		v3 = vec_perm( v3_low, v3_hi, permVec2 );
		v4 = vec_add( v0, v1 );
		v5 = vec_add( v2, v3 );
		ALIGNED_STORE2( &dst[i], v4, v5 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = src0[i] + src1[i];
	}
}
/*
============
idSIMD_AltiVec::Sub
dst[i] = constant - src[i];
============
*/
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
	register vector float v0, v1, v2, v3;
	register vector float v0_low, v0_hi, v1_low, v1_hi;
	register vector unsigned char permVec;
	register vector float constVec;
	vector unsigned char oneCharVector = (vector unsigned char)(1);
	int i;
	//handle unaligned at start: scalar loop until dst is 16-byte aligned
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant - src[i];
	}
	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &constant );
	//calculate permute vector and do first load
	// vec_ld rounds the address down to 16 bytes; the 0/15/31 offsets plus
	// the vec_lvsl permute realign a possibly-unaligned src stream
	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
	v1_hi = vec_ld( 0, &src[i] );
	//vectorize! 8 floats per iteration
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadword
		v0_low = v1_hi;
		v0_hi = vec_ld( 15, &src[i] );
		v1_low = v0_hi;
		v1_hi = vec_ld( 31, &src[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec );
		v1 = vec_perm( v1_low, v1_hi, permVec );
		v2 = vec_sub( constVec, v0 );
		v3 = vec_sub( constVec, v1 );
		ALIGNED_STORE2( &dst[i], v2, v3 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = constant - src[i];
	}
}
/*
============
idSIMD_AltiVec::Sub
dst[i] = src0[i] - src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
	register vector float v0, v1, v2, v3, v4, v5;
	//src0
	register vector float v0_low, v0_hi, v2_low, v2_hi;
	//src1
	register vector float v1_low, v1_hi, v3_low, v3_hi;
	register vector unsigned char permVec1, permVec2;
	vector unsigned char oneCharVector = (vector unsigned char)(1);
	int i;
	//handle unaligned at start: scalar loop until dst is 16-byte aligned
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src0[i] - src1[i];
	}
	//calculate permute and do first loads
	// vec_ld rounds addresses down to 16 bytes; 0/15/31 offsets plus the
	// vec_lvsl permutes realign the possibly-unaligned source streams
	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
	v2_hi = vec_ld( 0, &src0[i] );
	v3_hi = vec_ld( 0, &src1[i] );
	//vectorize! 8 floats per iteration
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadwords
		v0_low = v2_hi;
		v0_hi = vec_ld( 15, &src0[i] );
		v2_low = v0_hi;
		v2_hi = vec_ld( 31, &src0[i] );
		v1_low = v3_hi;
		v1_hi = vec_ld( 15, &src1[i] );
		v3_low = v1_hi;
		v3_hi = vec_ld( 31, &src1[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec1 );
		v1 = vec_perm( v1_low, v1_hi, permVec2 );
		v2 = vec_perm( v2_low, v2_hi, permVec1 );
		v3 = vec_perm( v3_low, v3_hi, permVec2 );
		v4 = vec_sub( v0, v1 );
		v5 = vec_sub( v2, v3 );
		ALIGNED_STORE2( &dst[i], v4, v5 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = src0[i] - src1[i];
	}
}
/*
============
idSIMD_AltiVec::Mul
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
	register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
	register vector float constVec;
	register vector unsigned char permVec;
	vector unsigned char oneCharVector = (vector unsigned char)(1);
	// AltiVec has no plain multiply, so multiplies are vec_madd with a zero addend
	register vector float zeroVector = (vector float)(0.0);
	int i;
	// handle unaligned data at start: scalar loop until dst is 16-byte aligned
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
		dst[i] = constant * src[i];
	}
	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &constant );
	// vec_ld rounds the address down to 16 bytes; the 0/15/31 offsets plus
	// the vec_lvsl permute realign a possibly-unaligned src stream
	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
	v1_hi = vec_ld( 0, &src[i] );
	//vectorize! 8 floats per iteration
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadword
		v0_low = v1_hi;
		v0_hi = vec_ld( 15, &src[i] );
		v1_low = v0_hi;
		v1_hi = vec_ld( 31, &src[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec );
		v1 = vec_perm( v1_low, v1_hi, permVec );
		v2 = vec_madd( constVec, v0, zeroVector );
		v3 = vec_madd( constVec, v1, zeroVector );
		ALIGNED_STORE2( &dst[i], v2, v3 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = constant * src[i];
	}
}
/*
============
idSIMD_AltiVec::Mul
dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
	register vector float v0, v1, v2, v3, v4, v5;
	//src0
	register vector float v0_low, v0_hi, v2_low, v2_hi;
	//src1
	register vector float v1_low, v1_hi, v3_low, v3_hi;
	//permute vectors
	register vector unsigned char permVec1, permVec2;
	// zero addend for vec_madd (AltiVec has no plain multiply)
	register vector float constVec = (vector float)(0.0);
	vector unsigned char oneCharVector = (vector unsigned char)(1);
	int i;
	//handle unaligned at start: scalar loop until dst is 16-byte aligned
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
		dst[i] = src0[i] * src1[i];
	}
	//calculate permute and do loads
	// vec_ld rounds addresses down to 16 bytes; 0/15/31 offsets plus the
	// vec_lvsl permutes realign the possibly-unaligned source streams
	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
	v2_hi = vec_ld( 0, &src0[i] );
	v3_hi = vec_ld( 0, &src1[i] );
	//vectorize! 8 floats per iteration
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadwords
		v0_low = v2_hi;
		v0_hi = vec_ld( 15, &src0[i] );
		v2_low = v0_hi;
		v2_hi = vec_ld( 31, &src0[i] );
		v1_low = v3_hi;
		v1_hi = vec_ld( 15, &src1[i] );
		v3_low = v1_hi;
		v3_hi = vec_ld( 31, &src1[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec1 );
		v1 = vec_perm( v1_low, v1_hi, permVec2 );
		v2 = vec_perm( v2_low, v2_hi, permVec1 );
		v3 = vec_perm( v3_low, v3_hi, permVec2 );
		//no such thing as regular multiply so we do
		//multiply then add zero
		v4 = vec_madd( v0, v1, constVec );
		v5 = vec_madd( v2, v3, constVec );
		ALIGNED_STORE2( &dst[i], v4, v5 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = src0[i] * src1[i];
	}
}
/*
============
idSIMD_AltiVec::Div
dst[i] = constant / divisor[i];
============
*/
void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
	register vector float v0, v1, v2, v3;
	register vector float v0_low, v0_hi, v1_low, v1_hi;
	register vector unsigned char permVec;
	register vector float constVec;
	vector unsigned char oneCharVector = (vector unsigned char)(1);
	int i;
	//handle unaligned at start: scalar loop until dst is 16-byte aligned
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
		dst[i] = constant / divisor[i];
	}
	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &constant );
	//calculate permute and do first loads
	// vec_ld rounds the address down to 16 bytes; the 0/15/31 offsets plus
	// the vec_lvsl permute realign a possibly-unaligned divisor stream
	permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
	v1_hi = vec_ld( 0, &divisor[i] );
	//vectorize! 8 floats per iteration; Divide() uses the refined
	//reciprocal estimate, so results may differ from scalar '/' in the
	//last ulp
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadword
		v0_low = v1_hi;
		v0_hi = vec_ld( 15, &divisor[i] );
		v1_low = v0_hi;
		v1_hi = vec_ld( 31, &divisor[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec );
		v1 = vec_perm( v1_low, v1_hi, permVec );
		v2 = Divide( constVec, v0 );
		v3 = Divide( constVec, v1 );
		ALIGNED_STORE2( &dst[i], v2, v3 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = constant / divisor[i];
	}
}
/*
============
idSIMD_AltiVec::Div
dst[i] = src0[i] / src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
	register vector float v0, v1, v2, v3, v4, v5;
	//src0
	register vector float v0_low, v0_hi, v2_low, v2_hi;
	//src1
	register vector float v1_low, v1_hi, v3_low, v3_hi;
	//permute vectors
	register vector unsigned char permVec1, permVec2;
	vector unsigned char oneCharVector = (vector unsigned char)(1);
	int i;
	//handle unaligned at start: scalar loop until dst is 16-byte aligned
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
		dst[i] = src0[i] / src1[i];
	}
	//calculate permute and do loads
	// vec_ld rounds addresses down to 16 bytes; 0/15/31 offsets plus the
	// vec_lvsl permutes realign the possibly-unaligned source streams
	permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
	permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
	v2_hi = vec_ld( 0, &src0[i] );
	v3_hi = vec_ld( 0, &src1[i] );
	//vectorize! 8 floats per iteration; Divide() uses the refined
	//reciprocal estimate, so results may differ from scalar '/' in the
	//last ulp
	for ( ; i+7 < count; i += 8 ) {
		//load source; reuse last iteration's trailing quadwords
		v0_low = v2_hi;
		v0_hi = vec_ld( 15, &src0[i] );
		v2_low = v0_hi;
		v2_hi = vec_ld( 31, &src0[i] );
		v1_low = v3_hi;
		v1_hi = vec_ld( 15, &src1[i] );
		v3_low = v1_hi;
		v3_hi = vec_ld( 31, &src1[i] );
		v0 = vec_perm( v0_low, v0_hi, permVec1 );
		v1 = vec_perm( v1_low, v1_hi, permVec2 );
		v2 = vec_perm( v2_low, v2_hi, permVec1 );
		v3 = vec_perm( v3_low, v3_hi, permVec2 );
		v4 = Divide( v0, v1 );
		v5 = Divide( v2, v3 );
		ALIGNED_STORE2( &dst[i], v4, v5 );
	}
	//handle cleanup: remaining 0-7 elements
	for ( ; i < count ; i++ ) {
		dst[i] = src0[i] / src1[i];
	}
}
/*
============
idSIMD_AltiVec::MulAdd
dst[i] += constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	// Computes dst[i] += constant * src[i] for count elements. dst is forced
	// to 16-byte alignment by a scalar prologue; src may be unaligned and is
	// read through the lvsl/vec_perm streaming-load idiom.
	register vector float srcVec1, srcVec2, dstVec1, dstVec2, res1, res2;
	register vector float scaleVec;
	register vector float prevLd, currLd;
	register vector unsigned char alignPerm;
	vector unsigned char vecOnes = (vector unsigned char)(1);
	int idx;

	// scalar prologue until dst reaches a 16-byte boundary
	for ( idx = 0; NOT_16BYTE_ALIGNED( dst[idx] ) && ( idx < count ); idx++ ) {
		dst[idx] += constant * src[idx];
	}

	// broadcast the scale factor into all four lanes
	scaleVec = loadSplatUnalignedScalar( &constant );

	// permute mask for the (possibly unaligned) src stream; prime the pipeline
	alignPerm = vec_add( vec_lvsl( -1, (int*) &src[idx] ), vecOnes );
	prevLd = vec_ld( 0, &src[idx] );

	// vector loop: 8 elements per pass
	for ( ; idx+7 < count; idx += 8 ) {
		currLd = vec_ld( 15, &src[idx] );
		srcVec1 = vec_perm( prevLd, currLd, alignPerm );
		prevLd = vec_ld( 31, &src[idx] );
		srcVec2 = vec_perm( currLd, prevLd, alignPerm );
		// dst is aligned past the prologue, so plain vec_ld is safe
		dstVec1 = vec_ld( 0, &dst[idx] );
		dstVec2 = vec_ld( 16, &dst[idx] );
		res1 = vec_madd( scaleVec, srcVec1, dstVec1 );
		res2 = vec_madd( scaleVec, srcVec2, dstVec2 );
		ALIGNED_STORE2( &dst[idx], res1, res2 );
	}

	// scalar epilogue for the remainder
	for ( ; idx < count ; idx++ ) {
		dst[idx] += constant * src[idx];
	}
}
/*
============
idSIMD_AltiVec::MulAdd
dst[i] += src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
// Computes dst[i] += src0[i] * src1[i]. The scalar prologue advances i until
// dst[i] is 16-byte aligned; src0/src1 may be unaligned and are read with
// the lvsl/vec_perm streaming-load idiom (one carry chain per source).
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
//src0
register vector float v0_low, v0_hi, v2_low, v2_hi;
//src1
register vector float v1_low, v1_hi, v3_low, v3_hi;
//permute vectors
register vector unsigned char permVec1, permVec2;
vector unsigned char oneCharVector = (vector unsigned char)(1);
int i;
//handle unaligned dst elements at start with scalar code
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] += src0[i] * src1[i];
}
//calculate a permute mask per source stream and prime the first loads
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
v2_hi = vec_ld( 0, &src0[i] );
v3_hi = vec_ld( 0, &src1[i] );
//vectorize! 8 elements (two vector registers) per iteration
for ( ; i+7 < count; i += 8 ) {
// load sources; each stream reuses the trailing load of the previous
// iteration (v2_hi / v3_hi) as its leading vector
v0_low = v2_hi;
v0_hi = vec_ld( 15, &src0[i] );
v2_low = v0_hi;
v2_hi = vec_ld( 31, &src0[i] );
v1_low = v3_hi;
v1_hi = vec_ld( 15, &src1[i] );
v3_low = v1_hi;
v3_hi = vec_ld( 31, &src1[i] );
// shift the raw loads into element alignment
v0 = vec_perm( v0_low, v0_hi, permVec1 );
v1 = vec_perm( v1_low, v1_hi, permVec2 );
v2 = vec_perm( v2_low, v2_hi, permVec1 );
v3 = vec_perm( v3_low, v3_hi, permVec2 );
//we know dst is aligned because we handled unaligned cases
//up front, so plain vec_ld is safe here
v4 = vec_ld( 0, &dst[i] );
v5 = vec_ld( 16, &dst[i] );
// fused multiply-add: v6 = v0*v1 + v4
v6 = vec_madd( v0, v1, v4 );
v7 = vec_madd( v2, v3, v5 );
ALIGNED_STORE2( &dst[i], v6, v7 );
}
//handle leftover elements with scalar code
for ( ; i < count ; i++ ) {
dst[i] += src0[i] * src1[i];
}
}
/*
============
idSIMD_AltiVec::MulSub
dst[i] -= constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
	// Computes dst[i] -= constant * src[i] for count elements. dst is forced
	// to 16-byte alignment by a scalar prologue; src may be unaligned and is
	// read through the lvsl/vec_perm streaming-load idiom.
	register vector float srcVec1, srcVec2, dstVec1, dstVec2, res1, res2;
	register vector float scaleVec;
	register vector float prevLd, currLd;
	register vector unsigned char alignPerm;
	vector unsigned char vecOnes = (vector unsigned char)(1);
	int idx;

	// scalar prologue until dst reaches a 16-byte boundary
	for ( idx = 0; NOT_16BYTE_ALIGNED( dst[idx] ) && ( idx < count ); idx++ ) {
		dst[idx] -= constant * src[idx];
	}

	// broadcast the scale factor into all four lanes
	scaleVec = loadSplatUnalignedScalar( &constant );

	// permute mask for the (possibly unaligned) src stream; prime the pipeline
	alignPerm = vec_add( vec_lvsl( -1, (int*) &src[idx] ), vecOnes );
	prevLd = vec_ld( 0, &src[idx] );

	// vector loop: 8 elements per pass
	for ( ; idx+7 < count; idx += 8 ) {
		currLd = vec_ld( 15, &src[idx] );
		srcVec1 = vec_perm( prevLd, currLd, alignPerm );
		prevLd = vec_ld( 31, &src[idx] );
		srcVec2 = vec_perm( currLd, prevLd, alignPerm );
		// dst is aligned past the prologue, so plain vec_ld is safe
		dstVec1 = vec_ld( 0, &dst[idx] );
		dstVec2 = vec_ld( 16, &dst[idx] );
		// vec_nmsub( a, b, c ) computes c - a*b
		res1 = vec_nmsub( srcVec1, scaleVec, dstVec1 );
		res2 = vec_nmsub( srcVec2, scaleVec, dstVec2 );
		ALIGNED_STORE2( &dst[idx], res1, res2 );
	}

	// scalar epilogue for the remainder
	for ( ; idx < count ; idx++ ) {
		dst[idx] -= constant * src[idx];
	}
}
/*
============
idSIMD_AltiVec::MulSub
dst[i] -= src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
// Computes dst[i] -= src0[i] * src1[i]. The scalar prologue advances i until
// dst[i] is 16-byte aligned; src0/src1 may be unaligned and are read with
// the lvsl/vec_perm streaming-load idiom (one carry chain per source).
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
//src0
register vector float v0_low, v0_hi, v2_low, v2_hi;
//src1
register vector float v1_low, v1_hi, v3_low, v3_hi;
//permute vectors
register vector unsigned char permVec1, permVec2;
vector unsigned char oneCharVector = (vector unsigned char)(1);
int i;
//handle unaligned dst elements at start with scalar code
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] -= src0[i] * src1[i];
}
//calculate a permute mask per source stream and prime the first loads
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
v2_hi = vec_ld( 0, &src0[i] );
v3_hi = vec_ld( 0, &src1[i] );
//vectorize! 8 elements (two vector registers) per iteration
for ( ; i+7 < count; i += 8 ) {
// load sources; each stream reuses the trailing load of the previous
// iteration (v2_hi / v3_hi) as its leading vector
v0_low = v2_hi;
v0_hi = vec_ld( 15, &src0[i] );
v2_low = v0_hi;
v2_hi = vec_ld( 31, &src0[i] );
v1_low = v3_hi;
v1_hi = vec_ld( 15, &src1[i] );
v3_low = v1_hi;
v3_hi = vec_ld( 31, &src1[i] );
// shift the raw loads into element alignment
v0 = vec_perm( v0_low, v0_hi, permVec1 );
v1 = vec_perm( v1_low, v1_hi, permVec2 );
v2 = vec_perm( v2_low, v2_hi, permVec1 );
v3 = vec_perm( v3_low, v3_hi, permVec2 );
//we know dst is aligned because we handled unaligned cases
//up front, so plain vec_ld is safe here
v4 = vec_ld( 0, &dst[i] );
v5 = vec_ld( 16, &dst[i] );
// vec_nmsub( a, b, c ) computes c - a*b
v6 = vec_nmsub( v0, v1, v4 );
v7 = vec_nmsub( v2, v3, v5 );
ALIGNED_STORE2( &dst[i], v6, v7 );
}
//handle leftover elements with scalar code
for ( ; i < count ; i++ ) {
dst[i] -= src0[i] * src1[i];
}
}
#endif /* ENABLE_SIMPLE_MATH */
#ifdef ENABLE_DOT
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant * src[i];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	// Computes dst[i] = constant * src[i] (3-component dot product) for
	// count elements. dst is forced to 16-byte alignment by a scalar
	// prologue; the packed 12-byte idVec3 stream is read with the
	// lvsl/vec_perm streaming-load idiom and transposed into x/y/z vectors.
	register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
	register vector float vecX, vecY, vecZ;
	vector float vecX2, vecY2, vecZ2;
	const float *addr = src[0].ToFloatPtr();
	float tempVal[4];
	float constVal[4];
	register vector float zeroVector = (vector float)(0.0);
	register vector float vecConstX, vecConstY, vecConstZ;
	// masks that gather x, y and z from the 3-float-stride idVec3 data
	register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
	register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
	register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
	register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
	register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
	register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
	int i;

	// scalar copy of the constant for the cleanup loop
	constVal[0] = constant[0];
	constVal[1] = constant[1];
	constVal[2] = constant[2];
	constVal[3] = 0;

	// load the (possibly unaligned) 12-byte constant and splat x/y/z;
	// vec_ld( 0, p ) and vec_ld( 11, p ) together cover all 12 bytes
	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
	vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
	vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
	vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
	vecConstX = vec_splat( vecLd1, 0 );
	vecConstY = vec_splat( vecLd1, 1 );
	vecConstZ = vec_splat( vecLd1, 2 );

	// handle unaligned dst elements at the beginning
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant * src[i];
	}

	// BUGFIX: compute the permute mask and prime the streaming load at the
	// first element the vector loop actually reads ( addr + i*3 ), not at
	// src[0]. The two addresses differ whenever dst was unaligned at entry
	// (i > 0 here), which previously made the loop permute/read the wrong
	// data. The mask stays valid across iterations because each pass
	// advances 8 * 3 * sizeof( float ) = 96 bytes, a multiple of 16.
	vector unsigned char permVec = vec_add( vec_lvsl( -1, addr + (i*3) ), (vector unsigned char)(1) );
	vector float vecOld = vec_ld( 0, addr + (i*3) );

	// process 8 idVec3s (24 floats) per iteration
	for ( ; i + 7 < count; i += 8 ) {
		float *vecPtr = (float*)( addr + (i*3) );
		vector float v0, v1, v2, v3, v4, v5;

		// streaming loads: reuse the last load of the previous iteration
		v0 = vecOld;
		v1 = vec_ld( 15, vecPtr );
		v2 = vec_ld( 31, vecPtr );
		v3 = vec_ld( 47, vecPtr );
		v4 = vec_ld( 63, vecPtr );
		v5 = vec_ld( 79, vecPtr );
		vecOld = vec_ld( 95, vecPtr );

		// shift the raw loads into element alignment
		vecLd1 = vec_perm( v0, v1, permVec );
		vecLd2 = vec_perm( v1, v2, permVec );
		vecLd3 = vec_perm( v2, v3, permVec );
		vecLd4 = vec_perm( v3, v4, permVec );
		vecLd5 = vec_perm( v4, v5, permVec );
		vecLd6 = vec_perm( v5, vecOld, permVec );

		// gather the interleaved x/y/z components into separate vectors
		vecX = vec_perm( vecLd1, vecLd2, permX1 );
		vecY = vec_perm( vecLd1, vecLd2, permY1 );
		vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
		vecX = vec_perm( vecX, vecLd3, permX2 );
		vecY = vec_perm( vecY, vecLd3, permY2 );
		vecZ = vec_perm( vecZ, vecLd3, permZ2 );
		vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
		vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
		vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
		vecX2 = vec_perm( vecX2, vecLd6, permX2 );
		vecY2 = vec_perm( vecY2, vecLd6, permY2 );
		vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );

		// dot product via chained multiply-adds
		vecX = vec_madd( vecX, vecConstX, zeroVector );
		vecY = vec_madd( vecY, vecConstY, vecX );
		vecZ = vec_madd( vecZ, vecConstZ, vecY );
		vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
		vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
		vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );

		// store out results
		ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
	}

	// scalar cleanup for the remainder
	for ( ; i < count; i++ ) {
		tempVal[0] = *( addr + (i*3) + 0 );
		tempVal[1] = *( addr + (i*3) + 1 );
		tempVal[2] = *( addr + (i*3) + 2 );
		dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
	}
}
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant * src[i].Normal() + src[i][3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	// Computes dst[i] = constant * src[i].Normal() + src[i][3] for count
	// planes. dst is forced to 16-byte alignment by a scalar prologue; the
	// 16-byte-stride plane stream is read with the lvsl/vec_perm
	// streaming-load idiom and 4x4-transposed into x/y/z/d vectors.
	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
	int i;
	float constVal[4];
	float srcVal[3];
	float srcI3;
	float tempVal;
	vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
	vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
	vector float vecX, vecY, vecZ, vecI3;
	vector float vecX2, vecY2, vecZ2, vecI32;
	vector float vecConstX, vecConstY, vecConstZ;

	// scalar copy of the constant for the cleanup loop
	constVal[0] = constant[0];
	constVal[1] = constant[1];
	constVal[2] = constant[2];
	constVal[3] = 1;

	// load the (possibly unaligned) 12-byte constant and splat x/y/z;
	// vec_ld( 0, p ) and vec_ld( 11, p ) together cover all 12 bytes
	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
	vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
	vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
	vector float vecConst = vec_perm( v0, v1, constPerm );
	vecConstX = vec_splat( vecConst, 0 );
	vecConstY = vec_splat( vecConst, 1 );
	vecConstZ = vec_splat( vecConst, 2 );

	// handle unaligned dst elements at the beginning
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant * src[i].Normal() + src[i][3];
	}

	// BUGFIX: both the vector loop and the cleanup loop index the plane
	// stream as addr + i*PLANE_OFFSET, so addr must point at src[0]. The
	// original took src[i].ToFloatPtr() here, which double-counted i
	// (effectively reading src[2i]) whenever dst was unaligned at entry.
	// The permute mask and primed load are taken at the first element
	// actually read; the mask stays valid across iterations because each
	// pass advances 8 planes = 128 bytes, a multiple of 16.
	const float *addr = src[0].ToFloatPtr();
	vector unsigned char permVec = vec_add( vec_lvsl( -1, addr + (i*PLANE_OFFSET) ), (vector unsigned char)(1) );
	vector float vecOld = vec_ld( 0, addr + (i*PLANE_OFFSET) );

	// process 8 planes (32 floats) per iteration
	for ( ; i + 7 < count; i += 8 ) {
		float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
		vector float v0, v1, v2, v3, v4, v5, v6, v7;

		// streaming loads: reuse the last load of the previous iteration
		v0 = vecOld;
		v1 = vec_ld( 15, planePtr );
		v2 = vec_ld( 31, planePtr );
		v3 = vec_ld( 47, planePtr );
		v4 = vec_ld( 63, planePtr );
		v5 = vec_ld( 79, planePtr );
		v6 = vec_ld( 95, planePtr );
		v7 = vec_ld( 111, planePtr );
		vecOld = vec_ld( 127, planePtr );

		// shift the raw loads into element alignment
		vecPlaneLd1 = vec_perm( v0, v1, permVec );
		vecPlaneLd2 = vec_perm( v1, v2, permVec );
		vecPlaneLd3 = vec_perm( v2, v3, permVec );
		vecPlaneLd4 = vec_perm( v3, v4, permVec );
		vecPlaneLd5 = vec_perm( v4, v5, permVec );
		vecPlaneLd6 = vec_perm( v5, v6, permVec );
		vecPlaneLd7 = vec_perm( v6, v7, permVec );
		vecPlaneLd8 = vec_perm( v7, vecOld, permVec );

		// permute into X Y Z and d vectors; since a plane is 4 floats this
		// is a 4x4 matrix transpose
		v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
		v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
		v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
		v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
		vecX = vec_mergeh( v0, v1 );
		vecY = vec_mergel( v0, v1 );
		vecZ = vec_mergeh( v2, v3 );
		vecI3 = vec_mergel( v2, v3 );
		v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
		v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
		v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
		v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
		vecX2 = vec_mergeh( v4, v5 );
		vecY2 = vec_mergel( v4, v5 );
		vecZ2 = vec_mergeh( v6, v7 );
		vecI32 = vec_mergel( v6, v7 );

		// dot product, starting the accumulation from the plane's d term
		v6 = vec_madd( vecZ, vecConstZ, vecI3 );
		v5 = vec_madd( vecY, vecConstY, v6 );
		v4 = vec_madd( vecX, vecConstX, v5 );
		v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
		v1 = vec_madd( vecY2, vecConstY, v0 );
		v2 = vec_madd( vecX2, vecConstX, v1 );

		// store results
		ALIGNED_STORE2( &dst[i], v4, v2 );
	}

	// scalar cleanup for the remainder
	for ( ; i < count; i++ ) {
		// populate srcVal with src X Y Z
		srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
		srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
		srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );
		// put src[i][3] into srcI3
		srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );
		tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
		dst[i] = tempVal + srcI3;
	}
}
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
//#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
// Computes dst[i] = constant * src[i].xyz (3-component dot product).
// idDrawVert size is 60 bytes in this (non-DRAWVERT_PADDED) build
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
int i;
register vector float vecConstX, vecConstY, vecConstZ;
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
register vector float zeroVector = (vector float)(0.0);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// load the (possibly unaligned) 12-byte constant; vec_ld( 0 ) and
// vec_ld( 11 ) together cover all 12 bytes
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
v0 = vec_ld( 0, constant.ToFloatPtr() );
v1 = vec_ld( 11, constant.ToFloatPtr() );
v0 = vec_perm( v0, v1, constPerm );
// permute into constant vectors (one splat per component)
vecConstX = vec_splat( v0, 0 );
vecConstY = vec_splat( v0, 1 );
vecConstZ = vec_splat( v0, 2 );
// handle unaligned dst elements at beginning with scalar code
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
dst[i] = constant * src[i].xyz;
}
// every fourth one will have the same alignment (4 * 60 = 240 bytes, a
// multiple of 16), so the four permute masks can be computed once here.
// Make sure we've got enough elements left before touching src[i+3]
if ( i+3 < count ) {
vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
// four verts per iteration
for ( ; i+3 < count; i += 4 ) {
const float *vertPtr = src[i].xyz.ToFloatPtr();
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// each vec_ld( 0 ) / vec_ld( 11 ) pair covers one 12-byte xyz
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 11, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v3 = vec_ld( 11, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v5 = vec_ld( 11, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
v7 = vec_ld( 11, vertPtr4 );
v0 = vec_perm( v0, v1, vertPerm1 );
v2 = vec_perm( v2, v3, vertPerm2 );
v4 = vec_perm( v4, v5, vertPerm3 );
v6 = vec_perm( v6, v7, vertPerm4 );
// transpose into X Y Z vectors
v1 = vec_mergeh( v0, v4 );
v3 = vec_mergeh( v2, v6 );
v5 = vec_mergel( v0, v4 );
v7 = vec_mergel( v2, v6 );
vecSrcX1 = vec_mergeh( v1, v3 );
vecSrcY1 = vec_mergel( v1, v3 );
vecSrcZ1 = vec_mergeh( v5, v7 );
// now calculate dot product via chained multiply-adds
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
// store results (dst is aligned past the scalar prologue)
vec_st( vecSrcZ1, 0, &dst[i] );
}
// scalar cleanup for the remainder
for ( ; i < count; i++ ) {
dst[i] = constant * src[i].xyz;
}
}
#else
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant * src[i].xyz;
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
//#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
// Computes dst[i] = constant * src[i].xyz (3-component dot product).
// idDrawVert size is 64 bytes in this (DRAWVERT_PADDED) build
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
int i;
register vector float vecConstX, vecConstY, vecConstZ;
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
register vector float zeroVector = (vector float)(0.0);
// NOTE(review): vertPerm1-4 are declared but unused in the padded path —
// loads below rely on each vert being naturally aligned instead
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// load the (possibly unaligned) 12-byte constant; vec_ld( 0 ) and
// vec_ld( 11 ) together cover all 12 bytes
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
v0 = vec_ld( 0, constant.ToFloatPtr() );
v1 = vec_ld( 11, constant.ToFloatPtr() );
v0 = vec_perm( v0, v1, constPerm );
// permute into constant vectors (one splat per component)
vecConstX = vec_splat( v0, 0 );
vecConstY = vec_splat( v0, 1 );
vecConstZ = vec_splat( v0, 2 );
// handle unaligned dst elements at beginning with scalar code
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
dst[i] = constant * src[i].xyz;
}
// four verts per iteration
for ( ; i+3 < count; i += 4 ) {
const float *vertPtr = src[i].xyz.ToFloatPtr();
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// single aligned load per vert; assumes each padded (64-byte) vert
// starts on a 16-byte boundary so vec_ld( 0, ... ) hits xyz directly
// — NOTE(review): depends on the vert array's base alignment, confirm
v0 = vec_ld( 0, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
// transpose into X Y Z vectors
v1 = vec_mergeh( v0, v4 );
v3 = vec_mergeh( v2, v6 );
v5 = vec_mergel( v0, v4 );
v7 = vec_mergel( v2, v6 );
vecSrcX1 = vec_mergeh( v1, v3 );
vecSrcY1 = vec_mergel( v1, v3 );
vecSrcZ1 = vec_mergeh( v5, v7 );
// now calculate dot product via chained multiply-adds
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
// store results (dst is aligned past the scalar prologue)
vec_st( vecSrcZ1, 0, &dst[i] );
}
// scalar cleanup for the remainder
for ( ; i < count; i++ ) {
dst[i] = constant * src[i].xyz;
}
}
#endif /* DRAWVERT_PADDED */
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant.Normal() * src[i] + constant[3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
	// Computes dst[i] = constant.Normal() * src[i] + constant[3] for count
	// elements. dst is forced to 16-byte alignment by a scalar prologue; the
	// packed 12-byte idVec3 stream is read with the lvsl/vec_perm
	// streaming-load idiom and split into x/y/z vectors.
	register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
	register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
	register vector float zeroVector = (vector float)(0.0);
	register vector float vecConstX, vecConstY, vecConstZ;
	register vector float vecConst3;
	idVec3 constNormal = constant.Normal();
	float const3 = constant[3];
	// masks that gather x, y and z from the 3-float-stride idVec3 data
	register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
	register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
	register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
	register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
	register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
	register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
	int i;

	// load the (possibly unaligned) 16-byte plane constant and splat x/y/z
	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
	vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
	vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
	vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
	vecConstX = vec_splat( vecLd1, 0 );
	vecConstY = vec_splat( vecLd1, 1 );
	vecConstZ = vec_splat( vecLd1, 2 );
	// broadcast constant[3] into all four lanes
	vecConst3 = loadSplatUnalignedScalar( &const3 );

	// handle unaligned dst elements at the beginning
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant.Normal() * src[i] + constant[3];
	}

	// BUGFIX: the vector loop below indexes the stream as addr + i*3, so
	// addr must point at src[0]. The original took src[i].ToFloatPtr() here,
	// which double-counted i whenever dst was unaligned at entry. The
	// permute mask and primed load are taken at the first element actually
	// read; the mask stays valid across iterations because each pass
	// advances 8 * 3 * sizeof( float ) = 96 bytes, a multiple of 16.
	const float *addr = src[0].ToFloatPtr();
	vector unsigned char permVec = vec_add( vec_lvsl( -1, addr + (i*3) ), (vector unsigned char)(1) );
	vector float vecOld = vec_ld( 0, addr + (i*3) );

	// process 8 idVec3s (24 floats) per iteration
	for ( ; i+7 < count; i += 8 ) {
		float *vecPtr = (float*)( addr + (i*3) );
		vector float v0, v1, v2, v3, v4, v5;

		// streaming loads: reuse the last load of the previous iteration
		v0 = vecOld;
		v1 = vec_ld( 15, vecPtr );
		v2 = vec_ld( 31, vecPtr );
		v3 = vec_ld( 47, vecPtr );
		v4 = vec_ld( 63, vecPtr );
		v5 = vec_ld( 79, vecPtr );
		vecOld = vec_ld( 95, vecPtr );

		// shift the raw loads into element alignment
		vecLd1 = vec_perm( v0, v1, permVec );
		vecLd2 = vec_perm( v1, v2, permVec );
		vecLd3 = vec_perm( v2, v3, permVec );
		vecLd4 = vec_perm( v3, v4, permVec );
		vecLd5 = vec_perm( v4, v5, permVec );
		vecLd6 = vec_perm( v5, vecOld, permVec );

		// gather the interleaved x/y/z components into separate vectors
		vecX = vec_perm( vecLd1, vecLd2, permX1 );
		vecY = vec_perm( vecLd1, vecLd2, permY1 );
		vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
		vecX = vec_perm( vecX, vecLd3, permX2 );
		vecY = vec_perm( vecY, vecLd3, permY2 );
		vecZ = vec_perm( vecZ, vecLd3, permZ2 );
		vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
		vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
		vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
		vecX2 = vec_perm( vecX2, vecLd6, permX2 );
		vecY2 = vec_perm( vecY2, vecLd6, permY2 );
		vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );

		// calculate dot product via chained multiply-adds
		vecX = vec_madd( vecX, vecConstX, zeroVector );
		vecY = vec_madd( vecY, vecConstY, vecX );
		vecZ = vec_madd( vecZ, vecConstZ, vecY );
		vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
		vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
		vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );

		// add in constant[3]
		vecZ = vec_add( vecZ, vecConst3 );
		vecZ2 = vec_add( vecZ2, vecConst3 );

		// store out results
		ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
	}

	// scalar cleanup for the remainder
	for ( ; i < count; i++ ) {
		dst[i] = constNormal * src[i] + const3;
	}
}
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
	// Computes dst[i] = constant.Normal() * src[i].Normal() +
	// constant[3] * src[i][3] for count planes. dst is forced to 16-byte
	// alignment by a scalar prologue; the 16-byte-stride plane stream is
	// read with the lvsl/vec_perm streaming-load idiom and 4x4-transposed
	// into x/y/z/d vectors.
	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
	float constVal[4];
	float srcVal[4];
	int i;
	const float *constPtr = constant.ToFloatPtr();
	register vector float vecX, vecY, vecZ, vecI3;
	register vector float vecX2, vecY2, vecZ2, vecI32;
	vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
	vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
	register vector float zeroVector = (vector float)(0.0);
	register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;

	// scalar copy of the constant for the cleanup loop
	constVal[0] = *(constPtr);
	constVal[1] = *(constPtr+1);
	constVal[2] = *(constPtr+2);
	constVal[3] = *(constPtr+3);

	// load the (possibly unaligned) 16-byte plane constant and splat x/y/z/d
	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
	vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
	vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
	vector float vecConst = vec_perm( v0, v1, constPerm );
	vecConstX = vec_splat( vecConst, 0 );
	vecConstY = vec_splat( vecConst, 1 );
	vecConstZ = vec_splat( vecConst, 2 );
	vecConstI3 = vec_splat( vecConst, 3 );

	// handle unaligned dst elements at the beginning
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
	}

	// BUGFIX: both the vector loop and the cleanup loop index the source as
	// srcPtr + i*PLANE_OFFSET, so srcPtr must point at src[0]. The original
	// took src[i].ToFloatPtr() here, which double-counted i (effectively
	// reading src[2i]) whenever dst was unaligned at entry. The permute mask
	// and primed load are taken at the first element actually read; the mask
	// stays valid across iterations because each pass advances
	// 8 planes = 128 bytes, a multiple of 16.
	const float *srcPtr = src[0].ToFloatPtr();
	vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr + (i*PLANE_OFFSET) ), (vector unsigned char)(1) );
	vector float vecOld = vec_ld( 0, srcPtr + (i*PLANE_OFFSET) );

	// process 8 planes (32 floats) per iteration
	for ( ; i+7 < count; i += 8 ) {
		float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
		vector float v0, v1, v2, v3, v4, v5, v6, v7;

		// streaming loads: reuse the last load of the previous iteration
		v0 = vecOld;
		v1 = vec_ld( 15, planePtr );
		v2 = vec_ld( 31, planePtr );
		v3 = vec_ld( 47, planePtr );
		v4 = vec_ld( 63, planePtr );
		v5 = vec_ld( 79, planePtr );
		v6 = vec_ld( 95, planePtr );
		v7 = vec_ld( 111, planePtr );
		vecOld = vec_ld( 127, planePtr );

		// shift the raw loads into element alignment
		vecPlaneLd1 = vec_perm( v0, v1, permVec );
		vecPlaneLd2 = vec_perm( v1, v2, permVec );
		vecPlaneLd3 = vec_perm( v2, v3, permVec );
		vecPlaneLd4 = vec_perm( v3, v4, permVec );
		vecPlaneLd5 = vec_perm( v4, v5, permVec );
		vecPlaneLd6 = vec_perm( v5, v6, permVec );
		vecPlaneLd7 = vec_perm( v6, v7, permVec );
		vecPlaneLd8 = vec_perm( v7, vecOld, permVec );

		// permute into X Y Z and d vectors; since a plane is 4 floats this
		// is a 4x4 matrix transpose
		v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
		v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
		v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
		v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
		vecX = vec_mergeh( v0, v1 );
		vecY = vec_mergel( v0, v1 );
		vecZ = vec_mergeh( v2, v3 );
		vecI3 = vec_mergel( v2, v3 );
		v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
		v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
		v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
		v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
		vecX2 = vec_mergeh( v4, v5 );
		vecY2 = vec_mergel( v4, v5 );
		vecZ2 = vec_mergeh( v6, v7 );
		vecI32 = vec_mergel( v6, v7 );

		// four-term dot product via chained multiply-adds
		v4 = vec_madd( vecConstX, vecX, zeroVector );
		v5 = vec_madd( vecConstY, vecY, v4 );
		v6 = vec_madd( vecConstZ, vecZ, v5 );
		v7 = vec_madd( vecConstI3, vecI3, v6 );
		v0 = vec_madd( vecConstX, vecX2, zeroVector );
		v1 = vec_madd( vecConstY, vecY2, v0 );
		v2 = vec_madd( vecConstZ, vecZ2, v1 );
		v3 = vec_madd( vecConstI3, vecI32, v2 );

		// store result
		ALIGNED_STORE2( &dst[i], v7, v3 );
	}

	// scalar cleanup for the remainder
	for ( ; i < count; i++ ) {
		srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
		srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
		srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
		srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
		dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
	}
}
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
// Computes dst[i] = constant.Normal() * src[i].xyz + constant[3].
// idDrawVert size is 60 bytes in this (non-DRAWVERT_PADDED) build
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
int i;
const float *constPtr = constant.ToFloatPtr();
const float *srcPtr = src[0].xyz.ToFloatPtr();
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
register vector float vecDest1;
register vector float zeroVector = (vector float)(0.0);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// scalar copy of the plane constant for the cleanup loop
float constVal[4];
float srcVal[3];
constVal[0] = *(constPtr+0);
constVal[1] = *(constPtr+1);
constVal[2] = *(constPtr+2);
constVal[3] = *(constPtr+3);
// load the (possibly unaligned) 16-byte plane constant and splat x/y/z/d
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
v0 = vec_ld( 0, constant.ToFloatPtr() );
v1 = vec_ld( 15, constant.ToFloatPtr() );
v0 = vec_perm( v0, v1, constPerm );
vecConstX = vec_splat( v0, 0 );
vecConstY = vec_splat( v0, 1 );
vecConstZ = vec_splat( v0, 2 );
vecConstI3 = vec_splat( v0, 3 );
// handle unaligned dst elements at beginning with scalar code
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
dst[i] = constant.Normal() * src[i].xyz + constant[3];
}
// every fourth one will have the same alignment (4 * 60 = 240 bytes, a
// multiple of 16), so the four permute masks can be computed once. Make
// sure we have enough elements left before touching src[i+3]
if ( i+3 < count ) {
vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
// four verts per iteration
for ( ; i+3 < count; i+=4 ) {
const float *vertPtr = src[i].xyz.ToFloatPtr();
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// each vec_ld( 0 ) / vec_ld( 11 ) pair covers one 12-byte xyz
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 11, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v3 = vec_ld( 11, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v5 = vec_ld( 11, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
v7 = vec_ld( 11, vertPtr4 );
v0 = vec_perm( v0, v1, vertPerm1 );
v2 = vec_perm( v2, v3, vertPerm2 );
v4 = vec_perm( v4, v5, vertPerm3 );
v6 = vec_perm( v6, v7, vertPerm4 );
// transpose into X Y Z vectors
v1 = vec_mergeh( v0, v4 );
v3 = vec_mergeh( v2, v6 );
v5 = vec_mergel( v0, v4 );
v7 = vec_mergel( v2, v6 );
vecSrcX1 = vec_mergeh( v1, v3 );
vecSrcY1 = vec_mergel( v1, v3 );
vecSrcZ1 = vec_mergeh( v5, v7 );
// now calculate dot product via chained multiply-adds
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
// add in the plane's d component
vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
// store results (dst is aligned past the scalar prologue)
vec_st( vecDest1, 0, &dst[i] );
}
// scalar cleanup for the remainder
for ( ; i < count; i++ ) {
srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
// dst[i] = constant.Normal() * src[i].xyz + constant[3];
dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
dst[i] += constVal[3];
}
}
#else
/*
============
idSIMD_AltiVec::Dot
dst[i] = constant.Normal() * src[i].xyz + constant[3];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
// Computes dst[i] = constant.Normal() * src[i].xyz + constant[3].
// idDrawVert size is 64 bytes in this (DRAWVERT_PADDED) build
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
int i;
const float *constPtr = constant.ToFloatPtr();
const float *srcPtr = src[0].xyz.ToFloatPtr();
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
register vector float vecDest1;
register vector float zeroVector = (vector float)(0.0);
// NOTE(review): vertPerm1-4 are declared but unused in the padded path —
// loads below rely on each vert being naturally aligned instead
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// scalar copy of the plane constant for the cleanup loop
float constVal[4];
float srcVal[3];
constVal[0] = *(constPtr+0);
constVal[1] = *(constPtr+1);
constVal[2] = *(constPtr+2);
constVal[3] = *(constPtr+3);
// load the (possibly unaligned) 16-byte plane constant and splat x/y/z/d
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
v0 = vec_ld( 0, constant.ToFloatPtr() );
v1 = vec_ld( 15, constant.ToFloatPtr() );
v0 = vec_perm( v0, v1, constPerm );
vecConstX = vec_splat( v0, 0 );
vecConstY = vec_splat( v0, 1 );
vecConstZ = vec_splat( v0, 2 );
vecConstI3 = vec_splat( v0, 3 );
// handle unaligned dst elements at beginning with scalar code
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
dst[i] = constant.Normal() * src[i].xyz + constant[3];
}
// four verts per iteration
for ( ; i+3 < count; i+=4 ) {
const float *vertPtr = src[i].xyz.ToFloatPtr();
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
// single aligned load per vert; assumes each padded (64-byte) vert
// starts on a 16-byte boundary so vec_ld( 0, ... ) hits xyz directly
// — NOTE(review): depends on the vert array's base alignment, confirm
v0 = vec_ld( 0, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
// transpose into X Y Z vectors
v1 = vec_mergeh( v0, v4 );
v3 = vec_mergeh( v2, v6 );
v5 = vec_mergel( v0, v4 );
v7 = vec_mergel( v2, v6 );
vecSrcX1 = vec_mergeh( v1, v3 );
vecSrcY1 = vec_mergel( v1, v3 );
vecSrcZ1 = vec_mergeh( v5, v7 );
// now calculate dot product via chained multiply-adds
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
// add in the plane's d component
vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
// store results (dst is aligned past the scalar prologue)
vec_st( vecDest1, 0, &dst[i] );
}
// scalar cleanup for the remainder
for ( ; i < count; i++ ) {
srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
// dst[i] = constant.Normal() * src[i].xyz + constant[3];
dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
dst[i] += constVal[3];
}
}
#endif /* DRAWVERT_PADDED */
/*
============
idSIMD_AltiVec::Dot
dst[i] = src0[i] * src1[i];
============
*/
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
//#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
// dst[i] = src0[i] * src1[i] -- the 3-component dot product per element.
//
// FIX(review): the old version took the source base pointers at the index
// reached by the dst-alignment prefix loop and then restarted the main loop
// at i = 0, so whenever dst was NOT 16-byte aligned the source and
// destination indices disagreed, the prefix results were overwritten, and
// the "aligned" stores were misaligned. The base pointers are now taken at
// element 0 and the loop continues from the prefix index; behavior in the
// common case of an aligned dst (empty prefix) is unchanged.
int i;
float src0Val[3];
float src1Val[3];
register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
register vector float zeroVector = (vector float)(0.0);
// permute masks that gather the X, Y and Z components out of three
// consecutive vectors of packed idVec3 data
register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
// scalar loop until dst is 16-byte aligned
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
dst[i] = src0[i] * src1[i];
}
// base pointers at element 0 so the (i*3) indexing below always matches dst[i]
const float *src0Ptr = src0[0].ToFloatPtr();
const float *src1Ptr = src1[0].ToFloatPtr();
if ( i+7 < count ) {
// each iteration advances 96 bytes per stream, a multiple of 16, so the
// re-alignment permute masks computed here stay valid for the whole loop
vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr + (i*3) ), (vector unsigned char)(1) );
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr + (i*3) ), (vector unsigned char)(1) );
vector float vecOld0 = vec_ld( 0, src0Ptr + (i*3) );
vector float vecOld1 = vec_ld( 0, src1Ptr + (i*3) );
// eight idVec3 pairs per iteration
for ( ; i+7 < count; i += 8 ) {
float *s0Ptr = (float*)( src0Ptr + (i*3) );
float *s1Ptr = (float*)( src1Ptr + (i*3) );
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
// rolling loads: the trailing block of the previous pass is reused
v0 = vecOld0;
v1 = vec_ld( 15, s0Ptr );
v2 = vec_ld( 31, s0Ptr );
v3 = vec_ld( 47, s0Ptr );
v4 = vec_ld( 63, s0Ptr );
v5 = vec_ld( 79, s0Ptr );
vecOld0 = vec_ld( 95, s0Ptr );
v6 = vecOld1;
v7 = vec_ld( 15, s1Ptr );
v8 = vec_ld( 31, s1Ptr );
v9 = vec_ld( 47, s1Ptr );
v10 = vec_ld( 63, s1Ptr );
v11 = vec_ld( 79, s1Ptr );
vecOld1 = vec_ld( 95, s1Ptr );
// re-align the raw loads
vecLd1 = vec_perm( v0, v1, permVec1 );
vecLd2 = vec_perm( v1, v2, permVec1 );
vecLd3 = vec_perm( v2, v3, permVec1 );
vecLd4 = vec_perm( v3, v4, permVec1 );
vecLd5 = vec_perm( v4, v5, permVec1 );
vecLd6 = vec_perm( v5, vecOld0, permVec1 );
vecLd7 = vec_perm( v6, v7, permVec2 );
vecLd8 = vec_perm( v7, v8, permVec2 );
vecLd9 = vec_perm( v8, v9, permVec2 );
vecLd10 = vec_perm( v9, v10, permVec2 );
vecLd11 = vec_perm( v10, v11, permVec2 );
vecLd12 = vec_perm( v11, vecOld1, permVec2 );
// gather into X Y Z vectors
vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
vecX0 = vec_perm( vecX0, vecLd3, permX2 );
vecY0 = vec_perm( vecY0, vecLd3, permY2 );
vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
vecX02 = vec_perm( vecX02, vecLd6, permX2 );
vecY02 = vec_perm( vecY02, vecLd6, permY2 );
vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
vecX1 = vec_perm( vecX1, vecLd9, permX2 );
vecY1 = vec_perm( vecY1, vecLd9, permY2 );
vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
vecX12 = vec_perm( vecX12, vecLd12, permX2 );
vecY12 = vec_perm( vecY12, vecLd12, permY2 );
vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
// dot products: x0*x1 + y0*y1 + z0*z1 per lane
vecX0 = vec_madd( vecX0, vecX1, zeroVector );
vecY0 = vec_madd( vecY0, vecY1, vecX0 );
vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
vecX02 = vec_madd( vecX02, vecX12, zeroVector );
vecY02 = vec_madd( vecY02, vecY12, vecX02 );
vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
// dst is 16-byte aligned from the prefix loop onwards
ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
}
}
// scalar cleanup
for ( ; i < count; i++ ) {
// dst[i] = src0[i] * src1[i];
src0Val[0] = *( src0Ptr + (i*3) + 0 );
src0Val[1] = *( src0Ptr + (i*3) + 1 );
src0Val[2] = *( src0Ptr + (i*3) + 2 );
src1Val[0] = *( src1Ptr + (i*3) + 0 );
src1Val[1] = *( src1Ptr + (i*3) + 1 );
src1Val[2] = *( src1Ptr + (i*3) + 2 );
dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
}
}
/*
============
idSIMD_AltiVec::Dot
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
============
*/
void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
// Scalar dot product of two float arrays:
//   dot = src1[0]*src2[0] + src1[1]*src2[1] + ... + src1[count-1]*src2[count-1]
// The vector path consumes 8 floats per pass using two independent running
// sums (hides vec_madd latency), then reduces across lanes at the end.
dot = 0.0f;
register vector float v0, v1, v2, v3;
register vector float zeroVector;
// two independent accumulators
register vector float runningTotal1, runningTotal2;
//rolling loads for src1
register vector float v0_low, v0_hi, v2_low, v2_hi;
//rolling loads for src2
register vector float v1_low, v1_hi, v3_low, v3_hi;
//permute vectors
register vector unsigned char permVec1, permVec2;
vector unsigned char oneCharVector = (vector unsigned char)(1);
int i = 0;
runningTotal1 = (vector float)(0.0);
runningTotal2 = (vector float)(0.0);
zeroVector = (vector float)(0.0);
if ( count >= 8 ) {
//calculate re-alignment permute masks for the (possibly unaligned)
//sources and prime the rolling loads
permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
v2_hi = vec_ld( 0, &src1[i] );
v3_hi = vec_ld( 0, &src2[i] );
//vectorize!
for ( ; i+7 < count; i += 8 ) {
//load sources; vec_ld truncates the address to a 16-byte boundary, so
//offsets 15 and 31 fetch the next two 16-byte blocks
v0_low = v2_hi;
v0_hi = vec_ld( 15, &src1[i] );
v2_low = v0_hi;
v2_hi = vec_ld( 31, &src1[i] );
v1_low = v3_hi;
v1_hi = vec_ld( 15, &src2[i] );
v3_low = v1_hi;
v3_hi = vec_ld( 31, &src2[i] );
v0 = vec_perm( v0_low, v0_hi, permVec1 );
v1 = vec_perm( v1_low, v1_hi, permVec2 );
v2 = vec_perm( v2_low, v2_hi, permVec1 );
v3 = vec_perm( v3_low, v3_hi, permVec2 );
//multiply together and keep running sum
runningTotal1 = vec_madd( v0, v1, runningTotal1 );
runningTotal2 = vec_madd( v2, v3, runningTotal2 );
}
runningTotal1 = vec_add( runningTotal1, runningTotal2 );
// sum across vector: rotate by 8 then 4 bytes and add, leaving the total
// in every lane; store lane 0 into the output reference
v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
runningTotal1 = vec_splat( v1, 0 );
vec_ste( runningTotal1, 0, &dot );
}
//handle cleanup. when profiling the game, we found that most of the counts to this function were small, so it
// spends a lot of time in this scalar code. It's already really really fast (eg 1 TB tick) for scalar code for
// counts less than 50, so not much point in trying to get vector code in on the action
for ( ; i < count ; i++ ) {
dot += src1[i] * src2[i];
}
}
#endif /* ENABLE_DOT */
#ifdef ENABLE_COMPARES
/*
============
idSIMD_AltiVec::CmpGT
dst[i] = src0[i] > constant;
============
*/
void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
//#define OPER(X) dst[(X)] = src0[(X)] > constant;
// Writes dst[i] = 1 when src0[i] > constant, else 0.
// The vector path produces 16 byte-results per pass: four float compares
// are packed int -> short -> byte and masked down to 0/1.
register vector float v0, v1, v2, v3;
register vector bool int vr1, vr2, vr3, vr4;
register vector bool short vs1, vs2;
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
register vector unsigned char vc1;
register vector bool char vbc1;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
register vector unsigned char permVec;
int i;
//handle unaligned at start so the vec_st below targets an aligned dst
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] = src0[i] > constant;
}
//splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );
//calculate permute and do loads
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
v3_hi = vec_ld( 0, &src0[i] );
//vectorize!
for ( ; i+15 < count; i += 16 ) {
// load values; vec_ld truncates the address to a 16-byte boundary, so
// offsets 15/31/47/63 fetch four consecutive blocks, and the trailing
// block (v3_hi) is carried into the next iteration
v0_low = v3_hi;
v0_hi = vec_ld( 15, &src0[i] );
v1_low = v0_hi;
v1_hi = vec_ld( 31, &src0[i] );
v2_low = v1_hi;
v2_hi = vec_ld( 47, &src0[i] );
v3_low = v2_hi;
v3_hi = vec_ld( 63, &src0[i] );
//permute into the vectors we want
v0 = vec_perm( v0_low, v0_hi, permVec );
v1 = vec_perm( v1_low, v1_hi, permVec );
v2 = vec_perm( v2_low, v2_hi, permVec );
v3 = vec_perm( v3_low, v3_hi, permVec );
//do comparison
vr1 = vec_cmpgt( v0, constVec );
vr2 = vec_cmpgt( v1, constVec );
vr3 = vec_cmpgt( v2, constVec );
vr4 = vec_cmpgt( v3, constVec );
// pack results into shorts
vs1 = vec_pack(vr1, vr2);
vs2 = vec_pack(vr3, vr4);
// pack results into byte
vbc1 = vec_pack(vs1, vs2);
//AND with 1 to get true=1 not true=255
vc1 = vec_and( vbc1, oneVector );
//store results
vec_st( vc1, 0, &dst[i] );
}
//handle cleanup
for ( ; i < count ; i++ ) {
dst[i] = src0[i] > constant;
}
}
/*
============
idSIMD_AltiVec::CmpGT
dst[i] |= ( src0[i] > constant ) << bitNum;
============
*/
void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
//#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
// Sets bit 'bitNum' of dst[i] when src0[i] > constant. The other bits of
// dst[i] are preserved (read-modify-write OR), so the caller must have
// initialized dst.
// Temp vector registers
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
// dest vectors
register vector unsigned char vd;
// bitNum vectors
register vector unsigned char bitNumVec;
// src0 vectors
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
// constant vector
register vector float constVec;
// all one's
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;
//handle unaligned at start so the vector stores target an aligned dst
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] |= ( src0[i] > constant ) << bitNum;
}
//splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );
//bitNum is unaligned: load the byte and splat it across a char vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );
//calculate permute and do loads
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );
//vectorize!
for ( ; i+15 < count; i += 16 ) {
//load sources (floats); offsets 15/31/47/63 fetch consecutive 16-byte
//blocks (vec_ld truncates the address); the tail block is carried over
vs0_low = vs3_hi;
vs0_hi = vec_ld( 15, &src0[i] );
vs1_low = vs0_hi;
vs1_hi = vec_ld( 31, &src0[i] );
vs2_low = vs1_hi;
vs2_hi = vec_ld( 47, &src0[i] );
vs3_low = vs2_hi;
vs3_hi = vec_ld( 63, &src0[i] );
//permute into the vectors we want
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
//load dest (bytes) as unsigned char
vd = vec_ld( 0, &dst[i] );
// do comparison and get bool int result
vtbi0 = vec_cmpgt( vs0, constVec );
vtbi1 = vec_cmpgt( vs1, constVec );
vtbi2 = vec_cmpgt( vs2, constVec );
vtbi3 = vec_cmpgt( vs3, constVec );
// pack results into shorts
vtbs0 = vec_pack(vtbi0, vtbi1);
vtbs1 = vec_pack(vtbi2, vtbi3);
// pack results into byte
vtbc0 = vec_pack(vtbs0, vtbs1);
//and with 1 to get true=1 instead of true=255, then shift into position
vtuc0 = vec_and(vtbc0, oneVector);
vtuc0 = vec_sl(vtuc0, bitNumVec );
//or with original
vd = vec_or( vd, vtuc0 );
vec_st( vd, 0, &dst[i] );
}
//handle cleanup
for ( ; i < count ; i++ ) {
dst[i] |= ( src0[i] > constant ) << bitNum;
}
}
/*
============
idSIMD_AltiVec::CmpGE
dst[i] = src0[i] >= constant;
============
*/
void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
// Writes dst[i] = 1 when src0[i] >= constant, otherwise 0.
// Vector path emits 16 byte-results per pass by packing four float
// comparisons int -> short -> byte and masking the truth bytes down to 1.
register vector float f0, f1, f2, f3;
register vector bool int cmp0, cmp1, cmp2, cmp3;
register vector bool short packShort0, packShort1;
register vector float ld0, ld1, ld2, ld3, ld4;
register vector unsigned char resultVec;
register vector bool char packByte;
register vector float vecConst;
register vector unsigned char vecOnes = (vector unsigned char)(1);
register vector unsigned char srcPerm;
int idx = 0;
// scalar loop until dst reaches a 16-byte boundary
for ( ; NOT_16BYTE_ALIGNED( dst[idx] ) && ( idx < count ); idx++ ) {
dst[idx] = src0[idx] >= constant;
}
// replicate the comparison constant into every lane
vecConst = loadSplatUnalignedScalar( &constant );
// permute mask that re-aligns the (possibly unaligned) source stream
srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[idx] ), vecOnes );
ld4 = vec_ld( 0, &src0[idx] );
// 16 results per pass
for ( ; idx+15 < count; idx += 16 ) {
// fetch the next four 16-byte blocks, reusing the previous tail block
ld0 = ld4;
ld1 = vec_ld( 15, &src0[idx] );
ld2 = vec_ld( 31, &src0[idx] );
ld3 = vec_ld( 47, &src0[idx] );
ld4 = vec_ld( 63, &src0[idx] );
// shift each adjacent pair into alignment
f0 = vec_perm( ld0, ld1, srcPerm );
f1 = vec_perm( ld1, ld2, srcPerm );
f2 = vec_perm( ld2, ld3, srcPerm );
f3 = vec_perm( ld3, ld4, srcPerm );
// per-lane comparison
cmp0 = vec_cmpge( f0, vecConst );
cmp1 = vec_cmpge( f1, vecConst );
cmp2 = vec_cmpge( f2, vecConst );
cmp3 = vec_cmpge( f3, vecConst );
// narrow ints -> shorts -> bytes
packShort0 = vec_pack( cmp0, cmp1 );
packShort1 = vec_pack( cmp2, cmp3 );
packByte = vec_pack( packShort0, packShort1 );
// convert the 0xFF truth values into 1
resultVec = vec_and( packByte, vecOnes );
vec_st( resultVec, 0, &dst[idx] );
}
// scalar tail
for ( ; idx < count ; idx++ ) {
dst[idx] = src0[idx] >= constant;
}
}
/*
============
idSIMD_AltiVec::CmpGE
dst[i] |= ( src0[i] >= constant ) << bitNum;
============
*/
void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
// Sets bit 'bitNum' of dst[i] when src0[i] >= constant. Other bits of
// dst[i] are preserved (read-modify-write OR), so the caller must have
// initialized dst.
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
// dest vectors
register vector unsigned char vd;
// bitNum vectors
register vector unsigned char bitNumVec;
// src0 vectors
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
// constant vector
register vector float constVec;
// all one's
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;
//handle unaligned at start so the vector stores target an aligned dst
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] |= ( src0[i] >= constant ) << bitNum;
}
//splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );
//bitNum is unaligned: load the byte and splat it across a char vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );
//calculate permute and do loads
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );
//vectorize!
for ( ; i+15 < count; i += 16 ) {
//load sources (floats); offsets 15/31/47/63 fetch consecutive 16-byte
//blocks (vec_ld truncates the address); the tail block is carried over
vs0_low = vs3_hi;
vs0_hi = vec_ld( 15, &src0[i] );
vs1_low = vs0_hi;
vs1_hi = vec_ld( 31, &src0[i] );
vs2_low = vs1_hi;
vs2_hi = vec_ld( 47, &src0[i] );
vs3_low = vs2_hi;
vs3_hi = vec_ld( 63, &src0[i] );
//permute into the vectors we want
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
//load dest (bytes) as unsigned char
vd = vec_ld( 0, &dst[i] );
// do comparison and get bool int result
vtbi0 = vec_cmpge( vs0, constVec );
vtbi1 = vec_cmpge( vs1, constVec );
vtbi2 = vec_cmpge( vs2, constVec );
vtbi3 = vec_cmpge( vs3, constVec );
// pack results into shorts
vtbs0 = vec_pack(vtbi0, vtbi1);
vtbs1 = vec_pack(vtbi2, vtbi3);
// pack results into byte
vtbc0 = vec_pack(vtbs0, vtbs1);
//and with 1 to get true=1 instead of true=255, then shift into position
vtuc0 = vec_and(vtbc0, oneVector);
vtuc0 = vec_sl(vtuc0, bitNumVec );
//or with original
vd = vec_or( vd, vtuc0 );
vec_st( vd, 0, &dst[i] );
}
//handle cleanup
for ( ; i < count ; i++ ) {
dst[i] |= ( src0[i] >= constant ) << bitNum;
}
}
/*
============
idSIMD_AltiVec::CmpLT
dst[i] = src0[i] < constant;
============
*/
void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
//#define OPER(X) dst[(X)] = src0[(X)] < constant;
// Writes dst[i] = 1 when src0[i] < constant, else 0.
// The vector path produces 16 byte-results per pass: four float compares
// are packed int -> short -> byte and masked down to 0/1.
register vector float v0, v1, v2, v3;
register vector bool int vr1, vr2, vr3, vr4;
register vector bool short vs1, vs2;
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
register vector unsigned char vc1;
register vector bool char vbc1;
register vector float constVec;
register vector unsigned char oneVector = (vector unsigned char)(1);
register vector unsigned char permVec;
int i = 0;
//handle unaligned at start so the vec_st below targets an aligned dst
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] = src0[i] < constant;
}
//splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );
//calculate permute and do loads
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
v3_hi = vec_ld( 0, &src0[i] );
//vectorize!
for ( ; i+15 < count; i += 16 ) {
// load values; vec_ld truncates the address to a 16-byte boundary, so
// offsets 15/31/47/63 fetch four consecutive blocks; the trailing block
// (v3_hi) is carried into the next iteration
v0_low = v3_hi;
v0_hi = vec_ld( 15, &src0[i] );
v1_low = v0_hi;
v1_hi = vec_ld( 31, &src0[i] );
v2_low = v1_hi;
v2_hi = vec_ld( 47, &src0[i] );
v3_low = v2_hi;
v3_hi = vec_ld( 63, &src0[i] );
//permute into the vectors we want
v0 = vec_perm( v0_low, v0_hi, permVec );
v1 = vec_perm( v1_low, v1_hi, permVec );
v2 = vec_perm( v2_low, v2_hi, permVec );
v3 = vec_perm( v3_low, v3_hi, permVec );
//do comparison
vr1 = vec_cmplt( v0, constVec );
vr2 = vec_cmplt( v1, constVec );
vr3 = vec_cmplt( v2, constVec );
vr4 = vec_cmplt( v3, constVec );
// pack results into shorts
vs1 = vec_pack(vr1, vr2);
vs2 = vec_pack(vr3, vr4);
// pack results into byte
vbc1 = vec_pack(vs1, vs2);
//AND with 1 to get true=1 not true=255
vc1 = vec_and( vbc1, oneVector );
//store results
vec_st( vc1, 0, &dst[i] );
}
//handle cleanup
for ( ; i < count ; i++ ) {
dst[i] = src0[i] < constant;
}
}
/*
============
idSIMD_AltiVec::CmpLT
dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
//#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
// Sets bit 'bitNum' of dst[i] when src0[i] < constant. Other bits of
// dst[i] are preserved (read-modify-write OR), so the caller must have
// initialized dst.
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
// dest vectors
register vector unsigned char vd;
// bitNum vectors
register vector unsigned char bitNumVec;
// src0 vectors
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
// constant vector
register vector float constVec;
// all one's
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;
//handle unaligned at start so the vector stores target an aligned dst
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] |= ( src0[i] < constant ) << bitNum;
}
//splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );
//bitNum is unaligned: load the byte and splat it across a char vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );
//calculate permute and do loads
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );
//vectorize!
for ( ; i+15 < count; i += 16 ) {
//load sources (floats); offsets 15/31/47/63 fetch consecutive 16-byte
//blocks (vec_ld truncates the address); the tail block is carried over
vs0_low = vs3_hi;
vs0_hi = vec_ld( 15, &src0[i] );
vs1_low = vs0_hi;
vs1_hi = vec_ld( 31, &src0[i] );
vs2_low = vs1_hi;
vs2_hi = vec_ld( 47, &src0[i] );
vs3_low = vs2_hi;
vs3_hi = vec_ld( 63, &src0[i] );
//permute into the vectors we want
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
//load dest (bytes) as unsigned char
vd = vec_ld( 0, &dst[i] );
// do comparison and get bool int result
vtbi0 = vec_cmplt( vs0, constVec );
vtbi1 = vec_cmplt( vs1, constVec );
vtbi2 = vec_cmplt( vs2, constVec );
vtbi3 = vec_cmplt( vs3, constVec );
// pack results into shorts
vtbs0 = vec_pack(vtbi0, vtbi1);
vtbs1 = vec_pack(vtbi2, vtbi3);
// pack results into byte
vtbc0 = vec_pack(vtbs0, vtbs1);
//and with 1 to get true=1 instead of true=255, then shift into position
vtuc0 = vec_and(vtbc0, oneVector);
vtuc0 = vec_sl(vtuc0, bitNumVec );
//or with original
vd = vec_or( vd, vtuc0 );
vec_st( vd, 0, &dst[i] );
}
//handle cleanup
for ( ; i < count ; i++ ) {
dst[i] |= ( src0[i] < constant ) << bitNum;
}
}
//#endif
/*
============
idSIMD_AltiVec::CmpLE
dst[i] = src0[i] <= constant;
============
*/
void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
// Writes dst[i] = 1 when src0[i] <= constant, otherwise 0.
// Vector path emits 16 byte-results per pass by packing four float
// comparisons int -> short -> byte and masking the truth bytes down to 1.
register vector float f0, f1, f2, f3;
register vector bool int cmp0, cmp1, cmp2, cmp3;
register vector bool short packShort0, packShort1;
register vector float ld0, ld1, ld2, ld3, ld4;
register vector unsigned char resultVec;
register vector bool char packByte;
register vector float vecConst;
register vector unsigned char vecOnes = (vector unsigned char)(1);
register vector unsigned char srcPerm;
int idx = 0;
// scalar loop until dst reaches a 16-byte boundary
for ( ; NOT_16BYTE_ALIGNED( dst[idx] ) && ( idx < count ); idx++ ) {
dst[idx] = src0[idx] <= constant;
}
// replicate the comparison constant into every lane
vecConst = loadSplatUnalignedScalar( &constant );
// permute mask that re-aligns the (possibly unaligned) source stream
srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[idx] ), vecOnes );
ld4 = vec_ld( 0, &src0[idx] );
// 16 results per pass
for ( ; idx+15 < count; idx += 16 ) {
// fetch the next four 16-byte blocks, reusing the previous tail block
ld0 = ld4;
ld1 = vec_ld( 15, &src0[idx] );
ld2 = vec_ld( 31, &src0[idx] );
ld3 = vec_ld( 47, &src0[idx] );
ld4 = vec_ld( 63, &src0[idx] );
// shift each adjacent pair into alignment
f0 = vec_perm( ld0, ld1, srcPerm );
f1 = vec_perm( ld1, ld2, srcPerm );
f2 = vec_perm( ld2, ld3, srcPerm );
f3 = vec_perm( ld3, ld4, srcPerm );
// per-lane comparison
cmp0 = vec_cmple( f0, vecConst );
cmp1 = vec_cmple( f1, vecConst );
cmp2 = vec_cmple( f2, vecConst );
cmp3 = vec_cmple( f3, vecConst );
// narrow ints -> shorts -> bytes
packShort0 = vec_pack( cmp0, cmp1 );
packShort1 = vec_pack( cmp2, cmp3 );
packByte = vec_pack( packShort0, packShort1 );
// convert the 0xFF truth values into 1
resultVec = vec_and( packByte, vecOnes );
vec_st( resultVec, 0, &dst[idx] );
}
// scalar tail
for ( ; idx < count ; idx++ ) {
dst[idx] = src0[idx] <= constant;
}
}
/*
============
idSIMD_AltiVec::CmpLE
dst[i] |= ( src0[i] <= constant ) << bitNum;
============
*/
void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
//#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
// Sets bit 'bitNum' of dst[i] when src0[i] <= constant. Other bits of
// dst[i] are preserved (read-modify-write OR), so the caller must have
// initialized dst.
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
register vector bool short vtbs0, vtbs1;
register vector bool char vtbc0;
register vector unsigned char vtuc0;
register vector unsigned char permVec, permVec2;
// dest vectors
register vector unsigned char vd;
// bitNum vectors
register vector unsigned char bitNumVec;
// src0 vectors
register vector float vs0, vs1, vs2, vs3;
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
// constant vector
register vector float constVec;
// all one's
register vector unsigned char oneVector = (vector unsigned char)(1);
int i = 0;
//handle unaligned at start so the vector stores target an aligned dst
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
dst[i] |= ( src0[i] <= constant ) << bitNum;
}
//splat constant into a vector
constVec = loadSplatUnalignedScalar( &constant );
//bitNum is unaligned: load the byte and splat it across a char vector
permVec2 = vec_lvsl( 0, &bitNum );
vtuc0 = vec_ld( 0, &bitNum );
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
bitNumVec = vec_splat( bitNumVec, 0 );
//calculate permute and do loads
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
vs3_hi = vec_ld( 0, &src0[i] );
//vectorize!
for ( ; i+15 < count; i += 16 ) {
//load sources (floats); offsets 15/31/47/63 fetch consecutive 16-byte
//blocks (vec_ld truncates the address); the tail block is carried over
vs0_low = vs3_hi;
vs0_hi = vec_ld( 15, &src0[i] );
vs1_low = vs0_hi;
vs1_hi = vec_ld( 31, &src0[i] );
vs2_low = vs1_hi;
vs2_hi = vec_ld( 47, &src0[i] );
vs3_low = vs2_hi;
vs3_hi = vec_ld( 63, &src0[i] );
//permute into the vectors we want
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
//load dest (bytes) as unsigned char
vd = vec_ld( 0, &dst[i] );
// do comparison and get bool int result
vtbi0 = vec_cmple( vs0, constVec );
vtbi1 = vec_cmple( vs1, constVec );
vtbi2 = vec_cmple( vs2, constVec );
vtbi3 = vec_cmple( vs3, constVec );
// pack results into shorts
vtbs0 = vec_pack(vtbi0, vtbi1);
vtbs1 = vec_pack(vtbi2, vtbi3);
// pack results into byte
vtbc0 = vec_pack(vtbs0, vtbs1);
//and with 1 to get true=1 instead of true=255, then shift into position
vtuc0 = vec_and(vtbc0, oneVector);
vtuc0 = vec_sl(vtuc0, bitNumVec );
//or with original
vd = vec_or( vd, vtuc0 );
vec_st( vd, 0, &dst[i] );
}
//handle cleanup
for ( ; i < count ; i++ ) {
dst[i] |= ( src0[i] <= constant ) << bitNum;
}
}
#endif /* ENABLE_COMPARES */
#ifdef ENABLE_MINMAX
/*
============
idSIMD_AltiVec::MinMax
============
*/
void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
// Finds the smallest and largest value in src[0..count-1].
// The vector accumulators are seeded from the +/-INFINITY scalars below, so
// the vector and scalar paths always agree.
min = idMath::INFINITY; max = -idMath::INFINITY;
//#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
register vector float v0, v1, v2, v3;
register vector float maxVec, minVec, tempMin, tempMax;
register vector unsigned char permVec;
register vector float v0_low, v0_hi, v1_low, v1_hi;
vector unsigned char oneCharVector = (vector unsigned char)(1);
int i = 0;
if ( count >= 4 ) {
//calculate permute and do first load to
//get a starting point for min and max
permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
v1_hi = vec_ld( 0, &src[0] );
maxVec = loadSplatUnalignedScalar( &max );
minVec = loadSplatUnalignedScalar( &min );
//vectorize! 8 floats per pass; for 4 <= count < 8 the loop body never
//runs and the unchanged +/-INFINITY seeds are stored back -- harmless
for ( ; i+7 < count; i += 8 ) {
//load sources (rolling loads; offsets 15/31 fetch the next blocks)
v0_low = v1_hi;
v0_hi = vec_ld( 15, &src[i] );
v1_low = v0_hi;
v1_hi = vec_ld( 31, &src[i] );
v0 = vec_perm( v0_low, v0_hi, permVec );
v1 = vec_perm( v1_low, v1_hi, permVec );
// minimum
v2 = vec_min( v0, v1 );
minVec = vec_min( minVec, v2 );
// maximum
v3 = vec_max( v0, v1 );
maxVec = vec_max( maxVec, v3 );
}
//minVec and maxVec hold the min/max elements from the array, but now
//we need to reduce across lanes to get a single smallest/largest value
tempMin = minVec;
tempMax = maxVec;
// rotate vector around and compare to itself to find the real min/max
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
minVec = vec_splat( tempMin, 0 );
maxVec = vec_splat( tempMax, 0 );
vec_ste( minVec, 0, &min );
vec_ste( maxVec, 0, &max );
}
//cleanup: scalar loop for the tail (and for count < 4)
for ( ; i < count; i++ ) {
if ( src[i] < min ) {
min = src[i];
}
if ( src[i] > max ) {
max = src[i];
}
}
}
/*
============
idSIMD_AltiVec::MinMax
============
*/
void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
// Component-wise bounds of an idVec2 array. Packed idVec2s lay out as
// | X Y X Y | in each vector, so lanes 0/2 accumulate X bounds and lanes
// 1/3 accumulate Y bounds; the pairs are folded together at the end.
min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
idVec2 v;
int i = 0;
int j;
const float *srcPtr = src[0].ToFloatPtr();
register vector float vecLd1, vecLd2, vecLd3, vecLd4;
register vector float vecMin, vecMax;
register vector float v0, v1, v2, v3;
if ( count > 4 ) {
// FIX(review): the max accumulator was seeded with FLT_MIN -- the
// smallest *positive* normalized float -- which produced a wrong maximum
// whenever every component was below it (e.g. all negative). Seed with
// -FLT_MAX instead.
vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
vector float vecOld = vec_ld( 0, srcPtr );
// eight idVec2 (16 floats) per iteration
for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
// rolling loads; the trailing block is reused in the next pass
float *vecPtr = (float*)( srcPtr + (j*4) );
vector float v0, v1, v2, v3;
v0 = vecOld;
v1 = vec_ld( 15, vecPtr );
v2 = vec_ld( 31, vecPtr );
v3 = vec_ld( 47, vecPtr );
vecOld = vec_ld( 63, vecPtr );
vecLd1 = vec_perm( v0, v1, permVec );
vecLd2 = vec_perm( v1, v2, permVec );
vecLd3 = vec_perm( v2, v3, permVec );
vecLd4 = vec_perm( v3, vecOld, permVec );
// each of these vectors contains 2 elements
// looks like | X Y X Y | X Y X Y
v0 = vec_min( vecLd1, vecLd2 );
v1 = vec_min( vecLd3, vecLd4 );
v0 = vec_min( v0, v1 );
v2 = vec_max( vecLd1, vecLd2 );
v3 = vec_max( vecLd3, vecLd4 );
v2 = vec_max( v2, v3 );
// since its always X Y X Y we don't have to re-merge each time. we can
// wait until the end
vecMin = vec_min( v0, vecMin );
vecMax = vec_max( v2, vecMax );
}
// fold the two X/Y pairs together, then extract each component
vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMax, 0 );
v3 = vec_splat( vecMax, 1 );
vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &max[0] );
vec_ste( v3, 0, &max[1] );
}
// scalar cleanup for the remaining elements (and for count <= 4)
for ( ; i < count; i++ ) {
v = src[i];
if ( v[0] < min[0] ) {
min[0] = v[0];
}
if ( v[0] > max[0] ) {
max[0] = v[0];
}
if ( v[1] < min[1] ) {
min[1] = v[1];
}
if ( v[1] > max[1] ) {
max[1] = v[1];
}
}
}
/*
============
idSIMD_AltiVec::MinMax
============
*/
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
// Component-wise bounds of an idVec3 array. Four verts are processed per
// iteration; each is shifted into its own X Y Z (stray) vector, so lane 3
// of the accumulators holds a stray component and is never extracted.
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
int i = 0;
const float *srcPtr = src[0].ToFloatPtr();
idVec3 v;
register vector float vecLd1, vecLd2, vecLd3;
register vector float vecMin, vecMax;
register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
if ( count >= 4 ) {
// FIX(review): the max accumulator was seeded with FLT_MIN -- the
// smallest *positive* normalized float -- which produced a wrong maximum
// whenever every component was below it (e.g. all negative). Seed with
// -FLT_MAX instead.
vecMin = (vector float)(FLT_MAX);
vecMax = (vector float)(-FLT_MAX);
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
vector float vecOld = vec_ld( 0, srcPtr );
// 4 elements (48 bytes) at a time
for ( ; i+3 < count; i += 4 ) {
float *vecPtr = (float*)( srcPtr + (i*3) );
vector float v0, v1, v2;
v0 = vecOld;
v1 = vec_ld( 15, vecPtr );
v2 = vec_ld( 31, vecPtr );
vecOld = vec_ld( 47, vecPtr );
vecLd1 = vec_perm( v0, v1, permVec );
vecLd2 = vec_perm( v1, v2, permVec );
vecLd3 = vec_perm( v2, vecOld, permVec );
// shift each idVec3 into its own vector as X Y Z (stray)
vecSrc1 = vecLd1;
vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
// accumulate min and max
vecMin1 = vec_min( vecSrc1, vecSrc2 );
vecMin2 = vec_min( vecSrc3, vecSrc4 );
vecMin1 = vec_min( vecMin1, vecMin2 );
vecMin = vec_min( vecMin, vecMin1 );
vecMax1 = vec_max( vecSrc1, vecSrc2 );
vecMax2 = vec_max( vecSrc3, vecSrc4 );
vecMax1 = vec_max( vecMax1, vecMax2 );
vecMax = vec_max( vecMax1, vecMax );
}
// extract the three meaningful lanes and store them
vector float v0, v1, v2, v3, v4, v5;
v0 = vec_splat( vecMin, 0 );
v1 = vec_splat( vecMin, 1 );
v2 = vec_splat( vecMin, 2 );
v3 = vec_splat( vecMax, 0 );
v4 = vec_splat( vecMax, 1 );
v5 = vec_splat( vecMax, 2 );
vec_ste( v0, 0, &min[0] );
vec_ste( v1, 0, &min[1] );
vec_ste( v2, 0, &min[2] );
vec_ste( v3, 0, &max[0] );
vec_ste( v4, 0, &max[1] );
vec_ste( v5, 0, &max[2] );
}
// scalar cleanup for the remaining verts (and for count < 4)
for ( ; i < count; i ++ ) {
v = src[i];
if ( v[0] < min[0] ) {
min[0] = v[0];
}
if ( v[0] > max[0] ) {
max[0] = v[0];
}
if ( v[1] < min[1] ) {
min[1] = v[1];
}
if ( v[1] > max[1] ) {
max[1] = v[1];
}
if ( v[2] < min[2] ) {
min[2] = v[2];
}
if ( v[2] > max[2] ) {
max[2] = v[2];
}
}
}
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::MinMax

Computes the componentwise minimum and maximum of the xyz positions of an
array of idDrawVert's (unpadded layout: vertex data is not 16-byte aligned,
so each position is loaded through a per-vertex permute).

min		receives the smallest x, y, z encountered
max		receives the largest x, y, z encountered
src		array of vertices to scan
count	number of vertices in src
============
*/
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;

	idVec3 v;
	int i = 0;
	register vector float vecMin, vecMax;

	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector float vecMin1, vecMin2, vecMax1, vecMax2;

	if ( count >= 4 ) {
		// seed the running min with the largest float and the running max with the
		// most negative float. BUGFIX: this used to be FLT_MIN, which is the
		// smallest *positive* normalized float, yielding a wrong maximum whenever
		// every component in the array was negative.
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);

		// since idDrawVert has a fixed stride, the misalignment of each of the
		// four vertex slots repeats every iteration, so the permute vectors can
		// be computed once up front
		vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );

		for ( ; i+3 < count; i += 4) {
			const float *vertPtr = src[i].xyz.ToFloatPtr();
			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

			// two loads per vertex cover the 12 bytes of xyz (offset 11 reaches
			// the block holding the last byte), merged by the precomputed permute
			v0 = vec_ld( 0, vertPtr );
			v1 = vec_ld( 11, vertPtr );
			v2 = vec_ld( 0, vertPtr2 );
			v3 = vec_ld( 11, vertPtr2 );
			v4 = vec_ld( 0, vertPtr3 );
			v5 = vec_ld( 11, vertPtr3 );
			v6 = vec_ld( 0, vertPtr4 );
			v7 = vec_ld( 11, vertPtr4 );

			v0 = vec_perm( v0, v1, vertPerm1 );
			v2 = vec_perm( v2, v3, vertPerm2 );
			v4 = vec_perm( v4, v5, vertPerm3 );
			v6 = vec_perm( v6, v7, vertPerm4 );

			// fold the four positions into the running accumulators; only lanes
			// 0..2 (x, y, z) are stored out below, so lane 3 garbage is harmless
			vecMin1 = vec_min( v0, v2 );
			vecMin2 = vec_min( v4, v6 );
			vecMin1 = vec_min( vecMin1, vecMin2 );
			vecMin = vec_min( vecMin, vecMin1 );

			vecMax1 = vec_max( v0, v2 );
			vecMax2 = vec_max( v4, v6 );
			vecMax1 = vec_max( vecMax1, vecMax2 );
			vecMax = vec_max( vecMax, vecMax1 );
		}

		// now we have min/max vectors in X Y Z form, store out
		v0 = vec_splat( vecMin, 0 );
		v1 = vec_splat( vecMin, 1 );
		v2 = vec_splat( vecMin, 2 );
		v3 = vec_splat( vecMax, 0 );
		v4 = vec_splat( vecMax, 1 );
		v5 = vec_splat( vecMax, 2 );

		vec_ste( v0, 0, &min[0] );
		vec_ste( v1, 0, &min[1] );
		vec_ste( v2, 0, &min[2] );
		vec_ste( v3, 0, &max[0] );
		vec_ste( v4, 0, &max[1] );
		vec_ste( v5, 0, &max[2] );
	}

	// scalar cleanup for the leftover vertices (count < 4, or count % 4 != 0)
	for ( ; i < count; i++ ) {
		v = src[i].xyz;

		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}

		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}

		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
	}
}
#else
/*
============
idSIMD_AltiVec::MinMax

Computes the componentwise minimum and maximum of the xyz positions of an
array of idDrawVert's. DRAWVERT_PADDED build: each vertex is assumed to be
padded so xyz can be loaded with a single aligned vec_ld.

min		receives the smallest x, y, z encountered
max		receives the largest x, y, z encountered
src		array of vertices to scan
count	number of vertices in src
============
*/
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;

	idVec3 v;
	int i = 0;
	register vector float vecMin, vecMax;

	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector float vecMin1, vecMin2, vecMax1, vecMax2;

	if ( count >= 4 ) {
		// seed the running min with the largest float and the running max with the
		// most negative float. BUGFIX: this used to be FLT_MIN, which is the
		// smallest *positive* normalized float, yielding a wrong maximum whenever
		// every component in the array was negative.
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);

		for ( ; i+3 < count; i += 4) {
			const float *vertPtr = src[i].xyz.ToFloatPtr();
			const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
			const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
			const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

			// aligned loads; lane 3 holds whatever field follows xyz in the
			// padded vertex, which is harmless since only lanes 0..2 are stored
			v0 = vec_ld( 0, vertPtr );
			v2 = vec_ld( 0, vertPtr2 );
			v4 = vec_ld( 0, vertPtr3 );
			v6 = vec_ld( 0, vertPtr4 );

			// fold the four positions into the running accumulators
			vecMin1 = vec_min( v0, v2 );
			vecMin2 = vec_min( v4, v6 );
			vecMin1 = vec_min( vecMin1, vecMin2 );
			vecMin = vec_min( vecMin, vecMin1 );

			vecMax1 = vec_max( v0, v2 );
			vecMax2 = vec_max( v4, v6 );
			vecMax1 = vec_max( vecMax1, vecMax2 );
			vecMax = vec_max( vecMax, vecMax1 );
		}

		// now we have min/max vectors in X Y Z form, store out
		v0 = vec_splat( vecMin, 0 );
		v1 = vec_splat( vecMin, 1 );
		v2 = vec_splat( vecMin, 2 );
		v3 = vec_splat( vecMax, 0 );
		v4 = vec_splat( vecMax, 1 );
		v5 = vec_splat( vecMax, 2 );

		vec_ste( v0, 0, &min[0] );
		vec_ste( v1, 0, &min[1] );
		vec_ste( v2, 0, &min[2] );
		vec_ste( v3, 0, &max[0] );
		vec_ste( v4, 0, &max[1] );
		vec_ste( v5, 0, &max[2] );
	}

	// scalar cleanup for the leftover vertices (count < 4, or count % 4 != 0)
	for ( ; i < count; i++ ) {
		v = src[i].xyz;

		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}

		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}

		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
	}
}
#endif /* DRAWVERT_PADDED */
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::MinMax

Computes the componentwise minimum and maximum of the xyz positions of the
idDrawVert's selected by an index list (unpadded layout: the permute vectors
must be recomputed per vertex since indexed accesses have no fixed stride).

min		receives the smallest x, y, z encountered
max		receives the largest x, y, z encountered
src		vertex array
indexes	indices into src of the vertices to scan
count	number of indices
============
*/
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;

	idVec3 v;
	int i = 0;
	register vector float vecMin, vecMax;

	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector float vecMin1, vecMin2, vecMax1, vecMax2;

	if ( count >= 4 ) {
		// seed the running min with the largest float and the running max with the
		// most negative float. BUGFIX: this used to be FLT_MIN, which is the
		// smallest *positive* normalized float, yielding a wrong maximum whenever
		// every component in the array was negative.
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);

		vector unsigned char vertPerm1;
		vector unsigned char vertPerm2;
		vector unsigned char vertPerm3;
		vector unsigned char vertPerm4;

		for ( ; i+3 < count; i += 4) {
			const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
			const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
			const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
			const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();

			// per-vertex realignment permutes (indexed access, alignment varies)
			vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
			vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
			vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
			vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );

			// two loads per vertex cover the unaligned 12 bytes of xyz
			v0 = vec_ld( 0, vertPtr );
			v1 = vec_ld( 15, vertPtr );
			v2 = vec_ld( 0, vertPtr2 );
			v3 = vec_ld( 15, vertPtr2 );
			v4 = vec_ld( 0, vertPtr3 );
			v5 = vec_ld( 15, vertPtr3 );
			v6 = vec_ld( 0, vertPtr4 );
			v7 = vec_ld( 15, vertPtr4 );

			v0 = vec_perm( v0, v1, vertPerm1 );
			v2 = vec_perm( v2, v3, vertPerm2 );
			v4 = vec_perm( v4, v5, vertPerm3 );
			v6 = vec_perm( v6, v7, vertPerm4 );

			// fold the four positions into the running accumulators; only lanes
			// 0..2 (x, y, z) are stored out below, so lane 3 garbage is harmless
			vecMin1 = vec_min( v0, v2 );
			vecMin2 = vec_min( v4, v6 );
			vecMin1 = vec_min( vecMin1, vecMin2 );
			vecMin = vec_min( vecMin, vecMin1 );

			vecMax1 = vec_max( v0, v2 );
			vecMax2 = vec_max( v4, v6 );
			vecMax1 = vec_max( vecMax1, vecMax2 );
			vecMax = vec_max( vecMax, vecMax1 );
		}

		// now we have min/max vectors in X Y Z form, store out
		v0 = vec_splat( vecMin, 0 );
		v1 = vec_splat( vecMin, 1 );
		v2 = vec_splat( vecMin, 2 );
		v3 = vec_splat( vecMax, 0 );
		v4 = vec_splat( vecMax, 1 );
		v5 = vec_splat( vecMax, 2 );

		vec_ste( v0, 0, &min[0] );
		vec_ste( v1, 0, &min[1] );
		vec_ste( v2, 0, &min[2] );
		vec_ste( v3, 0, &max[0] );
		vec_ste( v4, 0, &max[1] );
		vec_ste( v5, 0, &max[2] );
	}

	// scalar cleanup for the leftover indices (count < 4, or count % 4 != 0)
	for ( ; i < count; i++ ) {
		v = src[indexes[i]].xyz;

		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}

		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}

		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
	}
}
#else
/*
============
idSIMD_AltiVec::MinMax

Computes the componentwise minimum and maximum of the xyz positions of the
idDrawVert's selected by an index list. DRAWVERT_PADDED build: each vertex
is assumed to be padded so xyz can be loaded with a single aligned vec_ld.

min		receives the smallest x, y, z encountered
max		receives the largest x, y, z encountered
src		vertex array
indexes	indices into src of the vertices to scan
count	number of indices
============
*/
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
	min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;

	idVec3 v;
	int i = 0;
	register vector float vecMin, vecMax;

	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector float vecMin1, vecMin2, vecMax1, vecMax2;

	if ( count >= 4 ) {
		// seed the running min with the largest float and the running max with the
		// most negative float. BUGFIX: this used to be FLT_MIN, which is the
		// smallest *positive* normalized float, yielding a wrong maximum whenever
		// every component in the array was negative.
		vecMin = (vector float)(FLT_MAX);
		vecMax = (vector float)(-FLT_MAX);

		vector unsigned char vertPerm1;
		vector unsigned char vertPerm2;
		vector unsigned char vertPerm3;
		vector unsigned char vertPerm4;

		for ( ; i+3 < count; i += 4) {
			const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
			const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
			const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
			const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();

			// aligned loads; lane 3 holds whatever field follows xyz in the
			// padded vertex, which is harmless since only lanes 0..2 are stored
			v0 = vec_ld( 0, vertPtr );
			v2 = vec_ld( 0, vertPtr2 );
			v4 = vec_ld( 0, vertPtr3 );
			v6 = vec_ld( 0, vertPtr4 );

			// fold the four positions into the running accumulators
			vecMin1 = vec_min( v0, v2 );
			vecMin2 = vec_min( v4, v6 );
			vecMin1 = vec_min( vecMin1, vecMin2 );
			vecMin = vec_min( vecMin, vecMin1 );

			vecMax1 = vec_max( v0, v2 );
			vecMax2 = vec_max( v4, v6 );
			vecMax1 = vec_max( vecMax1, vecMax2 );
			vecMax = vec_max( vecMax, vecMax1 );
		}

		// now we have min/max vectors in X Y Z form, store out
		v0 = vec_splat( vecMin, 0 );
		v1 = vec_splat( vecMin, 1 );
		v2 = vec_splat( vecMin, 2 );
		v3 = vec_splat( vecMax, 0 );
		v4 = vec_splat( vecMax, 1 );
		v5 = vec_splat( vecMax, 2 );

		vec_ste( v0, 0, &min[0] );
		vec_ste( v1, 0, &min[1] );
		vec_ste( v2, 0, &min[2] );
		vec_ste( v3, 0, &max[0] );
		vec_ste( v4, 0, &max[1] );
		vec_ste( v5, 0, &max[2] );
	}

	// scalar cleanup for the leftover indices (count < 4, or count % 4 != 0)
	for ( ; i < count; i++ ) {
		v = src[indexes[i]].xyz;

		if ( v[0] < min[0] ) {
			min[0] = v[0];
		}
		if ( v[0] > max[0] ) {
			max[0] = v[0];
		}

		if ( v[1] < min[1] ) {
			min[1] = v[1];
		}
		if ( v[1] > max[1] ) {
			max[1] = v[1];
		}

		if ( v[2] > max[2] ) {
			max[2] = v[2];
		}
		if ( v[2] < min[2] ) {
			min[2] = v[2];
		}
	}
}
#endif /* DRAWVERT_PADDED */
#endif /* ENABLE_MINMAX */
#ifdef ENABLE_CLAMP
/*
============
idSIMD_AltiVec::Clamp

Clamps each src element into [min, max] and writes it to dst:
dst[i] = src[i] < min ? min : src[i] > max ? max : src[i]

dst		output array (may start unaligned; a scalar loop aligns it)
src		input array (may be unaligned; realigned via vec_perm)
min		lower clamp bound
max		upper clamp bound
count	number of elements
============
*/
void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
//#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
	register vector float v0, v1, v2, v3, v4, v5;
	register vector unsigned char permVec;
	register vector float v0_low, v0_hi, v1_low, v1_hi;
	vector unsigned char oneVector = (vector unsigned char)(1);
	register vector float minVec, maxVec;
	int i = 0;

	// scalar loop until dst reaches a 16-byte boundary, so the vector
	// stores below can be aligned
	for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
	}

	//splat min/max into a vector
	minVec = loadSplatUnalignedScalar( &min );
	maxVec = loadSplatUnalignedScalar( &max );

	// permute for realigning the (possibly unaligned) src stream, and prime
	// the rolling load
	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
	v1_hi = vec_ld( 0, &src[i] );

	// 8 elements (2 vectors) per iteration; each iteration reuses the last
	// load of the previous one (v0_low = v1_hi)
	for ( ; i+7 < count; i += 8 ) {
		//load source
		v0_low = v1_hi;
		v0_hi = vec_ld( 15, &src[i] );
		v1_low = v0_hi;
		v1_hi = vec_ld( 31, &src[i] );

		v0 = vec_perm( v0_low, v0_hi, permVec );
		v1 = vec_perm( v1_low, v1_hi, permVec );

		// clamp from below with vec_max, then from above with vec_min
		v2 = vec_max( v0, minVec );
		v3 = vec_max( v1, minVec );

		v4 = vec_min( v2, maxVec );
		v5 = vec_min( v3, maxVec );

		ALIGNED_STORE2( &dst[i], v4, v5 );
	}

	// scalar cleanup for the leftover elements
	for ( ; i < count ; i++ ) {
		dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
	}
}
/*
============
idSIMD_AltiVec::ClampMin

Clamps each src element from below and writes it to dst:
dst[i] = src[i] < min ? min : src[i]

dst		output array (may start unaligned; a scalar loop aligns it)
src		input array (may be unaligned; realigned via vec_perm)
min		lower clamp bound
count	number of elements
============
*/
void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
//#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
	register vector float v0, v1, v2, v3;
	register vector unsigned char permVec;
	register vector float v0_low, v0_hi, v1_low, v1_hi;
	register vector float constVec;
	vector unsigned char oneVector = (vector unsigned char)(1);
	int i = 0;

	// scalar loop until dst reaches a 16-byte boundary, so the vector
	// stores below can be aligned
	for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src[i] < min ? min : src[i];
	}

	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &min );

	// permute for realigning the (possibly unaligned) src stream, and prime
	// the rolling load
	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
	v1_hi = vec_ld( 0, &src[i] );

	// 8 elements (2 vectors) per iteration, reusing the previous load
	for ( ; i+7 < count; i += 8 ) {
		//load source
		v0_low = v1_hi;
		v0_hi = vec_ld( 15, &src[i] );
		v1_low = v0_hi;
		v1_hi = vec_ld( 31, &src[i] );

		v0 = vec_perm( v0_low, v0_hi, permVec );
		v1 = vec_perm( v1_low, v1_hi, permVec );

		// vec_max against the splatted bound implements the lower clamp
		v2 = vec_max( v0, constVec );
		v3 = vec_max( v1, constVec );

		ALIGNED_STORE2( &dst[i], v2, v3 );
	}

	// scalar cleanup for the leftover elements
	for ( ; i < count ; i++ ) {
		dst[i] = src[i] < min ? min : src[i];
	}
}
/*
============
idSIMD_AltiVec::ClampMax

Clamps each src element from above and writes it to dst:
dst[i] = src[i] > max ? max : src[i]

dst		output array (may start unaligned; a scalar loop aligns it)
src		input array (may be unaligned; realigned via vec_perm)
max		upper clamp bound
count	number of elements
============
*/
void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
//#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
	register vector float v0, v1, v2, v3;
	register vector unsigned char permVec;
	register vector float constVec;
	register vector float v0_low, v0_hi, v1_low, v1_hi;
	vector unsigned char oneVector = (vector unsigned char)(1);
	int i = 0;

	// scalar loop until dst reaches a 16-byte boundary, so the vector
	// stores below can be aligned.
	// BUGFIX: this loop (and the cleanup loop below) used to compare with
	// '<', which implemented ClampMin semantics and disagreed with both the
	// reference macro above and the vec_min used in the vectorized loop.
	for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src[i] > max ? max : src[i];
	}

	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &max );

	// permute for realigning the (possibly unaligned) src stream, and prime
	// the rolling load
	permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
	v1_hi = vec_ld( 0, &src[i] );

	// 8 elements (2 vectors) per iteration, reusing the previous load
	for ( ; i+7 < count; i += 8 ) {
		//load source
		v0_low = v1_hi;
		v0_hi = vec_ld( 15, &src[i] );
		v1_low = v0_hi;
		v1_hi = vec_ld( 31, &src[i] );

		v0 = vec_perm( v0_low, v0_hi, permVec );
		v1 = vec_perm( v1_low, v1_hi, permVec );

		// vec_min against the splatted bound implements the upper clamp
		v2 = vec_min( v0, constVec );
		v3 = vec_min( v1, constVec );

		ALIGNED_STORE2( &dst[i], v2, v3 );
	}

	// scalar cleanup for the leftover elements
	for ( ; i < count ; i++ ) {
		dst[i] = src[i] > max ? max : src[i];
	}
}
#endif /* ENABLE_CLAMP */
#ifdef ENABLE_16ROUTINES
/*
============
idSIMD_AltiVec::Zero16

Zeroes count floats starting at dst. A plain memset is already the
fastest option here, so no vector code is needed.
============
*/
void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
	memset( dst, 0, sizeof( float ) * count );
}
/*
============
idSIMD_AltiVec::Negate16

Negates count floats in place: dst[i] = -dst[i].

Assumptions:
	dst is aligned

dst		array to negate, must be 16-byte aligned
count	number of elements; rounded up to the next multiple of 4, so up to
		3 floats of trailing padding in the buffer are negated as well
		(assumes the buffer was allocated in 16-byte multiples)
============
*/
void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
//#define OPER(X) ptr[(X)] ^= ( 1 << 31 )		// IEEE 32 bits float sign bit

	// dst is aligned
	assert( IS_16BYTE_ALIGNED( dst[0] ) );

	// round count up to next 4 if needbe
	int count2 = ( count + 3 ) & ~3;

	int i = 0;
	vector float v0, v1, v2, v3;

	// negation is done as ( 0 - x ) rather than flipping the sign bit as the
	// reference macro suggests; note this maps +0.0 to +0.0 instead of -0.0
	//know its 16-byte aligned
	for ( ; i + 7 < count2; i += 8 ) {
		v0 = vec_ld( 0, &dst[i] );
		v1 = vec_ld( 16, &dst[i] );

		v2 = vec_sub( (vector float)(0), v0 );
		v3 = vec_sub( (vector float)(0), v1 );

		ALIGNED_STORE2( &dst[i], v2, v3 );
	}

	// leftover group of 4 (count2 is a multiple of 4)
	for ( ; i < count2; i += 4 ) {
		v0 = vec_ld( 0, &dst[i] );
		v1 = vec_sub( (vector float)(0), v0 );
		vec_st( v1, 0, &dst[i] );
	}
}
/*
============
idSIMD_AltiVec::Copy16

Copies count floats from src to dst. Delegates to memcpy, which is
already optimal for a straight block copy.
============
*/
void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
	memcpy( dst, src, count * sizeof( float ) );
}
/*
============
idSIMD_AltiVec::Add16

Elementwise addition: dst[i] = src1[i] + src2[i].

Assumptions:
	Assumes dst, src1, src2 all start at aligned address

count	number of elements; rounded up to the next multiple of 4, so the
		buffers are assumed to be allocated in 16-byte multiples
============
*/
void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
//#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]

	// dst is aligned
	assert( IS_16BYTE_ALIGNED( dst[0] ) );
	// src1 is aligned
	assert( IS_16BYTE_ALIGNED( src1[0] ) );
	// src2 is aligned
	assert( IS_16BYTE_ALIGNED( src2[0] ) );

	// round count up to next 4 if needbe
	int count2 = ( count + 3 ) & ~3;

	register vector float v0, v1, v2, v3, v4, v5;
	int i = 0;

	// 8 elements (2 vectors) per iteration; everything is aligned so no
	// permutes are needed
	for ( ; i+7 < count2; i += 8 ) {
		//load sources
		v0 = vec_ld( 0, &src1[i] );
		v1 = vec_ld( 16, &src1[i] );
		v2 = vec_ld( 0, &src2[i] );
		v3 = vec_ld( 16, &src2[i] );
		v4 = vec_add( v0, v2 );
		v5 = vec_add( v1, v3 );

		ALIGNED_STORE2( &dst[i], v4, v5 );
	}

	// leftover group of 4 (count2 is a multiple of 4)
	for ( ; i < count2; i += 4 ) {
		v0 = vec_ld( 0, &src1[i] );
		v1 = vec_ld( 0, &src2[i] );
		v2 = vec_add( v0, v1 );
		vec_st( v2, 0, &dst[i] );
	}
}
/*
============
idSIMD_AltiVec::Sub16

Elementwise subtraction: dst[i] = src1[i] - src2[i].

Assumptions:
	Assumes that dst, src1, and src2 all start at aligned address

count	number of elements; rounded up to the next multiple of 4, so the
		buffers are assumed to be allocated in 16-byte multiples
============
*/
void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
//#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]

	// dst is aligned
	assert( IS_16BYTE_ALIGNED( dst[0] ) );
	// src1 is aligned
	assert( IS_16BYTE_ALIGNED( src1[0] ) );
	// src2 is aligned
	assert( IS_16BYTE_ALIGNED( src2[0] ) );

	// round count up to next 4 if needbe
	int count2 = ( count + 3 ) & ~3;

	register vector float v0, v1, v2, v3, v4, v5;
	int i = 0;

	// 8 elements (2 vectors) per iteration; everything is aligned so no
	// permutes are needed
	for ( ; i+7 < count2; i += 8 ) {
		//load sources
		v0 = vec_ld( 0, &src1[i] );
		v1 = vec_ld( 16, &src1[i] );
		v2 = vec_ld( 0, &src2[i] );
		v3 = vec_ld( 16, &src2[i] );
		v4 = vec_sub( v0, v2 );
		v5 = vec_sub( v1, v3 );

		ALIGNED_STORE2( &dst[i], v4, v5 );
	}

	// leftover group of 4 (count2 is a multiple of 4)
	for ( ; i < count2; i += 4 ) {
		v0 = vec_ld( 0, &src1[i] );
		v1 = vec_ld( 0, &src2[i] );
		v2 = vec_sub( v0, v1 );
		vec_st( v2, 0, &dst[i] );
	}
}
/*
============
idSIMD_AltiVec::Mul16

Scales an array by a constant: dst[i] = src1[i] * constant.

Assumptions:
	Assumes that dst and src1 start at aligned address

count	number of elements; rounded up to the next multiple of 4, so the
		buffers are assumed to be allocated in 16-byte multiples
============
*/
void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
//#define OPER(X) dst[(X)] = src1[(X)] * constant

	// dst is aligned
	assert( IS_16BYTE_ALIGNED( dst[0] ) );
	// src1 is aligned
	assert( IS_16BYTE_ALIGNED( src1[0] ) );

	// round count up to next 4 if needbe
	int count2 = ( count + 3 ) & ~3;

	register vector float v0, v1, v2, v3;
	register vector float constVec;
	// AltiVec has no plain vector multiply, so multiply-add with a zero
	// addend is used instead
	register vector float zeroVector = (vector float)(0.0);
	int i = 0;

	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &constant );

	// 8 elements (2 vectors) per iteration
	for ( ; i+7 < count2; i += 8 ) {
		//load source
		v0 = vec_ld( 0, &src1[i] );
		v1 = vec_ld( 16, &src1[i] );
		v2 = vec_madd( constVec, v0, zeroVector );
		v3 = vec_madd( constVec, v1, zeroVector );
		ALIGNED_STORE2( &dst[i], v2, v3 );
	}

	// leftover group of 4 (count2 is a multiple of 4)
	for ( ; i < count2; i += 4 ) {
		v0 = vec_ld( 0, &src1[i] );
		v1 = vec_madd( constVec, v0, zeroVector );
		vec_st( v1, 0, &dst[i] );
	}
}
/*
============
idSIMD_AltiVec::AddAssign16

In-place elementwise addition: dst[i] += src[i].

Assumptions:
	Assumes that dst and src start at aligned address

count	number of elements; rounded up to the next multiple of 4, so the
		buffers are assumed to be allocated in 16-byte multiples
============
*/
void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
//#define OPER(X) dst[(X)] += src[(X)]

	// dst is aligned
	assert( IS_16BYTE_ALIGNED( dst[0] ) );
	// src is aligned
	assert( IS_16BYTE_ALIGNED( src[0] ) );

	// round count up to next 4 if needbe
	int count2 = ( count + 3 ) & ~3;

	register vector float v0, v1, v2, v3, v4, v5;
	int i = 0;

	// 8 elements (2 vectors) per iteration
	for ( ; i+7 < count2; i += 8 ) {
		v0 = vec_ld( 0, &src[i] );
		v1 = vec_ld( 16, &src[i] );
		v2 = vec_ld( 0, &dst[i] );
		v3 = vec_ld( 16, &dst[i] );
		v4 = vec_add( v0, v2 );
		v5 = vec_add( v1, v3 );
		ALIGNED_STORE2( &dst[i], v4, v5 );
	}

	// leftover group of 4 (count2 is a multiple of 4)
	for ( ; i < count2; i += 4 ) {
		v0 = vec_ld( 0, &src[i] );
		v1 = vec_ld( 0, &dst[i] );
		v2 = vec_add( v0, v1 );
		vec_st( v2, 0, &dst[i] );
	}
}
/*
============
idSIMD_AltiVec::SubAssign16

In-place elementwise subtraction: dst[i] -= src[i].

Assumptions:
	Assumes that dst and src start at aligned address

count	number of elements; rounded up to the next multiple of 4, so the
		buffers are assumed to be allocated in 16-byte multiples
============
*/
void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
//#define OPER(X) dst[(X)] -= src[(X)]
	register vector float v0, v1, v2, v3, v4, v5;
	int i=0;

	// dst is aligned
	assert( IS_16BYTE_ALIGNED( dst[0] ) );
	// src is aligned
	assert( IS_16BYTE_ALIGNED( src[0] ) );

	// round count up to next 4 if needbe
	int count2 = ( count + 3 ) & ~3;

	// 8 elements (2 vectors) per iteration; note operand order: dst - src
	for ( ; i+7 < count2; i += 8 ) {
		v0 = vec_ld( 0, &src[i] );
		v1 = vec_ld( 16, &src[i] );
		v2 = vec_ld( 0, &dst[i] );
		v3 = vec_ld( 16, &dst[i] );
		v4 = vec_sub( v2, v0 );
		v5 = vec_sub( v3, v1 );
		ALIGNED_STORE2( &dst[i], v4, v5 );
	}

	// leftover group of 4 (count2 is a multiple of 4)
	for ( ; i < count2; i += 4 ) {
		v0 = vec_ld( 0, &src[i] );
		v1 = vec_ld( 0, &dst[i] );
		v2 = vec_sub( v1, v0 );
		vec_st( v2, 0, &dst[i] );
	}
}
/*
============
idSIMD_AltiVec::MulAssign16

In-place scaling by a constant: dst[i] *= constant.

Assumptions:
	Assumes that dst starts at aligned address and count is multiple of 4

count	number of elements; rounded up to the next multiple of 4, so the
		buffer is assumed to be allocated in 16-byte multiples
============
*/
void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
//#define OPER(X) dst[(X)] *= constant

	// dst is aligned
	assert( IS_16BYTE_ALIGNED( dst[0] ) );

	// round count up to next 4 if needbe
	int count2 = ( count + 3 ) & ~3;

	register vector float v0, v1, v2, v3;
	register vector float constVec;
	int i = 0;
	// AltiVec has no plain vector multiply, so multiply-add with a zero
	// addend is used instead
	register vector float zeroVector = (vector float)(0.0);

	//splat constant into a vector
	constVec = loadSplatUnalignedScalar( &constant );

	// 8 elements (2 vectors) per iteration
	for ( ; i+7 < count2; i += 8 ) {
		v0 = vec_ld( 0, &dst[i] );
		v1 = vec_ld( 16, &dst[i] );
		v2 = vec_madd( v0, constVec, zeroVector );
		v3 = vec_madd( v1, constVec, zeroVector );
		ALIGNED_STORE2( &dst[i], v2, v3 );
	}

	// leftover group of 4 (count2 is a multiple of 4)
	for ( ; i < count2; i += 4 ) {
		v0 = vec_ld( 0, &dst[i] );
		v1 = vec_madd( v0, constVec, zeroVector );
		vec_st( v1, 0, &dst[i] );
	}
}
#endif /* ENABLE_16ROUTINES */
#ifdef ENABLE_LOWER_TRIANGULAR
/*
============
idSIMD_AltiVec::MatX_LowerTriangularSolve

solves x in L * x = b for the first n rows of L
if skip > 0 the first skip elements of x are assumed to be valid already
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed

Forward substitution: row i computes x[i] = b[i] - dot( L[i][0..i-1], x[0..i-1] ).
The main loop handles 4 rows at a time; within each row group, the dot
products over the already-solved prefix of x are vectorized 8 floats at a
time using unaligned loads realigned through vec_perm.
============
*/
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {

	int i, j;
	const float *lptr;
	const float *lptr2;
	const float *lptr3;
	const float *lptr4;
	float sum;
	float sum2;
	float sum3;
	float sum4;
	float tempSum;
	float tempSum2;
	float tempSum3;
	float tempSum4;
	vector float vecSum1 = (vector float)(0.0);
	vector float vecSum2 = (vector float)(0.0);
	vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
	vector float zeroVector = (vector float)(0.0);
	vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;

	// permute for realigning the (possibly unaligned) x array; x's alignment
	// is fixed for the whole call, so this is computed once
	vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );

	// unrolled this loop a bit: 4 rows of L per iteration
	for ( i = skip; i+3 < n; i+=4 ) {
		sum = b[i];
		sum2 = b[i+1];
		sum3 = b[i+2];
		sum4 = b[i+3];

		vecSum1 = zeroVector;
		vecSum2 = zeroVector;
		vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;

		lptr = L[i];
		lptr2 = L[i+1];
		lptr3 = L[i+2];
		lptr4 = L[i+3];

		// each matrix row can have a different alignment
		vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
		vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
		vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
		vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );

		// accumulate the dot products of 4 rows against x, 8 columns at a time,
		// into two partial-sum vectors per row
		for ( j = 0 ; j+7 < i; j+=8 ) {

			v0 = vec_ld( 0, &x[j] );
			v1 = vec_ld( 15, &x[j] );
			vector float vecExtraX = vec_ld( 31, &x[j] );
			v0 = vec_perm( v0, v1, vecPermX );
			v1 = vec_perm( v1, vecExtraX, vecPermX );

			v2 = vec_ld( 0, lptr + j );
			v3 = vec_ld( 15, lptr + j );
			vector float vecExtra1 = vec_ld( 31, lptr + j );
			v2 = vec_perm( v2, v3, vecPermLptr1 );
			v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );

			v4 = vec_ld( 0, lptr2 + j );
			v5 = vec_ld( 15, lptr2 + j );
			vector float vecExtra2 = vec_ld( 31, lptr2 + j );
			v4 = vec_perm( v4, v5, vecPermLptr2 );
			v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );

			v6 = vec_ld( 0, lptr3 + j );
			v7 = vec_ld( 15, lptr3 + j );
			vector float vecExtra3 = vec_ld( 31, lptr3 + j );
			v6 = vec_perm( v6, v7, vecPermLptr3 );
			v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );

			v8 = vec_ld( 0, lptr4 + j );
			v9 = vec_ld( 15, lptr4 + j );
			vector float vecExtra4 = vec_ld( 31, lptr4 + j );
			v8 = vec_perm( v8, v9, vecPermLptr4 );
			v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );

			vecSum1 = vec_madd( v2, v0, vecSum1 );
			vecSum2 = vec_madd( v3, v1, vecSum2 );

			vecSum3 = vec_madd( v4, v0, vecSum3 );
			vecSum4 = vec_madd( v5, v1, vecSum4 );

			vecSum5 = vec_madd( v6, v0, vecSum5 );
			vecSum6 = vec_madd( v7, v1, vecSum6 );

			vecSum7 = vec_madd( v8, v0, vecSum7 );
			vecSum8 = vec_madd( v9, v1, vecSum8 );
		}

		// if we ran the unrolled code, we need to sum accross the vectors
		// to find out how much to subtract from sum
		if ( j > 0 ) {
			vecSum1 = vec_add( vecSum1, vecSum2 );
			vecSum3 = vec_add( vecSum3, vecSum4 );
			vecSum5 = vec_add( vecSum5, vecSum6 );
			vecSum7 = vec_add( vecSum7, vecSum8 );

			//sum accross the vectors (log2 reduction via rotate-and-add)
			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );

			vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
			vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );

			vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
			vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );

			vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
			vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );

			//move the result to the FPU
			vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
			vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
			vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
			vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );

			sum -= tempSum;
			sum2 -= tempSum2;
			sum3 -= tempSum3;
			sum4 -= tempSum4;
		}

		//cleanup: scalar tail of each dot product
		for ( ; j < i; j++ ) {
			sum -= lptr[j] * x[j];
			sum2 -= lptr2[j] * x[j];
			sum3 -= lptr3[j] * x[j];
			sum4 -= lptr4[j] * x[j];
		}

		// resolve the dependencies between the 4 rows: each later row still
		// needs the x values just computed for the earlier rows in this group
		sum2 -= ( lptr2[i] * sum );
		sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
		sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );

		x[i] = sum;
		x[i+1] = sum2;
		x[i+2] = sum3;
		x[i+3] = sum4;
	}

	// cleanup: remaining rows, one at a time (same scheme, single row)
	for ( ; i < n; i++ ) {
		sum = b[i];
		vecSum1 = zeroVector;
		vecSum2 = zeroVector;
		lptr = L[i];
		vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );

		for ( j = 0 ; j+7 < i; j+=8 ) {

			v0 = vec_ld( 0, &x[j] );
			v2 = vec_ld( 15, &x[j] );
			vector float vecExtraX = vec_ld( 31, &x[j] );
			v0 = vec_perm( v0, v2, vecPermX );
			v2 = vec_perm( v2, vecExtraX, vecPermX );

			v1 = vec_ld( 0, lptr + j );
			v3 = vec_ld( 15, lptr + j );
			vector float vecExtra = vec_ld( 31, lptr + j );
			v1 = vec_perm( v1, v3, vecPermLptr );
			v3 = vec_perm( v3, vecExtra, vecPermLptr );

			vecSum1 = vec_madd( v1, v0, vecSum1 );
			vecSum2 = vec_madd( v3, v2, vecSum2 );
		}

		// if we ran the unrolled code, we need to sum accross the vectors
		// to find out how much to subtract from sum
		if ( j > 0 ) {
			//sum accross the vectors
			vecSum1 = vec_add( vecSum1, vecSum2 );
			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
			vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );

			//move the result to the FPU
			vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
			sum -= tempSum;
		}

		//cleanup: scalar tail of the dot product
		for ( ; j < i; j++ ) {
			sum -= lptr[j] * x[j];
		}
		x[i] = sum;
	}
}
/*
============
idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose

solves x in L.Transpose() * x = b for the first n rows of L
L has to be a lower triangular matrix with (implicit) ones on the diagonal
x == b is allowed

Back substitution from the last row up. Fully unrolled closed forms for
n < 8; otherwise a scalar loop that processes 4 rows at a time in 4x4
blocks (no AltiVec used here).
============
*/
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {

	int nc;
	const float *lptr;

	lptr = L.ToFloatPtr();
	nc = L.GetNumColumns();

	float x0, x1, x2, x3, x4, x5, x6;
	// unrolled cases for n < 8
	if ( n < 8 ) {
		switch( n ) {
			// using local variables to avoid aliasing issues (x == b is allowed)
			case 0:
				return;
			case 1:
				x[0] = b[0];
				return;
			case 2:
				x1 = b[1];
				x0 = b[0] - lptr[1*nc+0] * x1;

				x[1] = x1;
				x[0] = x0;
				return;
			case 3:
				x2 = b[2];
				x1 = b[1] - lptr[2*nc+1] * x2;
				x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

				x[2] = x2;
				x[1] = x1;
				x[0] = x0;
				return;
			case 4:
				x3 = b[3];
				x2 = b[2] - lptr[3*nc+2] * x3;
				x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
				x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

				x[3] = x3;
				x[2] = x2;
				x[1] = x1;
				x[0] = x0;
				return;
			case 5:
				x4 = b[4];
				x3 = b[3] - lptr[4*nc+3] * x4;
				x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
				x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
				x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

				x[4] = x4;
				x[3] = x3;
				x[2] = x2;
				x[1] = x1;
				x[0] = x0;
				return;
			case 6:
				x5 = b[5];
				x4 = b[4] - lptr[5*nc+4] * x5;
				x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
				x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
				x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
				x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

				x[5] = x5;
				x[4] = x4;
				x[3] = x3;
				x[2] = x2;
				x[1] = x1;
				x[0] = x0;
				return;
			case 7:
				x6 = b[6];
				x5 = b[5] - lptr[6*nc+5] * x6;
				x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
				x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
				x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
				x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
				x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;

				x[6] = x6;
				x[5] = x5;
				x[4] = x4;
				x[3] = x3;
				x[2] = x2;
				x[1] = x1;
				x[0] = x0;
				return;
		}
		return;
	}

	int i, j;
	register float s0, s1, s2, s3;
	float *xptr;

	// lptr points at the 4-column block ending at column n of row n; it walks
	// up-left by 4 rows/columns each outer iteration
	lptr = L.ToFloatPtr() + n * nc + n - 4;
	xptr = x + n;

	// process 4 rows at a time
	for ( i = n; i >= 4; i -= 4 ) {
		s0 = b[i-4];
		s1 = b[i-3];
		s2 = b[i-2];
		s3 = b[i-1];
		// process 4x4 blocks: subtract the contribution of the already-solved
		// x values (transposed access: column index selects the row sum)
		for ( j = 0; j < n-i; j += 4 ) {
			s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
			s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
			s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
			s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
			s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
			s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
			s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
			s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
			s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
			s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
			s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
			s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
			s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
			s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
			s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
			s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
		}
		// process left over of the 4 rows: resolve the dependencies between
		// the four unknowns of this group (s3 first, then s2, s1, s0)
		s0 -= lptr[0-1*nc] * s3;
		s1 -= lptr[1-1*nc] * s3;
		s2 -= lptr[2-1*nc] * s3;
		s0 -= lptr[0-2*nc] * s2;
		s1 -= lptr[1-2*nc] * s2;
		s0 -= lptr[0-3*nc] * s1;
		// store result
		xptr[-4] = s0;
		xptr[-3] = s1;
		xptr[-2] = s2;
		xptr[-1] = s3;
		// update pointers for next four rows
		lptr -= 4 + 4 * nc;
		xptr -= 4;
	}
	// process left over rows (i < 4 remaining at the top of the matrix)
	for ( i--; i >= 0; i-- ) {
		s0 = b[i];
		lptr = L[0] + i;
		for ( j = i + 1; j < n; j++ ) {
			s0 -= lptr[j*nc] * x[j];
		}
		x[i] = s0;
	}
}
/*
============
idSIMD_AltiVec::MatX_LDLTFactor
============
*/
bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
int i, j, k, nc;
float *v, *diag, *mptr;
float s0, s1, s2, s3, sum, d;
float s0_2, s1_2, s2_2, s3_2, sum_2;
float *mptr2;
v = (float *) _alloca16( n * sizeof( float ) );
diag = (float *) _alloca16( n * sizeof( float ) );
nc = mat.GetNumColumns();
if ( n <= 0 ) {
return true;
}
mptr = mat[0];
sum = mptr[0];
if ( sum == 0.0f ) {
return false;
}
diag[0] = sum;
invDiag[0] = d = 1.0f / sum;
if ( n <= 1 ) {
return true;
}
mptr = mat[0];
for ( j = 1; j < n; j++ ) {
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
}
mptr = mat[1];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
sum = mptr[1] - s0;
if ( sum == 0.0f ) {
return false;
}
mat[1][1] = sum;
diag[1] = sum;
invDiag[1] = d = 1.0f / sum;
if ( n <= 2 ) {
return true;
}
mptr = mat[0];
for ( j = 2; j < n; j++ ) {
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
}
mptr = mat[2];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
sum = mptr[2] - s0 - s1;
if ( sum == 0.0f ) {
return false;
}
mat[2][2] = sum;
diag[2] = sum;
invDiag[2] = d = 1.0f / sum;
if ( n <= 3 ) {
return true;
}
mptr = mat[0];
for ( j = 3; j < n; j++ ) {
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
}
mptr = mat[3];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
sum = mptr[3] - s0 - s1 - s2;
if ( sum == 0.0f ) {
return false;
}
mat[3][3] = sum;
diag[3] = sum;
invDiag[3] = d = 1.0f / sum;
if ( n <= 4 ) {
return true;
}
mptr = mat[0];
for ( j = 4; j < n; j++ ) {
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
}
for ( i = 4; i < n; i++ ) {
mptr = mat[i];
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
for ( k = 4; k < i-3; k += 4 ) {
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
}
switch( i - k ) {
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
}
sum = s3;
sum += s2;
sum += s1;
sum += s0;
sum = mptr[i] - sum;
if ( sum == 0.0f ) {
return false;
}
mat[i][i] = sum;
diag[i] = sum;
invDiag[i] = d = 1.0f / sum;
if ( i + 1 >= n ) {
return true;
}
// unrolling madness!
mptr = mat[i+1];
mptr2 = mat[i+1] + nc;
for ( j = i+1; j+1 < n; j+=2 ) {
s0 = mptr[0] * v[0];
s1 = mptr[1] * v[1];
s2 = mptr[2] * v[2];
s3 = mptr[3] * v[3];
s0_2 = mptr2[0] * v[0];
s1_2 = mptr2[1] * v[1];
s2_2 = mptr2[2] * v[2];
s3_2 = mptr2[3] * v[3];
for ( k = 4; k < i-7; k += 8 ) {
s0 += mptr[k+0] * v[k+0];
s1 += mptr[k+1] * v[k+1];
s2 += mptr[k+2] * v[k+2];
s3 += mptr[k+3] * v[k+3];
s0 += mptr[k+4] * v[k+4];
s1 += mptr[k+5] * v[k+5];
s2 += mptr[k+6] * v[k+6];
s3 += mptr[k+7] * v[k+7];
s0_2 += mptr2[k+0] * v[k+0];
s1_2 += mptr2[k+1] * v[k+1];
s2_2 += mptr2[k+2] * v[k+2];
s3_2 += mptr2[k+3] * v[k+3];
s0_2 += mptr2[k+4] * v[k+4];
s1_2 += mptr2[k+5] * v[k+5];
s2_2 += mptr2[k+6] * v[k+6];
s3_2 += mptr2[k+7] * v[k+7];
}
switch( i - k ) {
case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
}
// disassociate these adds
s3 += s2;
s1 += s0;
sum = s1 + s3;
s3_2 += s2_2;
s1_2 += s0_2;
sum_2 = s1_2 + s3_2;
mptr[i] = ( mptr[i] - sum ) * d;
mptr2[i] = ( mptr2[i] - sum_2 ) * d;
mptr += nc*2;
mptr2 += nc*2;
}
// cleanup
for ( ; j < n; j++ ) {
s0 = mptr[0] * v[0];
s1 = mptr[1] * v[1];
s2 = mptr[2] * v[2];
s3 = mptr[3] * v[3];
for ( k = 4; k < i-7; k += 8 ) {
s0 += mptr[k+0] * v[k+0];
s1 += mptr[k+1] * v[k+1];
s2 += mptr[k+2] * v[k+2];
s3 += mptr[k+3] * v[k+3];
s0 += mptr[k+4] * v[k+4];
s1 += mptr[k+5] * v[k+5];
s2 += mptr[k+6] * v[k+6];
s3 += mptr[k+7] * v[k+7];
}
switch( i - k ) {
case 7: s0 += mptr[k+6] * v[k+6];
case 6: s1 += mptr[k+5] * v[k+5];
case 5: s2 += mptr[k+4] * v[k+4];
case 4: s3 += mptr[k+3] * v[k+3];
case 3: s0 += mptr[k+2] * v[k+2];
case 2: s1 += mptr[k+1] * v[k+1];
case 1: s2 += mptr[k+0] * v[k+0];
}
// disassociate these adds
s3 += s2;
s1 += s0;
sum = s1 + s3;
mptr[i] = ( mptr[i] - sum ) * d;
mptr += nc;
}
}
return true;
}
#endif /* ENABLE_LOWER_TRIANGULAR */
#ifdef LIVE_VICARIOUSLY
/*
============
idSIMD_AltiVec::BlendJoints

Blends each joint listed in index[] from joints[] towards the matching entry in
blendJoints[] by the fraction lerp: the quaternion part is spherically
interpolated (slerp) and the translation part linearly interpolated (lerp).
Four joints are processed per iteration in SoA form with AltiVec; any leftover
joints fall through to the scalar Slerp/Lerp cleanup loop at the bottom.
============
*/
void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
int i;
// since lerp is a constant, we can special case the two cases if they're true
if ( lerp <= 0.0f ) {
// this sets joints back to joints. No sense in doing no work, so just return
return;
}
if ( lerp >= 1.0f ) {
// this copies each q from blendJoints to joints and copies each t from blendJoints to joints
// NOTE(review): this copies all numJoints entries contiguously and ignores index[] --
// assumes callers pass an index covering every joint in order; verify against callers
memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
return;
}
vector float vecLerp = loadSplatUnalignedScalar( &lerp );
vector float zeroVector = (vector float)(0);
for ( i = 0; i+3 < numJoints; i+=4 ) {
int j = index[i];
int j2 = index[i+1];
int j3 = index[i+2];
int j4 = index[i+3];
// slerp
const float *jointPtr = joints[j].q.ToFloatPtr();
const float *blendPtr = blendJoints[j].q.ToFloatPtr();
const float *jointPtr2 = joints[j2].q.ToFloatPtr();
const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
const float *jointPtr3 = joints[j3].q.ToFloatPtr();
const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
const float *jointPtr4 = joints[j4].q.ToFloatPtr();
const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
// vec_lvsl( -1, p ) + 1 is the standard AltiVec permute mask for merging two
// aligned vec_ld's into one possibly-misaligned 16-byte load
vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
vector float v12, v13, v14, v15, v16;
vector float vecFromX, vecFromY, vecFromZ, vecFromW;
vector float vecToX, vecToY, vecToZ, vecToW;
// load up the idJointQuats from joints
v0 = vec_ld( 0, jointPtr );
v1 = vec_ld( 15, jointPtr );
v2 = vec_perm( v0, v1, permVec );
v3 = vec_ld( 0, jointPtr2 );
v4 = vec_ld( 15, jointPtr2 );
v5 = vec_perm( v3, v4, permVec2 );
v6 = vec_ld( 0, jointPtr3 );
v7 = vec_ld( 15, jointPtr3 );
v8 = vec_perm( v6, v7, permVec3 );
v9 = vec_ld( 0, jointPtr4 );
v10 = vec_ld( 15, jointPtr4 );
v11 = vec_perm( v9, v10, permVec4 );
// planarizing, so put each x y z w into its own vector (AoS -> SoA transpose)
v0 = vec_mergeh( v2, v8 );
v1 = vec_mergeh( v5, v11 );
v3 = vec_mergel( v2, v8 );
v4 = vec_mergel( v5, v11 );
vecFromX = vec_mergeh( v0, v1 );
vecFromY = vec_mergel( v0, v1 );
vecFromZ = vec_mergeh( v3, v4 );
vecFromW = vec_mergel( v3, v4 );
// load up idJointQuats from blendJoints
v5 = vec_ld( 0, blendPtr );
v6 = vec_ld( 15, blendPtr );
v7 = vec_perm( v5, v6, permVec5 );
v8 = vec_ld( 0, blendPtr2 );
v9 = vec_ld( 15, blendPtr2 );
v10 = vec_perm( v8, v9, permVec6 );
v11 = vec_ld( 0, blendPtr3 );
v12 = vec_ld( 15, blendPtr3 );
v13 = vec_perm( v11, v12, permVec7 );
v14 = vec_ld( 0, blendPtr4 );
v15 = vec_ld( 15, blendPtr4 );
v16 = vec_perm( v14, v15, permVec8 );
// put these into their own vectors too
v5 = vec_mergeh( v7, v13 );
v6 = vec_mergeh( v10, v16 );
v8 = vec_mergel( v7, v13 );
v9 = vec_mergel( v10, v16 );
vecToX = vec_mergeh( v5, v6 );
vecToY = vec_mergel( v5, v6 );
vecToZ = vec_mergeh( v8, v9 );
vecToW = vec_mergel( v8, v9 );
// calculate cosom ( = dot product of the four from/to quaternion pairs )
vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
// if cosom is < 0, negate it and set temp to negated elements in to. otherwise, set temp to
// to
vector bool int vecCmp, vecCmp2;
vecCmp = vec_cmplt( vecCosom, zeroVector );
// negate if needed
vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
// check if we need to calculate scale ( only when 1 - cosom > 1e-6, i.e. the
// quaternions are far enough apart that plain lerp would be inaccurate )
vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
vector float vecScale1 = vec_splat( vecLerp, 0 );
// vecWork1 = 1 - cosom^2; vecWork1 * rsqrt(vecWork1) = sqrt(1 - cosom^2) = sinom;
// vecWork3 = omega = atan( sinom, cosom )
vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
// slerp weights: sin( (1-lerp)*omega ) / sinom and sin( lerp*omega ) / sinom
vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
// see which ones we have to insert into our scale0 and scale1 vectors
vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
// multiply each element by the scale
vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
// multiply temp by scale and add to result
vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
// do a transform again to get the results back to vectors we can store out (SoA -> AoS)
v5 = vec_mergeh( vecFromX, vecFromZ );
v6 = vec_mergeh( vecFromY, vecFromW );
v8 = vec_mergel( vecFromX, vecFromZ );
v9 = vec_mergel( vecFromY, vecFromW );
vecToX = vec_mergeh( v5, v6 );
vecToY = vec_mergel( v5, v6 );
vecToZ = vec_mergeh( v8, v9 );
vecToW = vec_mergel( v8, v9 );
// vec_lvsr masks rotate each result so the four vec_ste's below scatter it to
// the (possibly misaligned) destination one float at a time
vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
// right rotate the input data
vecToX = vec_perm( vecToX, vecToX, storePerm1 );
vecToY = vec_perm( vecToY, vecToY, storePerm2 );
vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
vecToW = vec_perm( vecToW, vecToW, storePerm4 );
vec_ste( vecToX, 0, (float*) jointPtr );
vec_ste( vecToX, 4, (float*) jointPtr );
vec_ste( vecToX, 8, (float*) jointPtr );
vec_ste( vecToX, 12, (float*) jointPtr );
vec_ste( vecToY, 0, (float*) jointPtr2 );
vec_ste( vecToY, 4, (float*) jointPtr2 );
vec_ste( vecToY, 8, (float*) jointPtr2 );
vec_ste( vecToY, 12, (float*) jointPtr2 );
vec_ste( vecToZ, 0, (float*) jointPtr3 );
vec_ste( vecToZ, 4, (float*) jointPtr3 );
vec_ste( vecToZ, 8, (float*) jointPtr3 );
vec_ste( vecToZ, 12, (float*) jointPtr3 );
vec_ste( vecToW, 0, (float*) jointPtr4 );
vec_ste( vecToW, 4, (float*) jointPtr4 );
vec_ste( vecToW, 8, (float*) jointPtr4 );
vec_ste( vecToW, 12, (float*) jointPtr4 );
// lerp is v1 + l * ( v2 - v1 );
// the idVec3 T is going to be 12 bytes after the Q, so we can do this without calling ToFloatPtr() again. since its
// 4 floats past the quaternion pointer we can just offset the pointer instead
// ( assumes idJointQuat layout is q (4 floats) immediately followed by t -- TODO confirm )
float *jointVecPtr = (float*)( jointPtr + 4 );
float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
v0 = vec_ld( 0, jointVecPtr );
v1 = vec_ld( 11, jointVecPtr );
vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
v2 = vec_ld( 0, jointVecPtr2 );
v3 = vec_ld( 11, jointVecPtr2 );
vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
v4 = vec_ld( 0, jointVecPtr3 );
v5 = vec_ld( 11, jointVecPtr3 );
vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
v6 = vec_ld( 0, jointVecPtr4 );
v7 = vec_ld( 11, jointVecPtr4 );
vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
vector float vecVecX, vecVecY, vecVecZ;
vecVecX = vecVecY = vecVecZ = zeroVector;
// planarize
v0 = vec_mergeh( vecLd1, vecLd3 );
v1 = vec_mergeh( vecLd2, vecLd4 );
v3 = vec_mergel( vecLd1, vecLd3 );
v4 = vec_mergel( vecLd2, vecLd4 );
vecVecX = vec_mergeh( v0, v1 );
vecVecY = vec_mergel( v0, v1 );
vecVecZ = vec_mergeh( v3, v4 );
// load blend joint idvec3's
float *blendVecPtr = (float*)( blendPtr + 4 );
float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
v0 = vec_ld( 0, blendVecPtr );
v1 = vec_ld( 11, blendVecPtr );
vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
v2 = vec_ld( 0, blendVecPtr2 );
v3 = vec_ld( 11, blendVecPtr2 );
vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
v4 = vec_ld( 0, blendVecPtr3 );
v5 = vec_ld( 11, blendVecPtr3 );
vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
v6 = vec_ld( 0, blendVecPtr4 );
v7 = vec_ld( 11, blendVecPtr4 );
vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
vector float vecBlendX, vecBlendY, vecBlendZ;
vecBlendX = vecBlendY = vecBlendZ = zeroVector;
// planarize
v0 = vec_mergeh( vecLd5, vecLd7 );
v1 = vec_mergeh( vecLd6, vecLd8 );
v3 = vec_mergel( vecLd5, vecLd7 );
v4 = vec_mergel( vecLd6, vecLd8 );
vecBlendX = vec_mergeh( v0, v1 );
vecBlendY = vec_mergel( v0, v1 );
vecBlendZ = vec_mergeh( v3, v4 );
// do subtraction
vecWork1 = vec_sub( vecBlendX, vecVecX );
vecWork2 = vec_sub( vecBlendY, vecVecY );
vecWork3 = vec_sub( vecBlendZ, vecVecZ );
// multiply by lerp and add to v1
vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
// put it back in original form
v0 = vec_mergeh( vecVecX, vecVecZ );
v1 = vec_mergeh( vecVecY, zeroVector );
v3 = vec_mergel( vecVecX, vecVecZ );
v4 = vec_mergel( vecVecY, zeroVector );
// generate vectors to store
vecWork1 = vec_mergeh( v0, v1 );
vecWork2 = vec_mergel( v0, v1 );
vecWork3 = vec_mergeh( v3, v4 );
vector float vecWork4 = vec_mergel( v3, v4 );
// store the T values ( only 3 floats each, hence only three vec_ste's per vector )
storePerm1 = vec_lvsr( 0, jointVecPtr );
storePerm2 = vec_lvsr( 0, jointVecPtr2 );
storePerm3 = vec_lvsr( 0, jointVecPtr3 );
storePerm4 = vec_lvsr( 0, jointVecPtr4 );
// right rotate the input data
vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
vec_ste( vecWork1, 0, (float*) jointVecPtr );
vec_ste( vecWork1, 4, (float*) jointVecPtr );
vec_ste( vecWork1, 8, (float*) jointVecPtr );
vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
}
// cleanup: scalar path for the 0-3 joints that didn't fill a vector batch
for ( ; i < numJoints; i++ ) {
int j = index[i];
joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
}
}
/*
============
idSIMD_AltiVec::ConvertJointQuatsToJointMats

Expands each idJointQuat (unit quaternion + translation) into the matching
3x4 idJointMat. Translation lands in the fourth column; the rotation part is
the standard quaternion-to-matrix formula.
============
*/
// SSE doesn't vectorize this, and I don't think we should either. Its mainly just copying data, there's very little math involved and
// it's not easily parallelizable
void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
	for ( int n = 0; n < numJoints; n++ ) {
		const float *quat = jointQuats[n].q.ToFloatPtr();
		float *mat = jointMats[n].ToFloatPtr();

		// translation occupies the last column of each row ( quat[4..6] is the idVec3 t )
		mat[0*4+3] = quat[4];
		mat[1*4+3] = quat[5];
		mat[2*4+3] = quat[6];

		// doubled components used by every product below
		const float tx = quat[0] + quat[0];
		const float ty = quat[1] + quat[1];
		const float tz = quat[2] + quat[2];

		// diagonal: 1 - 2(y^2+z^2), 1 - 2(x^2+z^2), 1 - 2(x^2+y^2)
		const float xx = quat[0] * tx;
		const float yy = quat[1] * ty;
		const float zz = quat[2] * tz;
		mat[0*4+0] = 1.0f - yy - zz;
		mat[1*4+1] = 1.0f - xx - zz;
		mat[2*4+2] = 1.0f - xx - yy;

		// off-diagonal pairs: each is a product term +/- a w-product term
		const float yz = quat[1] * tz;
		const float wx = quat[3] * tx;
		mat[2*4+1] = yz - wx;
		mat[1*4+2] = yz + wx;

		const float xy = quat[0] * ty;
		const float wz = quat[3] * tz;
		mat[1*4+0] = xy - wz;
		mat[0*4+1] = xy + wz;

		const float xz = quat[0] * tz;
		const float wy = quat[3] * ty;
		mat[0*4+2] = xz - wy;
		mat[2*4+0] = xz + wy;
	}
}
/*
============
idSIMD_AltiVec::ConvertJointMatsToJointQuats

Converts each 3x4 joint matrix back into a quaternion + translation, using the
largest-component (Shoemake) extraction to stay numerically stable.
============
*/
void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
	// Since we use very little of the data we have to pull in for the altivec version, we end up with
	// a lot of wasted math. Rather than try to force it to use altivec, I wrote an optimized version
	// of InvSqrt for the G5, and made it use that instead. With only this change, we get a little
	// bigger than 50% speedup, which is not too shabby. Should really replace idMath::InvSqrt with
	// my function so everyone can benefit on G5.

	// cyclic successor of each axis index ( 0->1->2->0 )
	static int nextAxis[3] = { 1, 2, 0 };

	for ( int n = 0; n < numJoints; n++ ) {
		idJointQuat result;
		float *m = (float*)( jointMats[n].ToFloatPtr() );
		const float trace = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2];

		if ( trace > 0.0f ) {
			// positive trace: w is the dominant component, derive x/y/z from it
			float t = trace + 1.0f;
			//float s = idMath::InvSqrt( t ) * 0.5f;
			float s = FastScalarInvSqrt( t ) * 0.5f;
			result.q[3] = s * t;
			result.q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
			result.q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
			result.q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
		} else {
			// otherwise build around the largest diagonal element
			int a = 0;
			if ( m[1 * 4 + 1] > m[0 * 4 + 0] ) {
				a = 1;
			}
			if ( m[2 * 4 + 2] > m[a * 4 + a] ) {
				a = 2;
			}
			const int b = nextAxis[a];
			const int c = nextAxis[b];

			float t = ( m[a * 4 + a] - ( m[b * 4 + b] + m[c * 4 + c] ) ) + 1.0f;
			//float s = idMath::InvSqrt( t ) * 0.5f;
			float s = FastScalarInvSqrt( t ) * 0.5f;
			result.q[a] = s * t;
			result.q[3] = ( m[b * 4 + c] - m[c * 4 + b] ) * s;
			result.q[b] = ( m[a * 4 + b] + m[b * 4 + a] ) * s;
			result.q[c] = ( m[a * 4 + c] + m[c * 4 + a] ) * s;
		}

		// translation lives in the fourth column of the 3x4 matrix
		result.t[0] = m[0 * 4 + 3];
		result.t[1] = m[1 * 4 + 3];
		result.t[2] = m[2 * 4 + 3];
		jointQuats[n] = result;
	}
}
/*
============
idSIMD_AltiVec::TransformJoints

Concatenates each joint matrix in [firstJoint, lastJoint] with its parent's
matrix; the result written back to jointMats[i] matches the scalar
jointMats[i] *= jointMats[parents[i]] in the #if 0 branch below.
============
*/
void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
int i;
#if 0
for( i = firstJoint; i <= lastJoint; i++ ) {
assert( parents[i] < i );
jointMats[i] *= jointMats[parents[i]];
}
#else
// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
// on what the parents array looks like. This is true in the test code.
for ( i = firstJoint; i <= lastJoint; i++ ) {
assert( parents[i] < i );
float *jointPtr = jointMats[i].ToFloatPtr();
float *parentPtr = jointMats[parents[i]].ToFloatPtr();
// standard AltiVec misaligned-load permute masks: vec_lvsl( -1, p ) + 1
vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
vector float v0, v1, v2, v3, v4, v5, v6, v7;
// we need to load up 12 float elements that make up the Mat ( 3 rows of 4 floats )
v0 = vec_ld( 0, jointPtr );
v1 = vec_ld( 15, jointPtr );
v2 = vec_ld( 31, jointPtr );
v3 = vec_ld( 47, jointPtr );
// load parents
v4 = vec_ld( 0, parentPtr );
v5 = vec_ld( 15, parentPtr );
v6 = vec_ld( 31, parentPtr );
v7 = vec_ld( 47, parentPtr );
// permute into vectors ( one vector per matrix row )
vector float vecJointMat1 = vec_perm( v0, v1, permVec );
vector float vecJointMat2 = vec_perm( v1, v2, permVec );
vector float vecJointMat3 = vec_perm( v2, v3, permVec );
vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
vector float zero = (vector float)(0);
vector float C1, C2, C3;
// matrix multiply: each result row is a linear combination of the joint's rows
// weighted by the first three elements of the corresponding parent row
C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); // m(4 to 7) * a(4)
C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(8 to 11) * a(8)
C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
// do the addition at the end: permZeroAndLast keeps the first 12 bytes zero and
// takes the last float of the parent row, so only the translation element of each
// parent row is added into the result
vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
// store results
UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
}
#endif
}
/*
============
idSIMD_AltiVec::UntransformJoints

Removes the parent transform from each joint matrix, walking from lastJoint
down to firstJoint (children before parents); equivalent to the scalar
jointMats[i] /= jointMats[parents[i]] in the #if 0 branch below.
============
*/
void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
int i;
#if 0
for( i = lastJoint; i >= firstJoint; i-- ) {
assert( parents[i] < i );
jointMats[i] /= jointMats[parents[i]];
}
#else
// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
// on what the parents array looks like. This is true in the test code.
for ( i = lastJoint; i >= firstJoint; i-- ) {
assert( parents[i] < i );
float *jointPtr = jointMats[i].ToFloatPtr();
float *parentPtr = jointMats[parents[i]].ToFloatPtr();
// standard AltiVec misaligned-load permute masks: vec_lvsl( -1, p ) + 1
vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
vector float v0, v1, v2, v3, v4, v5, v6, v7;
// we need to load up 12 float elements that make up the Mat ( 3 rows of 4 floats )
v0 = vec_ld( 0, jointPtr );
v1 = vec_ld( 15, jointPtr );
v2 = vec_ld( 31, jointPtr );
v3 = vec_ld( 47, jointPtr );
// load parents
v4 = vec_ld( 0, parentPtr );
v5 = vec_ld( 15, parentPtr );
v6 = vec_ld( 31, parentPtr );
v7 = vec_ld( 47, parentPtr );
// permute into vectors ( one vector per matrix row )
vector float vecJointMat1 = vec_perm( v0, v1, permVec );
vector float vecJointMat2 = vec_perm( v1, v2, permVec );
vector float vecJointMat3 = vec_perm( v2, v3, permVec );
vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
vector float zero = (vector float)(0);
vector float C1, C2, C3;
// do subtraction at the beginning: permZeroAndLast isolates the last float of each
// parent row, so only the parent translation is subtracted from the joint rows
vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
// matrix multiply -- note the splat pattern is transposed relative to TransformJoints:
// row k of the result sums joint rows weighted by column k of the parent rotation,
// i.e. a multiply by the transposed parent rotation
C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
// store results back via rotate + vec_ste scatter ( handles misaligned jointPtr )
vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
// right rotate the input data
C1 = vec_perm( C1, C1, storePerm );
C2 = vec_perm( C2, C2, storePerm );
C3 = vec_perm( C3, C3, storePerm );
vec_ste( C1, 0, (float*) jointPtr );
vec_ste( C1, 4, (float*) jointPtr );
vec_ste( C1, 8, (float*) jointPtr );
vec_ste( C1, 12, (float*) jointPtr );
vec_ste( C2, 16, (float*) jointPtr );
vec_ste( C2, 20, (float*) jointPtr );
vec_ste( C2, 24, (float*) jointPtr );
vec_ste( C2, 28, (float*) jointPtr );
vec_ste( C3, 32, (float*) jointPtr );
vec_ste( C3, 36, (float*) jointPtr );
vec_ste( C3, 40, (float*) jointPtr );
vec_ste( C3, 44, (float*) jointPtr );
}
#endif
}
/*
============
idSIMD_AltiVec::TransformVerts

Skins vertex positions: each vertex accumulates one or more weighted joint
transforms. index[] holds pairs per weight: a byte offset into joints[] and a
flag that is non-zero on the last weight of a vertex.
============
*/
// Here we don't have much for the vector unit to do, and the gain we get from doing the math
// in parallel is eaten by doing unaligned stores.
void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
	const byte *jointBase = (byte *)joints;
	int w = 0;

	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		idVec3 result;

		// first weight initializes the accumulator
		const float *mp = ( *(idJointMat *)( jointBase + index[w*2] ) ).ToFloatPtr();
		const float *wp = weights[w].ToFloatPtr();
		result[0] = mp[0] * wp[0] + mp[1] * wp[1] + mp[2]  * wp[2] + mp[3]  * wp[3];
		result[1] = mp[4] * wp[0] + mp[5] * wp[1] + mp[6]  * wp[2] + mp[7]  * wp[3];
		result[2] = mp[8] * wp[0] + mp[9] * wp[1] + mp[10] * wp[2] + mp[11] * wp[3];

		// a zero flag means more weights follow for this vertex
		while ( index[w*2+1] == 0 ) {
			w++;
			mp = ( *(idJointMat *)( jointBase + index[w*2] ) ).ToFloatPtr();
			wp = weights[w].ToFloatPtr();
			result[0] = result[0] + mp[0] * wp[0] + mp[1] * wp[1] + mp[2]  * wp[2] + mp[3]  * wp[3];
			result[1] = result[1] + mp[4] * wp[0] + mp[5] * wp[1] + mp[6]  * wp[2] + mp[7]  * wp[3];
			result[2] = result[2] + mp[8] * wp[0] + mp[9] * wp[1] + mp[10] * wp[2] + mp[11] * wp[3];
		}
		// step past the terminating weight and write the skinned position
		w++;
		verts[vertNum].xyz = result;
	}
}
#endif /* LIVE_VICARIOUSLY */
#ifdef ENABLE_CULL
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::TracePointCull
============
*/
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
// idDrawVert size
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
byte tOr;
tOr = 0;
// pointers
const float *planePtr = planes[0].ToFloatPtr();
vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned char vecPerm;
vector float v0, v1, v2, v3, v4, v5, v6, v7;
vector float zeroVector = (vector float)(0);
vector float vecRadius;
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
vector bool int oneIntVector = (vector bool int)(1);
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecTotals;
vector unsigned int tempIntSum;
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
// populate planes
v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );
v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );
// transpose
v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );
vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );
// load constants
vecRadius = loadSplatUnalignedScalar( &radius );
unsigned int cullBitVal[4];
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
int i = 0;
// every fourth one will have the same alignment. Make sure we've got enough here
if ( i+3 < numVerts ) {
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
for ( ; i+3 < numVerts; i+=4 ) {
const float *vertPtr = verts[i].xyz.ToFloatPtr();
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 15, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v3 = vec_ld( 15, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v5 = vec_ld( 15, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
v7 = vec_ld( 15, vertPtr4 );
vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
// vec1Sum1 now holds d0, d1, d2, d3. calculate the
// difference with +radius and -radius
vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
// do compare
vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
//and it with 1 so we multiply by 1 not 1111's
vecCmp1 = vec_and( vecCmp1, oneIntVector );
vecCmp2 = vec_and( vecCmp2, oneIntVector );
vecCmp3 = vec_and( vecCmp3, oneIntVector );
vecCmp4 = vec_and( vecCmp4, oneIntVector );
vecCmp5 = vec_and( vecCmp5, oneIntVector );
vecCmp6 = vec_and( vecCmp6, oneIntVector );
vecCmp7 = vec_and( vecCmp7, oneIntVector );
vecCmp8 = vec_and( vecCmp8, oneIntVector );
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
// OR (add) them all together
vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
vecTotals = vec_mergeh( vecTotals, tempIntSum );
tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
// store out results
vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
vec_ste( tempSt, 0, &cullBitVal[0] );
vec_ste( tempSt, 4, &cullBitVal[0] );
vec_ste( tempSt, 8, &cullBitVal[0] );
vec_ste( tempSt, 12, &cullBitVal[0] );
tOr |= cullBitVal[0];
tOr |= cullBitVal[1];
tOr |= cullBitVal[2];
tOr |= cullBitVal[3];
cullBits[i] = cullBitVal[0];
cullBits[i+1] = cullBitVal[1];
cullBits[i+2] = cullBitVal[2];
cullBits[i+3] = cullBitVal[3];
}
// cleanup
for ( ; i < numVerts; i++ ) {
byte bits;
float d0, d1, d2, d3, t;
const idVec3 &v = verts[i].xyz;
d0 = planes[0].Distance( v );
d1 = planes[1].Distance( v );
d2 = planes[2].Distance( v );
d3 = planes[3].Distance( v );
t = d0 + radius;
bits = FLOATSIGNBITSET( t ) << 0;
t = d1 + radius;
bits |= FLOATSIGNBITSET( t ) << 1;
t = d2 + radius;
bits |= FLOATSIGNBITSET( t ) << 2;
t = d3 + radius;
bits |= FLOATSIGNBITSET( t ) << 3;
t = d0 - radius;
bits |= FLOATSIGNBITSET( t ) << 4;
t = d1 - radius;
bits |= FLOATSIGNBITSET( t ) << 5;
t = d2 - radius;
bits |= FLOATSIGNBITSET( t ) << 6;
t = d3 - radius;
bits |= FLOATSIGNBITSET( t ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
totalOr = tOr;
}
#else
/*
============
idSIMD_AltiVec::TracePointCull
============
*/
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
// DRAWVERT_PADDED variant: vertex xyz data is 16-byte aligned, so positions
// are fetched with a single aligned vec_ld (no unaligned permute needed).
//
// For each vertex, computes the signed distance d0..d3 to the first four
// planes and packs eight cull bits per vertex:
//   bits 0-3: ( d + radius ) < 0 for planes 0-3 (flipped afterwards via XOR 0x0F)
//   bits 4-7: ( d - radius ) < 0 for planes 0-3
// cullBits[i] receives the mask for vertex i; totalOr the OR of all masks.
// idDrawVert size
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
byte tOr;
tOr = 0;
// pointers
const float *planePtr = planes[0].ToFloatPtr();
// per-lane shift counts: move each 0/1 compare result into its target bit slot
vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
// XOR mask that flips the lower four (+radius) bits of each packed result
vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned char vecPerm;
// NOTE(review): v4..v7 are declared but unused in this padded variant
vector float v0, v1, v2, v3, v4, v5, v6, v7;
vector float zeroVector = (vector float)(0);
vector float vecRadius;
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
vector bool int oneIntVector = (vector bool int)(1);
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecTotals;
vector unsigned int tempIntSum;
// NOTE(review): vertPerm1..4 are declared but unused here -- the padded layout
// makes the unaligned-load permutes unnecessary
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
// permute mask for unaligned loads of the (possibly unaligned) plane data
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
// populate planes
v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );
v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );
// transpose so vecPlane0..3 hold the x/y/z/d components of all four planes,
// letting one vertex be tested against all four planes at once
v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );
vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );
// load constants
vecRadius = loadSplatUnalignedScalar( &radius );
unsigned int cullBitVal[4];
// permute mask to shuffle the packed results into cullBitVal's alignment for vec_ste
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
int i = 0;
// process four vertices per iteration
for ( ; i+3 < numVerts; i+=4 ) {
const float *vertPtr = verts[i].xyz.ToFloatPtr();
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
// padded drawverts are 16-byte aligned: plain aligned loads
vecXYZ1 = vec_ld( 0, vertPtr );
vecXYZ2 = vec_ld( 0, vertPtr2 );
vecXYZ3 = vec_ld( 0, vertPtr3 );
vecXYZ4 = vec_ld( 0, vertPtr4 );
// plane distance: d = x*Nx + y*Ny + z*Nz + D, one lane per plane
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
// vec1Sum1 now holds d0, d1, d2, d3. calculate the
// difference with +radius and -radius
vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
// do compare
vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
//and it with 1 so we multiply by 1 not 1111's
vecCmp1 = vec_and( vecCmp1, oneIntVector );
vecCmp2 = vec_and( vecCmp2, oneIntVector );
vecCmp3 = vec_and( vecCmp3, oneIntVector );
vecCmp4 = vec_and( vecCmp4, oneIntVector );
vecCmp5 = vec_and( vecCmp5, oneIntVector );
vecCmp6 = vec_and( vecCmp6, oneIntVector );
vecCmp7 = vec_and( vecCmp7, oneIntVector );
vecCmp8 = vec_and( vecCmp8, oneIntVector );
// shift each 0/1 result into its destination bit (bits 0-3 and 4-7)
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
// OR (add) them all together
vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
// horizontal add across each vector, then gather the four per-vertex masks
// into the four lanes of vecTotals
vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
vecTotals = vec_mergeh( vecTotals, tempIntSum );
tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
// store out results
vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
vec_ste( tempSt, 0, &cullBitVal[0] );
vec_ste( tempSt, 4, &cullBitVal[0] );
vec_ste( tempSt, 8, &cullBitVal[0] );
vec_ste( tempSt, 12, &cullBitVal[0] );
tOr |= cullBitVal[0];
tOr |= cullBitVal[1];
tOr |= cullBitVal[2];
tOr |= cullBitVal[3];
cullBits[i] = cullBitVal[0];
cullBits[i+1] = cullBitVal[1];
cullBits[i+2] = cullBitVal[2];
cullBits[i+3] = cullBitVal[3];
}
// cleanup: scalar path for the 0-3 remaining vertices
for ( ; i < numVerts; i++ ) {
byte bits;
float d0, d1, d2, d3, t;
const idVec3 &v = verts[i].xyz;
d0 = planes[0].Distance( v );
d1 = planes[1].Distance( v );
d2 = planes[2].Distance( v );
d3 = planes[3].Distance( v );
t = d0 + radius;
bits = FLOATSIGNBITSET( t ) << 0;
t = d1 + radius;
bits |= FLOATSIGNBITSET( t ) << 1;
t = d2 + radius;
bits |= FLOATSIGNBITSET( t ) << 2;
t = d3 + radius;
bits |= FLOATSIGNBITSET( t ) << 3;
t = d0 - radius;
bits |= FLOATSIGNBITSET( t ) << 4;
t = d1 - radius;
bits |= FLOATSIGNBITSET( t ) << 5;
t = d2 - radius;
bits |= FLOATSIGNBITSET( t ) << 6;
t = d3 - radius;
bits |= FLOATSIGNBITSET( t ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
totalOr = tOr;
}
#endif /* DRAWVERT_PADDED */
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::DecalPointCull
============
*/
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
// Non-padded variant: idDrawVert xyz may be unaligned, so vertex positions
// are fetched with a pair of loads plus a vec_perm (the classic AltiVec
// unaligned-load sequence).
//
// For each vertex, tests it against six decal planes and sets bit p when
// the vertex is behind planes[p]; the lower six bits are then flipped
// (XOR 0x3F) so a set bit means "in front of the plane".
// idDrawVert size
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
int i;
const float *planePtr = planes[0].ToFloatPtr();
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
vector float zeroVector = (vector float)(0.0);
vector unsigned char vecPerm;
vector float v0, v1, v2, v3, v4, v5, v6, v7;
// permute mask for unaligned loads of the plane data
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
// populate planes
v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );
v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );
v0 = vec_ld( 0, planePtr + 16 );
v1 = vec_ld( 15, planePtr + 16 );
vecPlane4 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 20 );
v3 = vec_ld( 15, planePtr + 20 );
vecPlane5 = vec_perm( v2, v3, vecPerm );
// transpose planes 0-3 so vecPlane0..3 hold x/y/z/d components of all four
v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );
vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );
// transpose planes 4-5 (zero-padded to a 4-plane group) into vecPlane4..7
v0 = vec_mergeh( vecPlane4, zeroVector );
v1 = vec_mergeh( vecPlane5, zeroVector );
v2 = vec_mergel( vecPlane4, zeroVector );
v3 = vec_mergel( vecPlane5, zeroVector );
vecPlane4 = vec_mergeh( v0, v1 );
vecPlane5 = vec_mergel( v0, v1 );
vecPlane6 = vec_mergeh( v2, v3 );
vecPlane7 = vec_mergel( v2, v3 );
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector bool int oneIntVector = (vector bool int)(1);
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
// shift counts: planes 0-3 go to bits 0-3, planes 4-5 to bits 4-5
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
vector unsigned int vecR1, vecR2, vecR3, vecR4;
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
unsigned int vBits[4];
// lvsr only depends on the address modulo 16; &vBits[4] is exactly 16 bytes
// past &vBits[0], so this yields the same permute mask as &vBits[0]
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
i = 0;
// every fourth one will have the same alignment. Make sure we've got enough here
if ( i+3 < numVerts ) {
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
// process four vertices per iteration
for ( ; i+3 < numVerts; i+=4 ) {
const float *vertPtr = verts[i].xyz.ToFloatPtr();
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
// unaligned loads: two vec_ld + vec_perm per vertex
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 15, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v3 = vec_ld( 15, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v5 = vec_ld( 15, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
v7 = vec_ld( 15, vertPtr4 );
vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
// per-vertex distances: Sum1 covers planes 0-3, Sum2 covers planes 4-5
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
// behind plane <=> distance < 0
vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
//and it with 1 so we multiply by 1 not 1111's
vecCmp1 = vec_and( vecCmp1, oneIntVector );
vecCmp2 = vec_and( vecCmp2, oneIntVector );
vecCmp3 = vec_and( vecCmp3, oneIntVector );
vecCmp4 = vec_and( vecCmp4, oneIntVector );
vecCmp5 = vec_and( vecCmp5, oneIntVector );
vecCmp6 = vec_and( vecCmp6, oneIntVector );
vecCmp7 = vec_and( vecCmp7, oneIntVector );
vecCmp8 = vec_and( vecCmp8, oneIntVector );
// shift each 0/1 result into its destination bit
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
//OR them all together (this is the same as adding them, since they're all only 1 bit set)
vecR1 = (vector unsigned int)(0); //zeroIntVector;
vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
vecR1 = vec_add(vecR1, vecBitShifted2 );
vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
vecR2 = (vector unsigned int)(0); //zeroIntVector;
vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
vecR2 = vec_add(vecR2, vecBitShifted4 );
vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
vecR3 = (vector unsigned int)(0); //zeroIntVector;
vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
vecR3 = vec_add(vecR3, vecBitShifted6 );
vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
vecR4 = (vector unsigned int)(0); //zeroIntVector;
vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
vecR4 = vec_add(vecR4, vecBitShifted8 );
vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
// take the first element from each vector and put them into vecR1
vecR1 = vec_mergeh( vecR1, vecR2 );
vecR3 = vec_mergeh( vecR3, vecR4 );
vecR1 = vec_perm( vecR1, vecR3, permHalves );
// XOR with 0x3F to flip lower 6 bits
vecR1 = vec_xor( vecR1, vecFlipBits );
// store out results. don't have 16 at a time so let's just
// do this and avoid alignment concerns
vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
vec_ste( vecR1, 0, &vBits[0] );
vec_ste( vecR1, 4, &vBits[0] );
vec_ste( vecR1, 8, &vBits[0] );
vec_ste( vecR1, 12, &vBits[0] );
cullBits[i] = vBits[0];
cullBits[i+1] = vBits[1];
cullBits[i+2] = vBits[2];
cullBits[i+3] = vBits[3];
}
// cleanup: scalar path for the 0-3 remaining vertices
for ( ; i < numVerts; i++ ) {
byte bits;
float d0, d1, d2, d3, d4, d5;
const idVec3 &v = verts[i].xyz;
d0 = planes[0].Distance( v );
d1 = planes[1].Distance( v );
d2 = planes[2].Distance( v );
d3 = planes[3].Distance( v );
d4 = planes[4].Distance( v );
d5 = planes[5].Distance( v );
// they check if the sign bit is set by casting as long and shifting right 31 places.
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 2;
bits |= FLOATSIGNBITSET( d3 ) << 3;
bits |= FLOATSIGNBITSET( d4 ) << 4;
bits |= FLOATSIGNBITSET( d5 ) << 5;
cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
}
}
#else
/*
============
idSIMD_AltiVec::DecalPointCull
============
*/
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
// DRAWVERT_PADDED variant: vertex xyz data is 16-byte aligned, so each
// position is fetched with a single aligned vec_ld (no permute pair), same
// as the padded TracePointCull above.
//
// For each vertex, tests it against six decal planes and sets bit p when
// the vertex is behind planes[p]; the lower six bits are then flipped
// (XOR 0x3F) so a set bit means "in front of the plane".
//
// BUGFIX: the vertex loads previously went into the scratch registers
// v0/v2/v4/v6 while the distance computation read the never-assigned
// vecXYZ1..vecXYZ4, producing cull bits from uninitialized vectors. The
// aligned loads now target vecXYZ1..vecXYZ4 directly.
// idDrawVert size
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
int i;
const float *planePtr = planes[0].ToFloatPtr();
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
vector float zeroVector = (vector float)(0.0);
vector unsigned char vecPerm;
vector float v0, v1, v2, v3;
// permute mask for unaligned loads of the plane data
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
// populate planes
v0 = vec_ld( 0, planePtr );
v1 = vec_ld( 15, planePtr );
vecPlane0 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 4 );
v3 = vec_ld( 15, planePtr + 4 );
vecPlane1 = vec_perm( v2, v3, vecPerm );
v0 = vec_ld( 0, planePtr + 8 );
v1 = vec_ld( 15, planePtr + 8 );
vecPlane2 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 12 );
v3 = vec_ld( 15, planePtr + 12 );
vecPlane3 = vec_perm( v2, v3, vecPerm );
v0 = vec_ld( 0, planePtr + 16 );
v1 = vec_ld( 15, planePtr + 16 );
vecPlane4 = vec_perm( v0, v1, vecPerm );
v2 = vec_ld( 0, planePtr + 20 );
v3 = vec_ld( 15, planePtr + 20 );
vecPlane5 = vec_perm( v2, v3, vecPerm );
// transpose planes 0-3 so vecPlane0..3 hold x/y/z/d components of all four
v0 = vec_mergeh( vecPlane0, vecPlane2 );
v1 = vec_mergeh( vecPlane1, vecPlane3 );
v2 = vec_mergel( vecPlane0, vecPlane2 );
v3 = vec_mergel( vecPlane1, vecPlane3 );
vecPlane0 = vec_mergeh( v0, v1 );
vecPlane1 = vec_mergel( v0, v1 );
vecPlane2 = vec_mergeh( v2, v3 );
vecPlane3 = vec_mergel( v2, v3 );
// transpose planes 4-5 (zero-padded to a 4-plane group) into vecPlane4..7
v0 = vec_mergeh( vecPlane4, zeroVector );
v1 = vec_mergeh( vecPlane5, zeroVector );
v2 = vec_mergel( vecPlane4, zeroVector );
v3 = vec_mergel( vecPlane5, zeroVector );
vecPlane4 = vec_mergeh( v0, v1 );
vecPlane5 = vec_mergel( v0, v1 );
vecPlane6 = vec_mergeh( v2, v3 );
vecPlane7 = vec_mergel( v2, v3 );
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
vector bool int oneIntVector = (vector bool int)(1);
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
// shift counts: planes 0-3 go to bits 0-3, planes 4-5 to bits 4-5
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
vector unsigned int vecR1, vecR2, vecR3, vecR4;
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
unsigned int vBits[4];
// lvsr only depends on the address modulo 16, so &vBits[0] is used directly
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[0] );
i = 0;
// process four vertices per iteration
for ( ; i+3 < numVerts; i+=4 ) {
const float *vertPtr = verts[i].xyz.ToFloatPtr();
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
// padded drawverts are 16-byte aligned: load positions directly into the
// registers the distance computation reads (was v0/v2/v4/v6, leaving
// vecXYZ1..4 uninitialized)
vecXYZ1 = vec_ld( 0, vertPtr );
vecXYZ2 = vec_ld( 0, vertPtr2 );
vecXYZ3 = vec_ld( 0, vertPtr3 );
vecXYZ4 = vec_ld( 0, vertPtr4 );
// per-vertex distances: Sum1 covers planes 0-3, Sum2 covers planes 4-5
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
// behind plane <=> distance < 0
vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
//and it with 1 so we multiply by 1 not 1111's
vecCmp1 = vec_and( vecCmp1, oneIntVector );
vecCmp2 = vec_and( vecCmp2, oneIntVector );
vecCmp3 = vec_and( vecCmp3, oneIntVector );
vecCmp4 = vec_and( vecCmp4, oneIntVector );
vecCmp5 = vec_and( vecCmp5, oneIntVector );
vecCmp6 = vec_and( vecCmp6, oneIntVector );
vecCmp7 = vec_and( vecCmp7, oneIntVector );
vecCmp8 = vec_and( vecCmp8, oneIntVector );
// shift each 0/1 result into its destination bit
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
//OR them all together (this is the same as adding them, since they're all only 1 bit set)
vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
vecR1 = vec_add( vecR1, vecBitShifted2 );
vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
vecR2 = vec_add( vecR2, vecBitShifted4 );
vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
vecR3 = vec_add( vecR3, vecBitShifted6 );
vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
vecR4 = vec_add( vecR4, vecBitShifted8 );
vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
// take the first element from each vector and put them into vecR1
vecR1 = vec_mergeh( vecR1, vecR2 );
vecR3 = vec_mergeh( vecR3, vecR4 );
vecR1 = vec_perm( vecR1, vecR3, permHalves );
// XOR with 0x3F to flip lower 6 bits
vecR1 = vec_xor( vecR1, vecFlipBits );
// store out results. don't have 16 at a time so let's just
// do this and avoid alignment concerns
vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
vec_ste( vecR1, 0, &vBits[0] );
vec_ste( vecR1, 4, &vBits[0] );
vec_ste( vecR1, 8, &vBits[0] );
vec_ste( vecR1, 12, &vBits[0] );
cullBits[i] = vBits[0];
cullBits[i+1] = vBits[1];
cullBits[i+2] = vBits[2];
cullBits[i+3] = vBits[3];
}
// cleanup: scalar path for the 0-3 remaining vertices
for ( ; i < numVerts; i++ ) {
byte bits;
float d0, d1, d2, d3, d4, d5;
const idVec3 &v = verts[i].xyz;
d0 = planes[0].Distance( v );
d1 = planes[1].Distance( v );
d2 = planes[2].Distance( v );
d3 = planes[3].Distance( v );
d4 = planes[4].Distance( v );
d5 = planes[5].Distance( v );
// they check if the sign bit is set by casting as long and shifting right 31 places.
bits = FLOATSIGNBITSET( d0 ) << 0;
bits |= FLOATSIGNBITSET( d1 ) << 1;
bits |= FLOATSIGNBITSET( d2 ) << 2;
bits |= FLOATSIGNBITSET( d3 ) << 3;
bits |= FLOATSIGNBITSET( d4 ) << 4;
bits |= FLOATSIGNBITSET( d5 ) << 5;
cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
}
}
#endif /*DRAWVERT_PADDED */
#ifndef DRAWVERT_PADDED
/*
============
idSIMD_AltiVec::OverlayPointCull
============
*/
/*
	Unpadded-idDrawVert variant of OverlayPointCull.

	For each vertex this computes the signed distances d0/d1 against the two
	overlay planes, writes ( d0, d1 ) out as the overlay texture coordinates,
	and packs a 4-bit cull mask per vertex:
		bit 0: d0 < 0        bit 1: d1 < 0
		bit 2: 1 - d0 < 0    bit 3: 1 - d1 < 0

	Vertex xyz data is NOT assumed 16-byte aligned here, so all vertex loads
	use the classic AltiVec unaligned-load idiom: two overlapping vec_ld's
	combined with a vec_perm built from vec_lvsl.
*/
void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	int i;
	float p0x, p0y, p0z, p0d;
	float p1x, p1y, p1z, p1d;
	const float *planePtr = planes[0].ToFloatPtr();
	const float *vertPtr = verts[0].xyz.ToFloatPtr();

	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
	vector float v0, v1, v2, v3, v4, v5, v6, v7;
	vector unsigned char vecPerm;
	vector float zeroVector = (vector float)(0);

	// scalar copies of both plane equations, used by the cleanup loop below
	p0x = *(planePtr + 0);
	p0y = *(planePtr + 1);
	p0z = *(planePtr + 2);
	p0d = *(planePtr + 3);
	p1x = *(planePtr + 4);
	p1y = *(planePtr + 5);
	p1z = *(planePtr + 6);
	p1d = *(planePtr + 7);

	// populate the planes. planePtr is not guaranteed aligned, so load
	// overlapping quadwords and permute the 8 plane floats into place
	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );

	v2 = vec_ld( 31, planePtr );
	vecPlane1 = vec_perm( v1, v2, vecPerm );

	// transpose so each vecPlaneN holds one component (x/y/z/d) of both planes,
	// replicated into the lane layout the madd chain below expects
	v0 = vec_mergeh( vecPlane0, vecPlane0 );
	v1 = vec_mergeh( vecPlane1, vecPlane1 );
	v2 = vec_mergel( vecPlane0, vecPlane0 );
	v3 = vec_mergel( vecPlane1, vecPlane1);

	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector float oneVector = (vector float)(1);
	vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
	vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
	vector float negTwoVector = (vector float)(-2);
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
	// lane shifts that place each comparison result at its target bit position
	vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
	vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
	vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
	vector bool int oneIntVector = (vector bool int)(1);
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
	unsigned int cullBitVal[4];
	// store-side permute so the four vec_ste's below hit cullBitVal correctly
	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
	i = 0;

	// every fourth one will have the same alignment. Make sure we've got enough here
	// (the idDrawVert stride keeps alignment mod 16 constant per slot, so these
	// four permute vectors stay valid for the entire loop)
	if ( i+3 < numVerts ) {
		vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}

	// main loop: four vertices per iteration
	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		// unaligned loads of the four xyz triples
		v0 = vec_ld( 0, vertPtr );
		v1 = vec_ld( 15, vertPtr );
		v2 = vec_ld( 0, vertPtr2 );
		v3 = vec_ld( 15, vertPtr2 );
		v4 = vec_ld( 0, vertPtr3 );
		v5 = vec_ld( 15, vertPtr3 );
		v6 = vec_ld( 0, vertPtr4 );
		v7 = vec_ld( 15, vertPtr4 );

		vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
		vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
		vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
		vecXYZ4 = vec_perm( v6, v7, vertPerm4 );

		// like a splat, but only doing halves
		// each vecSum ends up as ( d0, d1, d0', d1' ) for a pair of vertices
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
		vecSum1 = vec_add( vecSum1, vecPlane3 );

		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
		vecSum2 = vec_add( vecSum2, vecPlane3 );

		// store out results: the d0/d1 pairs are exactly the texcoords
		UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );

		// bit manipulation
		vecCmp1 = vec_cmplt( vecSum1, zeroVector );
		vecCmp2 = vec_cmplt( vecSum2, zeroVector );

		//and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );

		// store out and write to cullBits
		// finally, a use for algebra! 1-x = x + 1 - 2x
		vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
		vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
		vecSum1Inv = vec_add( vecSum1Inv, oneVector );
		vecSum2Inv = vec_add( vecSum2Inv, oneVector );

		// do the same comparisons for the inverted d0/d1
		vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
		vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );

		//and it with 1 so we multiply by 1 not 1111's
		vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
		vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );

		// shift them as needed: d0/d1 to bits 0/1, inverses to bits 2/3
		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
		vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
		vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );

		// OR them all together. since only 1 bit is set for each value, that's
		// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
		vector unsigned int vecResult;
		vector unsigned int vecResult2;
		vector unsigned int vecResult3;
		vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );

		vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

		// vecResult now holds the values without the inverses yet, so add those
		vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
		vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
		vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
		vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
		vecResult = vec_add( vecResult, vecResult2 );

		//store out results: one packed 4-bit mask per vertex, via cullBitVal
		vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
		vec_ste( vecResult, 0, &cullBitVal[0] );
		vec_ste( vecResult, 4, &cullBitVal[0] );
		vec_ste( vecResult, 8, &cullBitVal[0] );
		vec_ste( vecResult, 12, &cullBitVal[0] );

		cullBits[i] = cullBitVal[0];
		cullBits[i+1] = cullBitVal[1];
		cullBits[i+2] = cullBitVal[2];
		cullBits[i+3] = cullBitVal[3];
	}

	// cleanup: handle the numVerts % 4 leftovers with scalar code
	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1;
		float vx, vy, vz;

		vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
		vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
		vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );

		d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
		d1 = p1x * vx + p1y * vy + p1z * vz + p1d;

		texCoords[i][0] = d0;
		texCoords[i][1] = d1;

		// same 4-bit mask as the vector path builds
		bits = ( d0 >= 0 ) ? 0 : 1;
		d0 = 1.0f - d0;
		bits |= ( d1 >= 0 ) ? 0 : 1*2;
		d1 = 1.0f - d1;

		bits |= ( d0 >= 0 ) ? 0: 1*4;
		bits |= ( d1 >= 0 ) ? 0: 1*8;

		cullBits[i] = bits;
	}
}
#else
/*
============
idSIMD_AltiVec::OverlayPointCull
============
*/
/*
	Padded-idDrawVert variant of OverlayPointCull (DRAWVERT_PADDED branch).

	Same contract as the unpadded variant: computes signed distances d0/d1 of
	each vertex against the two overlay planes, stores ( d0, d1 ) as texcoords,
	and packs a 4-bit cull mask per vertex:
		bit 0: d0 < 0        bit 1: d1 < 0
		bit 2: 1 - d0 < 0    bit 3: 1 - d1 < 0

	Here each vertex xyz is fetched with a single vec_ld( 0, ptr ), which only
	yields the right data when each idDrawVert's xyz is 16-byte aligned —
	i.e. this relies on the padded idDrawVert layout.
*/
void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	int i;
	float p0x, p0y, p0z, p0d;
	float p1x, p1y, p1z, p1d;
	const float *planePtr = planes[0].ToFloatPtr();
	const float *vertPtr = verts[0].xyz.ToFloatPtr();

	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
	vector float v0, v1, v2, v3, v4, v5, v6, v7;
	vector unsigned char vecPerm;
	vector float zeroVector = (vector float)(0);

	// scalar copies of both plane equations, used by the cleanup loop below
	p0x = *(planePtr + 0);
	p0y = *(planePtr + 1);
	p0z = *(planePtr + 2);
	p0d = *(planePtr + 3);
	p1x = *(planePtr + 4);
	p1y = *(planePtr + 5);
	p1z = *(planePtr + 6);
	p1d = *(planePtr + 7);

	// populate the planes. planePtr is not guaranteed aligned, so load
	// overlapping quadwords and permute the 8 plane floats into place
	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );

	v2 = vec_ld( 31, planePtr );
	vecPlane1 = vec_perm( v1, v2, vecPerm );

	// transpose so each vecPlaneN holds one component (x/y/z/d) of both planes,
	// replicated into the lane layout the madd chain below expects
	v0 = vec_mergeh( vecPlane0, vecPlane0 );
	v1 = vec_mergeh( vecPlane1, vecPlane1 );
	v2 = vec_mergel( vecPlane0, vecPlane0 );
	v3 = vec_mergel( vecPlane1, vecPlane1);

	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector float oneVector = (vector float)(1);
	vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
	vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
	vector float negTwoVector = (vector float)(-2);
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
	// lane shifts that place each comparison result at its target bit position
	vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
	vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
	vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
	vector bool int oneIntVector = (vector bool int)(1);
	vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
	unsigned int cullBitVal[4];
	// store-side permute so the four vec_ste's below hit cullBitVal correctly
	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
	i = 0;

	// main loop: four vertices per iteration
	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		// single aligned load per vertex — relies on the padded layout
		vecXYZ1 = vec_ld( 0, vertPtr );
		vecXYZ2 = vec_ld( 0, vertPtr2 );
		vecXYZ3 = vec_ld( 0, vertPtr3 );
		vecXYZ4 = vec_ld( 0, vertPtr4 );

		// like a splat, but only doing halves
		// each vecSum ends up as ( d0, d1, d0', d1' ) for a pair of vertices
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
		vecSum1 = vec_add( vecSum1, vecPlane3 );

		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
		vecSum2 = vec_add( vecSum2, vecPlane3 );

		// store out results: the d0/d1 pairs are exactly the texcoords
		UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );

		// bit manipulation
		vecCmp1 = vec_cmplt( vecSum1, zeroVector );
		vecCmp2 = vec_cmplt( vecSum2, zeroVector );

		//and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );

		// store out and write to cullBits
		// finally, a use for algebra! 1-x = x + 1 - 2x
		vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
		vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
		vecSum1Inv = vec_add( vecSum1Inv, oneVector );
		vecSum2Inv = vec_add( vecSum2Inv, oneVector );

		// do the same comparisons for the inverted d0/d1
		vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
		vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );

		//and it with 1 so we multiply by 1 not 1111's
		vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
		vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );

		// shift them as needed: d0/d1 to bits 0/1, inverses to bits 2/3
		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
		vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
		vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );

		// OR them all together. since only 1 bit is set for each value, that's
		// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
		vector unsigned int vecResult;
		vector unsigned int vecResult2;
		vector unsigned int vecResult3;
		vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );

		vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

		// vecResult now holds the values without the inverses yet, so add those
		vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
		vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
		vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
		vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
		vecResult = vec_add( vecResult, vecResult2 );

		//store out results: one packed 4-bit mask per vertex, via cullBitVal
		vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
		vec_ste( vecResult, 0, &cullBitVal[0] );
		vec_ste( vecResult, 4, &cullBitVal[0] );
		vec_ste( vecResult, 8, &cullBitVal[0] );
		vec_ste( vecResult, 12, &cullBitVal[0] );

		cullBits[i] = cullBitVal[0];
		cullBits[i+1] = cullBitVal[1];
		cullBits[i+2] = cullBitVal[2];
		cullBits[i+3] = cullBitVal[3];
	}

	// cleanup: handle the numVerts % 4 leftovers with scalar code
	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1;
		float vx, vy, vz;

		vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
		vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
		vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );

		d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
		d1 = p1x * vx + p1y * vy + p1z * vz + p1d;

		texCoords[i][0] = d0;
		texCoords[i][1] = d1;

		// same 4-bit mask as the vector path builds
		bits = ( d0 >= 0 ) ? 0 : 1;
		d0 = 1.0f - d0;
		bits |= ( d1 >= 0 ) ? 0 : 1*2;
		d1 = 1.0f - d1;

		bits |= ( d0 >= 0 ) ? 0: 1*4;
		bits |= ( d1 >= 0 ) ? 0: 1*8;

		cullBits[i] = bits;
	}
}
#endif /* DRAWVERT_PADDED */
#endif /* ENABLE_CULL */
#ifdef ENABLE_DERIVE
/*
============
idSIMD_AltiVec::DeriveTriPlanes
Derives a plane equation for each triangle.
============
*/
/*
	Derives a normalized plane equation for each triangle, four triangles per
	vector iteration with a scalar cleanup loop.

	parameters:
		planes     - output, one idPlane per triangle
		verts      - vertex pool
		numVerts   - unused here (indexes drive the iteration)
		indexes    - triangle index list, 3 indexes per triangle
		numIndexes - length of the index list

	FIX: the "set the last element to 0" step after loading the THIRD
	triangle's vertices used to re-zero vecVertA2/B2/C2 (a copy/paste slip),
	leaving garbage in the w lane of vecVertA3/B3/C3. That garbage propagated
	through the cross products and the vecVertA transpose into the plane
	distance of the third triangle of every group of four. It now zeroes
	vecVertA3/B3/C3 as intended.
*/
void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
	// idPlane size
	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );

	int i;
	vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
	vector float vecVertA, vecVertB, vecVertC;
	vector float vecVertA2, vecVertB2, vecVertC2;
	vector float vecVertA3, vecVertB3, vecVertC3;
	vector float vecVertA4, vecVertB4, vecVertC4;

	vector float vecN, vecN2, vecN3, vecN4;
	vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
	// lane-rotation permutes used to form the cross products
	vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
	vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
	vector float vecF;
	vector float vecF1, vecF2, vecF3, vecF4;
	vector float zeroVector = (vector float)(0);
	vector float vecNegOne = (vector float)(-1);
	vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;

	vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
	vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
	vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;
	vector unsigned char oneVector = (vector unsigned char)(1);
	vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
	// keeps lanes 0-2 (xyz), replaces lane 3 with 0
	vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);

	const float *xyzPtr = verts[0].xyz.ToFloatPtr();
	float *planePtr = planes[0].ToFloatPtr();

	int j;
	// four triangles (12 indexes) per iteration
	for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {
#ifndef DRAWVERT_PADDED
		// calculate permute vectors to load as needed. these are all
		// triangle indexes and are usually pretty close together but
		// not guaranteed to be in any particular order
		vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
#endif

#ifndef DRAWVERT_PADDED
		// load first A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

		vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
		vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
		vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );

		// set the last element to 0
		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

		// load second A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

		vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
		vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
		vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );

		// set the last element to 0
		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );

		// load third A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

		vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
		vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
		vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );

		// set the last element to 0
		// (fixed: this used to re-zero vecVertA2/B2/C2 and leave garbage in
		// the w lane of the third triangle's vertices)
		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );

		// load the fourth A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );

		vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
		vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
		vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );

		// set the last element to 0
		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#else
		// load first A B C
		vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

		// load second A B C
		vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );

		// load third A B C
		vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );

		// load the fourth A B C
		vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#endif
		// calculate d0 and d1 for each (edge vectors B-A and C-A)
		vecD0 = vec_sub( vecVertB, vecVertA );
		vecD1 = vec_sub( vecVertC, vecVertA );

		vecD2 = vec_sub( vecVertB2, vecVertA2 );
		vecD3 = vec_sub( vecVertC2, vecVertA2 );

		vecD4 = vec_sub( vecVertB3, vecVertA3 );
		vecD5 = vec_sub( vecVertC3, vecVertA3 );

		vecD6 = vec_sub( vecVertB4, vecVertA4 );
		vecD7 = vec_sub( vecVertC4, vecVertA4 );

		// cross products via rotated lanes: n = d1 x d0
		vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
		vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
		vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
		vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
		vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
		vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
		vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
		vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );

		vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
		vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
		vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
		vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

		vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
		vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
		vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
		vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
		vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
		vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
		vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
		vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );

		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

		vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
		vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
		vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
		vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );

		// transpose vecNs so each register holds one component of all 4 normals
		vector float v0, v1, v2, v3;
		v0 = vec_mergeh( vecN, vecN3 );
		v1 = vec_mergeh( vecN2, vecN4 );
		v2 = vec_mergel( vecN, vecN3 );
		v3 = vec_mergel( vecN2, vecN4 );

		vecN = vec_mergeh( v0, v1 );
		vecN2 = vec_mergel( v0, v1 );
		vecN3 = vec_mergeh( v2, v3 );
		vecN4 = vec_mergel( v2, v3 );

		// normalize: 1/sqrt( x^2 + y^2 + z^2 ) for all 4 normals at once
		vecF = vec_madd( vecN, vecN, zeroVector );
		vecF = vec_madd( vecN2, vecN2, vecF );
		vecF = vec_madd( vecN3, vecN3, vecF );

		vecF = ReciprocalSquareRoot( vecF );

		vecF1 = vec_madd( vecF, vecN, zeroVector );
		vecF2 = vec_madd( vecF, vecN2, zeroVector );
		vecF3 = vec_madd( vecF, vecN3, zeroVector );
		vecF4 = vec_madd( vecF, vecN4, zeroVector );

		vector float v8, v9, v10, v11;
		v8 = vecF1;
		v9 = vecF2;
		v10 = vecF3;
		v11 = vecF4;

		// transpose vecVerts (the A vertices) to compute the plane distances
		v0 = vec_mergeh( vecVertA, vecVertA3 );
		v1 = vec_mergeh( vecVertA2, vecVertA4 );
		v2 = vec_mergel( vecVertA, vecVertA3 );
		v3 = vec_mergel( vecVertA2, vecVertA4 );

		vecVertA = vec_mergeh( v0, v1 );
		vecVertA2 = vec_mergel( v0, v1 );
		vecVertA3 = vec_mergeh( v2, v3 );
		vecVertA4 = vec_mergel( v2, v3 );

		// plane distance d = -( N . A ) for each triangle
		vector float vecTotals;
		vecTotals = vec_madd( vecVertA, v8, zeroVector );
		vecTotals = vec_madd( vecVertA2, v9, vecTotals );
		vecTotals = vec_madd( vecVertA3, v10, vecTotals );
		vecTotals = vec_madd( vecVertA4, v11, vecTotals );
		vecF = vec_madd( vecTotals, vecNegOne, zeroVector );

		// transpose vecFs back so each register is one full plane ( Nx Ny Nz d )
		v0 = vec_mergeh( vecF1, vecF3 );
		v1 = vec_mergeh( vecF2, vecF );
		v2 = vec_mergel( vecF1, vecF3 );
		v3 = vec_mergel( vecF2, vecF );

		vecF1 = vec_mergeh( v0, v1 );
		vecF2 = vec_mergel( v0, v1 );
		vecF3 = vec_mergeh( v2, v3 );
		vecF4 = vec_mergel( v2, v3 );

		// store results
		UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
	}

	// cleanup: remaining triangles one at a time with scalar code
	for ( ; i < numIndexes; i += 3, j++ ) {
		const idDrawVert *a, *b, *c;
		float d0[3], d1[3], f;
		idVec3 n;

		a = verts + indexes[i + 0];
		b = verts + indexes[i + 1];
		c = verts + indexes[i + 2];

		d0[0] = b->xyz[0] - a->xyz[0];
		d0[1] = b->xyz[1] - a->xyz[1];
		d0[2] = b->xyz[2] - a->xyz[2];

		d1[0] = c->xyz[0] - a->xyz[0];
		d1[1] = c->xyz[1] - a->xyz[1];
		d1[2] = c->xyz[2] - a->xyz[2];

		n[0] = d1[1] * d0[2] - d1[2] * d0[1];
		n[1] = d1[2] * d0[0] - d1[0] * d0[2];
		n[2] = d1[0] * d0[1] - d1[1] * d0[0];

		f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
		//idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );

		n.x *= f;
		n.y *= f;
		n.z *= f;

		planes[j].SetNormal( n );
		planes[j].FitThroughPoint( a->xyz );
	}
}
/*
============
idSIMD_AltiVec::DeriveTangents
Derives the normal and orthogonal tangent vectors for the triangle vertices.
For each vertex the normal and tangent vectors are derived from all triangles
using the vertex which results in smooth tangents across the mesh.
In the process the triangle planes are calculated as well.
============
*/
/*
	Derives the normal and both tangent vectors for every triangle and
	accumulates them into the triangle's vertices, giving smooth tangents
	across the mesh. The triangle planes are written to 'planes' as a side
	effect. Scalar implementation; the three reciprocal square roots per
	triangle are pipelined together through FastScalarInvSqrt_x3.
*/
void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	// per-vertex flag: has this vertex already received a contribution?
	bool *seen = (bool *)_alloca16( numVerts * sizeof( seen[0] ) );
	memset( seen, 0, numVerts * sizeof( seen[0] ) );

	idPlane *planeOut = planes;

	for ( int tri = 0; tri < numIndexes; tri += 3 ) {
		const int i0 = indexes[tri + 0];
		const int i1 = indexes[tri + 1];
		const int i2 = indexes[tri + 2];

		idDrawVert *va = verts + i0;
		idDrawVert *vb = verts + i1;
		idDrawVert *vc = verts + i2;

		// edge deltas in position (xyz) and texture space (st)
		float e0[5], e1[5];
		e0[0] = vb->xyz[0] - va->xyz[0];
		e0[1] = vb->xyz[1] - va->xyz[1];
		e0[2] = vb->xyz[2] - va->xyz[2];
		e0[3] = vb->st[0] - va->st[0];
		e0[4] = vb->st[1] - va->st[1];

		e1[0] = vc->xyz[0] - va->xyz[0];
		e1[1] = vc->xyz[1] - va->xyz[1];
		e1[2] = vc->xyz[2] - va->xyz[2];
		e1[3] = vc->st[0] - va->st[0];
		e1[4] = vc->st[1] - va->st[1];

		// triangle normal (cross product of the edge vectors)
		idVec3 n;
		n[0] = e1[1] * e0[2] - e1[2] * e0[1];
		n[1] = e1[2] * e0[0] - e1[0] * e0[2];
		n[2] = e1[0] * e0[1] - e1[1] * e0[0];
		float invLenN = n.x * n.x + n.y * n.y + n.z * n.z;

		// signed texture-space area; its sign decides tangent handedness
		const float area = e0[3] * e1[4] - e0[4] * e1[3];

		// first tangent (u direction)
		idVec3 t0;
		t0[0] = e0[0] * e1[4] - e0[4] * e1[0];
		t0[1] = e0[1] * e1[4] - e0[4] * e1[1];
		t0[2] = e0[2] * e1[4] - e0[4] * e1[2];
		float invLenT0 = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;

		// second tangent (v direction)
		idVec3 t1;
		t1[0] = e0[3] * e1[0] - e0[0] * e1[3];
		t1[1] = e0[3] * e1[1] - e0[1] * e1[3];
		t1[2] = e0[3] * e1[2] - e0[2] * e1[3];
		float invLenT1 = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;

		// turn all three squared lengths into inverse lengths, pipelined
		FastScalarInvSqrt_x3( &invLenN, &invLenT0, &invLenT1 );

#ifdef PPC_INTRINSICS
		// branch-free sign select keyed on the area
		invLenT0 = __fsel( area, invLenT0, -invLenT0 );
		invLenT1 = __fsel( area, invLenT1, -invLenT1 );
#else
		invLenT0 = ( area < 0.0f ) ? -invLenT0 : invLenT0;
		invLenT1 = ( area < 0.0f ) ? -invLenT1 : invLenT1;
#endif

		t0.x *= invLenT0;
		t0.y *= invLenT0;
		t0.z *= invLenT0;

		n.x *= invLenN;
		n.y *= invLenN;
		n.z *= invLenN;

		planeOut->SetNormal( n );
		planeOut->FitThroughPoint( va->xyz );
		planeOut++;

		t1.x *= invLenT1;
		t1.y *= invLenT1;
		t1.z *= invLenT1;

		// accumulate into each of the triangle's vertices; the first
		// contribution assigns, later ones add
		idDrawVert * const triVerts[3] = { va, vb, vc };
		const int triIdx[3] = { i0, i1, i2 };
		for ( int k = 0; k < 3; k++ ) {
			idDrawVert *v = triVerts[k];
			if ( seen[ triIdx[k] ] ) {
				v->normal += n;
				v->tangents[0] += t0;
				v->tangents[1] += t1;
			} else {
				v->normal = n;
				v->tangents[0] = t0;
				v->tangents[1] = t1;
				seen[ triIdx[k] ] = true;
			}
		}
	}
}
#ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
/*
============
idSIMD_AltiVec::DeriveUnsmoothedTangents
Derives the normal and orthogonal tangent vectors for the triangle vertices.
For each vertex the normal and tangent vectors are derived from a single dominant triangle.
============
*/
#define DERIVE_UNSMOOTHED_BITANGENT
void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
int i;
// idDrawVert size
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
// drawverts aligned
assert( IS_16BYTE_ALIGNED( verts[0] ) );
vector float vecVertA, vecVertB, vecVertC;
vector float vecVertA2, vecVertB2, vecVertC2;
vector float vecVertA3, vecVertB3, vecVertC3;
vector float vecVertA4, vecVertB4, vecVertC4;
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
vector float vecS0, vecS1, vecS2;
vector float vecS0_2, vecS1_2, vecS2_2;
vector float vecS0_3, vecS1_3, vecS2_3;
vector float vecS0_4, vecS1_4, vecS2_4;
vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
vector float vecN, vecN2, vecN3, vecN4;
vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
vector float zeroVector = (vector float)(0);
vector float vecNegOne = (vector float)(-1.0);
vector float vecStore1, vecStore2, vecStore3;
vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
vector float vecLd1, vecLd2, vecLd3;
vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
float *normalPtr = verts[0].normal.ToFloatPtr();
float *xyzPtr = verts[0].xyz.ToFloatPtr();
vector float vecFirstHalf, vecSecondHalf;
vector float vecFirstHalf2, vecSecondHalf2;
vector float vecFirstHalf3, vecSecondHalf3;
vector float vecFirstHalf4, vecSecondHalf4;
for ( i = 0; i+3 < numVerts; i+=4 ) {
int bOffset1, bOffset2, bOffset3, bOffset4;
int cOffset1, cOffset2, cOffset3, cOffset4;
bOffset1 = dominantTris[i].v2;
cOffset1 = dominantTris[i].v3;
bOffset2 = dominantTris[i+1].v2;
cOffset2 = dominantTris[i+1].v3;
bOffset3 = dominantTris[i+2].v2;
cOffset3 = dominantTris[i+2].v3;
bOffset4 = dominantTris[i+3].v2;
cOffset4 = dominantTris[i+3].v3;
vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
vecVertA = vec_perm( v0, v1, vecPerm0 );
vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
vecVertB = vec_perm( v2, v3, vecPerm1 );
vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
vecVertC = vec_perm( v4, v5, vecPerm2 );
// put remainder into v2
v1 = vec_perm( v1, v1, vecPerm0 );
v3 = vec_perm( v3, v3, vecPerm1 );
v5 = vec_perm( v5, v5, vecPerm2 );
v1 = vec_mergeh( v1, v5 );
v2 = vec_mergeh( v3, zeroVector );
v2 = vec_mergeh( v1, v2 );
v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
// load second one
vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
vecVertA2 = vec_perm( v0, v1, vecPerm0 );
vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
vecVertB2 = vec_perm( v3, v4, vecPerm3 );
vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
vecVertC2 = vec_perm( v5, v6, vecPerm4 );
// put remainder into v3
v1 = vec_perm( v1, v1, vecPerm0 );
v4 = vec_perm( v4, v4, vecPerm3 );
v5 = vec_perm( v6, v6, vecPerm4 );
v1 = vec_mergeh( v1, v5 );
v3 = vec_mergeh( v4, zeroVector );
v3 = vec_mergeh( v1, v3 );
v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
// load third one
vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
vecVertA3 = vec_perm( v0, v1, vecPerm0 );
vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
vecVertB3 = vec_perm( v4, v5, vecPerm1 );
vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
vecVertC3 = vec_perm( v6, v7, vecPerm2 );
// put remainder into v4
v1 = vec_perm( v1, v1, vecPerm0 );
v5 = vec_perm( v5, v5, vecPerm1 );
v7 = vec_perm( v7, v7, vecPerm2 );
v1 = vec_mergeh( v1, v7 );
v4 = vec_mergeh( v5, zeroVector );
v4 = vec_mergeh( v1, v4 );
v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
// load fourth one
vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
vecVertA4 = vec_perm( v0, v1, vecPerm0 );
vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
vecVertB4 = vec_perm( v5, v6, vecPerm3 );
vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
vecVertC4 = vec_perm( v7, v8, vecPerm4 );
// put remainder into v5
v1 = vec_perm( v1, v1, vecPerm0 );
v6 = vec_perm( v6, v6, vecPerm3 );
v8 = vec_perm( v8, v8, vecPerm4 );
v1 = vec_mergeh( v1, v8 );
v5 = vec_mergeh( v6, zeroVector );
v5 = vec_mergeh( v1, v5 );
v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
// remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
//vecD1 now holds d0, d1, d2, d3
vecD1 = vec_sub( vecVertB, vecVertA );
vecD4 = vec_sub( vecVertB2, vecVertA2 );
vecD7 = vec_sub( vecVertB3, vecVertA3 );
vecD10 = vec_sub( vecVertB4, vecVertA4 );
// vecD2 how holds d5, d6, d7, d8
vecD2 = vec_sub( vecVertC, vecVertA );
vecD5 = vec_sub( vecVertC2, vecVertA2 );
vecD8 = vec_sub( vecVertC3, vecVertA3 );
vecD11 = vec_sub( vecVertC4, vecVertA4 );
// vecD3 now holds d4, crap, d9, crap
vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
// get permute vectors for loading from dt
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
// load S values from dominantTris
v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
v0 = vec_perm( v0, v1, vecPerm1 );
v2 = vec_perm( v2, v3, vecPerm2 );
v4 = vec_perm( v4, v5, vecPerm3 );
v6 = vec_perm( v6, v7, vecPerm4 );
vecS0 = vec_splat( v0, 0 );
vecS1 = vec_splat( v0, 1 );
vecS2 = vec_splat( v0, 2 );
vecS0_2 = vec_splat( v2, 0);
vecS1_2 = vec_splat( v2, 1 );
vecS2_2 = vec_splat( v2, 2 );
vecS0_3 = vec_splat( v4, 0 );
vecS1_3 = vec_splat( v4, 1 );
vecS2_3 = vec_splat( v4, 2 );
vecS0_4 = vec_splat( v6, 0 );
vecS1_4 = vec_splat( v6, 1 );
vecS2_4 = vec_splat( v6, 2 );
// do calculation
vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
// calculate N values
vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
// calculate both halves of the calculation for t
vecWork1 = vecD1;
vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
vecWork3 = vecD4;
vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
vecWork5 = vecD7;
vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
vecWork7 = vecD10;
vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
vecWork1 = vecD2;
vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
vecWork3 = vecD5;
vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
vecWork5 = vecD8;
vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
vecWork7 = vecD11;
vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
// calculate T values
vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
vecWork1 = vecD1;
vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
vecWork3 = vecD4;
vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
vecWork5 = vecD7;
vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
vecWork7 = vecD10;
vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
vecWork2 = vecD2;
vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
vecWork4 = vecD5;
vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
vecWork6 = vecD8;
vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
vecWork8 = vecD11;
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
#else
vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
#endif
// finish the calculation
vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
// Store results
// read values that we need to preserve
vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
//generate vectors to store
vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
// store out results
ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
// read values that we need to preserve
vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
// generate vectors to store
vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
// instead of doing permute, shift it where it needs to be and use vec_ste
// store out vectors
ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
// read values that we need to preserve
vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
// generate vectors to store
vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
// store out vectors
ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
// read values that we need to preserve
vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
// generate vectors to store
vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
// store out vectors
ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
}
// cleanup
for ( ; i < numVerts; i++ ) {
idDrawVert *a, *b, *c;
float d0, d1, d2, d3, d4;
float d5, d6, d7, d8, d9;
float s0, s1, s2;
float n0, n1, n2;
float t0, t1, t2;
float t3, t4, t5;
const dominantTri_s &dt = dominantTris[i];
a = verts + i;
b = verts + dt.v2;
c = verts + dt.v3;
d0 = b->xyz[0] - a->xyz[0];
d1 = b->xyz[1] - a->xyz[1];
d2 = b->xyz[2] - a->xyz[2];
d3 = b->st[0] - a->st[0];
d4 = b->st[1] - a->st[1];
d5 = c->xyz[0] - a->xyz[0];
d6 = c->xyz[1] - a->xyz[1];
d7 = c->xyz[2] - a->xyz[2];
d8 = c->st[0] - a->st[0];
d9 = c->st[1] - a->st[1];
s0 = dt.normalizationScale[0];
s1 = dt.normalizationScale[1];
s2 = dt.normalizationScale[2];
n0 = s2 * ( d6 * d2 - d7 * d1 );
n1 = s2 * ( d7 * d0 - d5 * d2 );
n2 = s2 * ( d5 * d1 - d6 * d0 );
t0 = s0 * ( d0 * d9 - d4 * d5 );
t1 = s0 * ( d1 * d9 - d4 * d6 );
t2 = s0 * ( d2 * d9 - d4 * d7 );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
t3 = s1 * ( d3 * d5 - d0 * d8 );
t4 = s1 * ( d3 * d6 - d1 * d8 );
t5 = s1 * ( d3 * d7 - d2 * d8 );
#else
t3 = s1 * ( n2 * t1 - n1 * t2 );
t4 = s1 * ( n0 * t2 - n2 * t0 );
t5 = s1 * ( n1 * t0 - n0 * t1 );
#endif
a->normal[0] = n0;
a->normal[1] = n1;
a->normal[2] = n2;
a->tangents[0][0] = t0;
a->tangents[0][1] = t1;
a->tangents[0][2] = t2;
a->tangents[1][0] = t3;
a->tangents[1][1] = t4;
a->tangents[1][2] = t5;
}
}
#else
/*
============
idSIMD_AltiVec::DeriveUnsmoothedTangents
Derives the normal and orthogonal tangent vectors for the triangle vertices.
For each vertex the normal and tangent vectors are derived from a single dominant triangle.
============
*/
#define DERIVE_UNSMOOTHED_BITANGENT
void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
	// Scalar implementation: each vertex takes its normal and both tangents
	// from a single dominant triangle ( vertex i, dt.v2, dt.v3 ), pre-scaled
	// by the triangle's precomputed normalizationScale factors.
	for ( int i = 0; i < numVerts; i++ ) {
		const dominantTri_s &tri = dominantTris[i];
		idDrawVert *vertA = verts + i;
		const idDrawVert *vertB = verts + tri.v2;
		const idDrawVert *vertC = verts + tri.v3;

		// position deltas of the two triangle edges that share vertex A
		const float ex0 = vertB->xyz[0] - vertA->xyz[0];
		const float ex1 = vertB->xyz[1] - vertA->xyz[1];
		const float ex2 = vertB->xyz[2] - vertA->xyz[2];
		const float fx0 = vertC->xyz[0] - vertA->xyz[0];
		const float fx1 = vertC->xyz[1] - vertA->xyz[1];
		const float fx2 = vertC->xyz[2] - vertA->xyz[2];
		// texture-coordinate deltas along the same two edges
		const float eu = vertB->st[0] - vertA->st[0];
		const float ev = vertB->st[1] - vertA->st[1];
		const float fu = vertC->st[0] - vertA->st[0];
		const float fv = vertC->st[1] - vertA->st[1];
		// precomputed normalization scales for tangent[0], tangent[1], normal
		const float scaleT0 = tri.normalizationScale[0];
		const float scaleT1 = tri.normalizationScale[1];
		const float scaleN = tri.normalizationScale[2];

		// normal: scaled cross product of the two position edges
		const float n0 = scaleN * ( fx1 * ex2 - fx2 * ex1 );
		const float n1 = scaleN * ( fx2 * ex0 - fx0 * ex2 );
		const float n2 = scaleN * ( fx0 * ex1 - fx1 * ex0 );
		// first tangent follows the direction of increasing s
		const float t0 = scaleT0 * ( ex0 * fv - ev * fx0 );
		const float t1 = scaleT0 * ( ex1 * fv - ev * fx1 );
		const float t2 = scaleT0 * ( ex2 * fv - ev * fx2 );
#ifndef DERIVE_UNSMOOTHED_BITANGENT
		// second tangent derived directly from the texture gradients
		const float t3 = scaleT1 * ( eu * fx0 - ex0 * fu );
		const float t4 = scaleT1 * ( eu * fx1 - ex1 * fu );
		const float t5 = scaleT1 * ( eu * fx2 - ex2 * fu );
#else
		// second tangent (bitangent): scaled cross product of normal and first tangent
		const float t3 = scaleT1 * ( n2 * t1 - n1 * t2 );
		const float t4 = scaleT1 * ( n0 * t2 - n2 * t0 );
		const float t5 = scaleT1 * ( n1 * t0 - n0 * t1 );
#endif

		vertA->normal[0] = n0;
		vertA->normal[1] = n1;
		vertA->normal[2] = n2;
		vertA->tangents[0][0] = t0;
		vertA->tangents[0][1] = t1;
		vertA->tangents[0][2] = t2;
		vertA->tangents[1][0] = t3;
		vertA->tangents[1][1] = t4;
		vertA->tangents[1][2] = t5;
	}
}
#endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
/*
============
idSIMD_AltiVec::NormalizeTangents
Normalizes each vertex normal and projects and normalizes the
tangent vectors onto the plane orthogonal to the vertex normal.
============
*/
void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
// Normalizes each vertex normal, then projects both tangents onto the plane
// orthogonal to that normal and normalizes them as well. Processes four
// idDrawVerts per iteration with AltiVec; a scalar loop handles the tail.
// the pointer arithmetic below relies on DRAWVERT_OFFSET being the idDrawVert stride in floats
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
float *addr = verts[0].normal.ToFloatPtr();
float *tAddr = verts[0].tangents[0].ToFloatPtr();
// v0 through v3 maintain originally loaded values so we don't take
// as much hit for unaligned stores
vector float v0, v1, v2, v3;
// v5 through v8 are the "working" values of the vectors
vector float v5, v6, v7, v8;
// working values
vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
vector float vecF, vecF2;
vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
register vector float zeroVector = (vector float)(0.0);
// first 8 bytes of src1 followed by first 8 bytes of src2
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
// first 12 bytes of src1 followed by first 4 bytes of src2 (used with zeroVector to clear element 3)
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
// element 0 of src1 splatted into slots 0..2, element 0 of src2 (zero) in slot 3
vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
vector unsigned char storeT41, storeT42;
int i = 0;
if ( i+3 < numVerts ) {
// Precompute the unaligned load/store permute vectors once. They depend only
// on the alignment of the first four vertices; reusing them on every loop
// iteration assumes the lane alignments repeat every four vertices, i.e. that
// 4 * sizeof(idDrawVert) is a multiple of 16 bytes -- NOTE(review): verify
// this holds for the idDrawVert layout asserted above.
// for loading normal from idDrawVert
vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
// for loading tangents from idDrawVert ( tangents[1] starts 3 floats after tangents[0] )
vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
// generate permute vectors to store normals
storePerm0 = vec_lvsr( 0, addr );
storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
// generate permute vectors to store tangents
storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
}
for ( ; i+3 < numVerts; i+=4 ) {
// load normals. vec_ld truncates its effective address to a 16-byte
// boundary, so loading at byte offsets 0 and 15 fetches the one or two
// quadwords covering the unaligned 12-byte normal; vec_perm then shifts
// the bytes into place.
vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
// zero out the last element of each vector; it holds whatever followed the
// normal in memory and must not contribute to the dot products below
v0 = vec_perm( v0, zeroVector, vecPermLast );
v1 = vec_perm( v1, zeroVector, vecPermLast );
v2 = vec_perm( v2, zeroVector, vecPermLast );
v3 = vec_perm( v3, zeroVector, vecPermLast );
// got 4 vectors in v0 through v3, sum each across (giving the squared
// length of each normal) and gather the four sums into one vector
vecTemp = vec_madd( v0, v0, zeroVector );
vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
// element 0 of vecSum now has the squared length of v0
vecTemp2 = vec_madd( v1, v1, zeroVector );
tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
// put this into vecSum
vecSum = vec_mergeh( vecSum, tempSum );
vecTemp3 = vec_madd( v2, v2, zeroVector );
tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
// put this into vecSum
vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
vecTemp4 = vec_madd( v3, v3, zeroVector );
tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
// put this into vecSum
vecSum = vec_perm( vecSum, tempSum, vecPermLast );
// take reciprocal square roots of all four squared lengths at once
vecF = ReciprocalSquareRoot( vecSum );
// multiply each normal by its 1/length to normalize it
v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
// load tangents as unaligned. Both tangents of a vertex are 24 contiguous
// bytes; offsets 0, 11 and 23 fetch the quadwords containing the first,
// middle and last bytes of that span.
vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
// vecNT0 = tangents[0] of vertex N, vecNT1 = tangents[1] of vertex N
vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
// zero out last element of tangents so it doesn't pollute the dot products
vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
// all tangents[0]: for each vertex compute t -= ( t . n ) * n, i.e. remove
// the component of the tangent parallel to the (normalized) normal
tempSum = zeroVector;
tempSum = vec_madd( vec1T0, v5, tempSum );
// sum across tempSum to form the dot product ( t . n )
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
// splat the dot product across vecTSum1 (element 3 zeroed)
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
// vec1T0 now contains what needs to be rsqrt'd and multiplied by f
vec1T0 = vec_sub( vec1T0, vecTSum1 );
tempSum = zeroVector;
tempSum = vec_madd( vec2T0, v6, tempSum );
// sum across tempSum
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
vec2T0 = vec_sub( vec2T0, vecTSum1 );
tempSum = zeroVector;
tempSum = vec_madd( vec3T0, v7, tempSum );
// sum across tempSum
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
vec3T0 = vec_sub( vec3T0, vecTSum1 );
tempSum = zeroVector;
tempSum = vec_madd( vec4T0, v8, tempSum );
// sum across tempSum
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
vec4T0 = vec_sub( vec4T0, vecTSum1 );
// all tangents[1]: same projection for the second tangent of each vertex
tempSum = zeroVector;
tempSum = vec_madd( vec1T1, v5, tempSum );
// sum across tempSum
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
// vec1T1 now contains what needs to be rsqrt'd and multiplied by f
vec1T1 = vec_sub( vec1T1, vecTSum1 );
tempSum = zeroVector;
tempSum = vec_madd( vec2T1, v6, tempSum );
// sum across tempSum
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
vec2T1 = vec_sub( vec2T1, vecTSum1 );
tempSum = zeroVector;
tempSum = vec_madd( vec3T1, v7, tempSum );
// sum across tempSum
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
vec3T1 = vec_sub( vec3T1, vecTSum1 );
tempSum = zeroVector;
tempSum = vec_madd( vec4T1, v8, tempSum );
// sum across tempSum
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
vec4T1 = vec_sub( vec4T1, vecTSum1 );
// squared lengths of the four projected tangents[0], gathered into vecTSum1
vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
// element 0 of vecTSum1 now has the squared length of vec1T0
vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
// put this into vecTSum1
vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
// put this into vecTSum1
vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
// put this into vecTSum1
vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
// squared lengths of the four projected tangents[1], gathered into vecTSum2
vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
// element 0 of vecTSum2 now has the squared length of vec1T1
vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
// put this into vecTSum2
vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
// put this into vecTSum2
vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
// put this into vecTSum2
vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
// 1/length for all four tangents[0]
vecF = ReciprocalSquareRoot( vecTSum1 );
// 1/length for all four tangents[1]
vecF2 = ReciprocalSquareRoot( vecTSum2 );
// multiply each tangent vector by its 1/length to normalize it
vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
// rotate each normal into store position with the lvsr permutes, then
// write exactly three floats with vec_ste so the neighboring idDrawVert
// fields are left untouched
v5 = vec_perm( v5, v5, storePerm0 );
v6 = vec_perm( v6, v6, storePerm1 );
v7 = vec_perm( v7, v7, storePerm2 );
v8 = vec_perm( v8, v8, storePerm3 );
vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
// store tangents[0] and tangents[1] the same way (3 floats each, at byte
// offsets 0 and 12 of the vertex's tangent block)
vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
// store second tangents[0] and tangents[1]
vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
// store third tangents[0] and tangents[1]
vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
// store fourth tangents[0] and tangents[1]
vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
}
// scalar cleanup for the last (numVerts % 4) vertices
for ( ; i < numVerts; i++ ) {
idVec3 &v = verts[i].normal;
float f;
//f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
v.x *= f; v.y *= f; v.z *= f;
for ( int j = 0; j < 2; j++ ) {
idVec3 &t = verts[i].tangents[j];
// project tangent onto the plane orthogonal to the normal, then normalize
t -= ( t * v ) * v;
// f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
t.x *= f; t.y *= f; t.z *= f;
}
}
}
#endif /* ENABLE_DERIVE */
#ifdef ENABLE_CREATE
/*
============
idSIMD_AltiVec::CreateTextureSpaceLightVectors
Calculates light vectors in texture space for the given triangle vertices.
For each vertex the direction towards the light origin is projected onto texture space.
The light vectors are only calculated for the vertices referenced by the indexes.
============
*/
void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
// flag every vertex that the index list references; light vectors are only
// written for those vertices
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
int n;
// unrolled by eight to cut loop overhead
for ( n = 0; n + 7 < numIndexes; n += 8 ) {
used[indexes[n+0]] = true;
used[indexes[n+1]] = true;
used[indexes[n+2]] = true;
used[indexes[n+3]] = true;
used[indexes[n+4]] = true;
used[indexes[n+5]] = true;
used[indexes[n+6]] = true;
used[indexes[n+7]] = true;
}
// remaining indexes
for ( ; n < numIndexes; n++ ) {
used[indexes[n]] = true;
}
// process the vertices two at a time; the projection is computed
// unconditionally and only stored when the vertex is referenced
for ( n = 0; n + 1 < numVerts; n += 2 ) {
const idDrawVert *va = &verts[n];
const idDrawVert *vb = &verts[n+1];
idVec3 dirA, dirB;
// direction from each vertex towards the light
dirA[0] = lightOrigin[0] - va->xyz[0];
dirA[1] = lightOrigin[1] - va->xyz[1];
dirA[2] = lightOrigin[2] - va->xyz[2];
dirB[0] = lightOrigin[0] - vb->xyz[0];
dirB[1] = lightOrigin[1] - vb->xyz[1];
dirB[2] = lightOrigin[2] - vb->xyz[2];
// project the light direction onto the tangent space basis ( tangent0,
// tangent1, normal ) of each vertex
const float sA = dirA[0] * va->tangents[0][0] + dirA[1] * va->tangents[0][1] + dirA[2] * va->tangents[0][2];
const float tA = dirA[0] * va->tangents[1][0] + dirA[1] * va->tangents[1][1] + dirA[2] * va->tangents[1][2];
const float rA = dirA[0] * va->normal[0] + dirA[1] * va->normal[1] + dirA[2] * va->normal[2];
const float sB = dirB[0] * vb->tangents[0][0] + dirB[1] * vb->tangents[0][1] + dirB[2] * vb->tangents[0][2];
const float tB = dirB[0] * vb->tangents[1][0] + dirB[1] * vb->tangents[1][1] + dirB[2] * vb->tangents[1][2];
const float rB = dirB[0] * vb->normal[0] + dirB[1] * vb->normal[1] + dirB[2] * vb->normal[2];
if ( used[n] ) {
lightVectors[n][0] = sA;
lightVectors[n][1] = tA;
lightVectors[n][2] = rA;
}
if ( used[n+1] ) {
lightVectors[n+1][0] = sB;
lightVectors[n+1][1] = tB;
lightVectors[n+1][2] = rB;
}
}
// odd vertex left over when numVerts is not a multiple of two
for ( ; n < numVerts; n++ ) {
if ( !used[n] ) {
continue;
}
const idDrawVert *v = &verts[n];
idVec3 dir;
dir[0] = lightOrigin[0] - v->xyz[0];
dir[1] = lightOrigin[1] - v->xyz[1];
dir[2] = lightOrigin[2] - v->xyz[2];
lightVectors[n][0] = dir[0] * v->tangents[0][0] + dir[1] * v->tangents[0][1] + dir[2] * v->tangents[0][2];
lightVectors[n][1] = dir[0] * v->tangents[1][0] + dir[1] * v->tangents[1][1] + dir[2] * v->tangents[1][2];
lightVectors[n][2] = dir[0] * v->normal[0] + dir[1] * v->normal[1] + dir[2] * v->normal[2];
}
}
#if 1
/*
============
idSIMD_AltiVec::CreateSpecularTextureCoords
Calculates specular texture coordinates for the given triangle vertices.
For each vertex the normalized direction towards the light origin is added to the
normalized direction towards the view origin and the result is projected onto texture space.
The texture coordinates are only calculated for the vertices referenced by the indexes.
============
*/
void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
// For each referenced vertex the normalized direction towards the light is
// added to the normalized direction towards the viewer and the half-vector is
// projected onto texture space, storing ( s, t, r, 1 ) in texCoords.
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
memset( used, 0, numVerts * sizeof( used[0] ) );
int i;
// mark referenced vertices, unrolled by eight
for ( i = 0; i+7 < numIndexes; i+= 8 ) {
used[indexes[i]] = true;
used[indexes[i+1]] = true;
used[indexes[i+2]] = true;
used[indexes[i+3]] = true;
used[indexes[i+4]] = true;
used[indexes[i+5]] = true;
used[indexes[i+6]] = true;
used[indexes[i+7]] = true;
}
for ( ; i < numIndexes; i++ ) {
used[indexes[i]] = true;
}
// load lightOrigin and viewOrigin into vectors. lane 3 of each vector is
// whatever happens to follow the origin in memory and must never contribute
// to the sums below
const float *lightOriginPtr = lightOrigin.ToFloatPtr();
const float *viewOriginPtr = viewOrigin.ToFloatPtr();
vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
vector float v0 = vec_ld( 0, lightOriginPtr );
vector float v1 = vec_ld( 15, lightOriginPtr );
vector float v2 = vec_ld( 0, viewOriginPtr );
vector float v3 = vec_ld( 15, viewOriginPtr );
vector float vecLightOrigin = vec_perm( v0, v1, permVec );
vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
const vector float zeroVector = (vector float)(0);
int index;
// two vertices per iteration
for ( index = 0; index+1 < numVerts; index+=2 ) {
const float *vertPtr = verts[index].xyz.ToFloatPtr();
const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 15, vertPtr );
vector float v2 = vec_ld( 31, vertPtr );
vector float v3 = vec_ld( 47, vertPtr );
vector float v4 = vec_ld( 63, vertPtr );
vector float v5 = vec_ld( 0, vertPtr2 );
vector float v6 = vec_ld( 15, vertPtr2 );
vector float v7 = vec_ld( 31, vertPtr2 );
vector float v8 = vec_ld( 47, vertPtr2 );
vector float v9 = vec_ld( 63, vertPtr2 );
// figure out what values go where
vector float vecXYZ = vec_perm( v0, v1, permVec );
vector float vecNormal = vec_perm( v1, v2, permVec );
vecNormal = vec_sld( vecNormal, vecNormal, 4 );
const vector float vecTangent0 = vec_perm( v2, v3, permVec );
permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
const vector float vecTangent1 = vec_perm( v3, v4, permVec );
vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
// calculate lightDir and viewDir for both vertices
vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
// calculate squared lengths
vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
// sum across the first 3 elements of each vector. the 8-byte shift must use
// the original squared vector, not the partial sum, so the unrelated fourth
// lane is excluded -- this matches the 3-element sums used for the texture
// coordinates further down
vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
vecTempLight = vec_add( tempSum, vec_sld( vecTempLight, vecTempLight, 8 ) );
vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
vecTempView = vec_add( tempSum2, vec_sld( vecTempView, vecTempView, 8 ) );
vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
vecTempLight2 = vec_add( tempSum4, vec_sld( vecTempLight2, vecTempLight2, 8 ) );
vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
vecTempView2 = vec_add( tempSum5, vec_sld( vecTempView2, vecTempView2, 8 ) );
// splat sum across the whole vector
vecTempLight = vec_splat( vecTempLight, 0 );
vecTempView = vec_splat( vecTempView, 0 );
vecTempLight2 = vec_splat( vecTempLight2, 0 );
vecTempView2 = vec_splat( vecTempView2, 0 );
// 1 / length for normalization
vecTempLight = ReciprocalSquareRoot( vecTempLight );
vecTempView = ReciprocalSquareRoot( vecTempView );
vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
// normalize both directions and add them to form the half-vector
vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
// project the half-vector onto the tangent space basis
vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
// sum across first 3 elements of each vector ( the 8-byte shift uses the
// original vector so lane 3 is excluded )
vector float tempSum3;
tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
vecTC0 = vec_splat( vecTC0, 0 );
vecTC1 = vec_splat( vecTC1, 0 );
vecTC2 = vec_splat( vecTC2, 0 );
vecTC3 = vec_splat( vecTC3, 0 );
vecTC4 = vec_splat( vecTC4, 0 );
vecTC5 = vec_splat( vecTC5, 0 );
if ( used[index] ) {
// store out results
vec_ste( vecTC0, 0, &texCoords[index][0] );
vec_ste( vecTC1, 0, &texCoords[index][1] );
vec_ste( vecTC2, 0, &texCoords[index][2] );
vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
}
if ( used[index+1] ) {
vec_ste( vecTC3, 0, &texCoords[index+1][0] );
vec_ste( vecTC4, 0, &texCoords[index+1][1] );
vec_ste( vecTC5, 0, &texCoords[index+1][2] );
vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
}
}
// cleanup for the odd vertex
for ( ; index < numVerts; index++ ) {
if ( !used[index] ) {
continue;
}
const float *vertPtr = verts[index].xyz.ToFloatPtr();
permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 15, vertPtr );
vector float v2 = vec_ld( 31, vertPtr );
vector float v3 = vec_ld( 47, vertPtr );
vector float v4 = vec_ld( 63, vertPtr );
// figure out what values go where
vector float vecXYZ = vec_perm( v0, v1, permVec );
vector float vecNormal = vec_perm( v1, v2, permVec );
vecNormal = vec_sld( vecNormal, vecNormal, 4 );
const vector float vecTangent0 = vec_perm( v2, v3, permVec );
permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
const vector float vecTangent1 = vec_perm( v3, v4, permVec );
// calculate lightDir
vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
// calculate squared lengths
vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
// sum across first 3 elements ( 8-byte shift uses the original vector to
// exclude the unrelated fourth lane, as in the main loop )
vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
vecTempLight = vec_add( tempSum, vec_sld( vecTempLight, vecTempLight, 8 ) );
vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
vecTempView = vec_add( tempSum2, vec_sld( vecTempView, vecTempView, 8 ) );
// splat sum across the whole vector
vecTempLight = vec_splat( vecTempLight, 0 );
vecTempView = vec_splat( vecTempView, 0 );
vecTempLight = ReciprocalSquareRoot( vecTempLight );
vecTempView = ReciprocalSquareRoot( vecTempView );
// normalize and add to form the half-vector
vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
// project onto texture space
vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
// sum across first 3 elements of vector
vector float tempSum3;
tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
vecTC0 = vec_splat( vecTC0, 0 );
vecTC1 = vec_splat( vecTC1, 0 );
vecTC2 = vec_splat( vecTC2, 0 );
// store out results
vec_ste( vecTC0, 0, &texCoords[index][0] );
vec_ste( vecTC1, 0, &texCoords[index][1] );
vec_ste( vecTC2, 0, &texCoords[index][2] );
vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
}
}
#endif /* 0 for disable spec coord */
#if 1
#ifdef VERTEXCACHE_ALIGNED
/*
============
idSIMD_AltiVec::CreateShadowCache
============
*/
int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
// For every vertex whose remap entry is still 0 this appends two idVec4's to
// the cache: ( xyz, 1 ) and ( xyz - lightOrigin, 0 ), records the cache
// position in vertRemap[i], and returns the total number of idVec4's written.
int outVerts = 0;
int i = 0;
// the vec_st stores below require a 16 byte aligned destination
assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
register vector float zeroVector = (vector float)(0.0);
register vector float oneVector = (vector float)(1);
// selects the first three floats of the first operand and the first float of the second
register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
const float *lPtr = lightOrigin.ToFloatPtr();
const float *vPtr;
const float *vPtr2;
const float *vPtr3;
const float *vPtr4;
// put values into a vector
vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
v0 = vec_ld( 0, lPtr );
v1 = vec_ld( 15, lPtr );
v0 = vec_perm( v0, v1, vecPerm );
v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
//v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
for ( ; i+3 < numVerts; i+= 4 ) {
if ( ! vertRemap[i] ) {
vPtr = verts[i].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
// unaligned load: fetch the block containing the first byte and the block
// containing the last byte ( offset 15 ), then merge
vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
v2 = vec_ld( 0, vPtr );
v3 = vec_ld( 15, vPtr );
v7 = vec_perm( v2, v3, vecPerm2 );
#else
v7 = vec_ld( 0, vPtr );
#endif
v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
v3 = vec_perm( v7, oneVector, vecPermZeroLast );
v1 = vec_sub( v2, v0 );
vec_st( v3, 0, &vertexCache[outVerts][0] );
vec_st( v1, 0, &vertexCache[outVerts+1][0] );
vertRemap[i] = outVerts;
outVerts += 2;
}
if ( ! vertRemap[i+1] ) {
vPtr2 = verts[i+1].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
v4 = vec_ld( 0, vPtr2 );
v5 = vec_ld( 15, vPtr2 );
v6 = vec_perm( v4, v5, vecPerm3 );
#else
v6 = vec_ld( 0, vPtr2 );
#endif
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
v6 = vec_sub( v4, v0 );
vec_st( v5, 0, &vertexCache[outVerts][0] );
vec_st( v6, 0, &vertexCache[outVerts+1][0] );
vertRemap[i+1] = outVerts;
outVerts += 2;
}
if ( ! vertRemap[i+2] ) {
vPtr3 = verts[i+2].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
v1 = vec_ld( 0, vPtr3 );
v2 = vec_ld( 15, vPtr3 );
v3 = vec_perm( v1, v2, vecPerm4 );
#else
v3 = vec_ld( 0, vPtr3 );
#endif
v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
v2 = vec_perm( v3, oneVector, vecPermZeroLast );
v3 = vec_sub( v1, v0 );
vec_st( v2, 0, &vertexCache[outVerts][0] );
vec_st( v3, 0, &vertexCache[outVerts+1][0] );
vertRemap[i+2] = outVerts;
outVerts += 2;
}
if ( ! vertRemap[i+3] ) {
vPtr4 = verts[i+3].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
v4 = vec_ld( 0, vPtr4 );
// offset 15 ( not 16 ) so we load the block containing the last needed
// byte; offset 16 would read one block past the data when vPtr4 is
// 16 byte aligned
v5 = vec_ld( 15, vPtr4 );
v6 = vec_perm( v4, v5, vecPerm5 );
#else
v6 = vec_ld( 0, vPtr4 );
#endif
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
v6 = vec_sub( v4, v0 );
vec_st( v5, 0, &vertexCache[outVerts][0] );
vec_st( v6, 0, &vertexCache[outVerts+1][0] );
vertRemap[i+3] = outVerts;
outVerts += 2;
}
}
// cleanup
for (; i < numVerts; i++ ) {
if ( vertRemap[i] ) {
continue;
}
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[outVerts+0][0] = v[0];
vertexCache[outVerts+0][1] = v[1];
vertexCache[outVerts+0][2] = v[2];
vertexCache[outVerts+0][3] = 1.0f;
// R_SetupProjection() builds the projection matrix with a slight crunch
// for depth, which keeps this w=0 division from rasterizing right at the
// wrap around point and causing depth fighting with the rear caps
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
vertexCache[outVerts+1][3] = 0.0f;
vertRemap[i] = outVerts;
outVerts += 2;
}
return outVerts;
}
#else
/*
============
idSIMD_AltiVec::CreateShadowCache
============
*/
int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
// Unaligned-destination variant: for every vertex whose remap entry is still
// 0 this appends ( xyz, 1 ) and ( xyz - lightOrigin, 0 ) to the cache,
// records the cache position in vertRemap[i], and returns the number of
// idVec4's written.
int outVerts = 0;
int i = 0;
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
register vector float zeroVector = (vector float)(0.0);
register vector float oneVector = (vector float)(1);
// selects the first three floats of the first operand and the first float of the second
register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
const float *lPtr = lightOrigin.ToFloatPtr();
const float *vPtr;
const float *vPtr2;
const float *vPtr3;
const float *vPtr4;
// put values into a vector
vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
v0 = vec_ld( 0, lPtr );
v1 = vec_ld( 15, lPtr );
v0 = vec_perm( v0, v1, vecPerm );
v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
//v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
for ( ; i+3 < numVerts; i+= 4 ) {
if ( ! vertRemap[i] ) {
vPtr = verts[i].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
// unaligned load: fetch the block containing the first byte and the block
// containing the last byte ( offset 15 ), then merge
vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
v2 = vec_ld( 0, vPtr );
v3 = vec_ld( 15, vPtr );
v7 = vec_perm( v2, v3, vecPerm2 );
#else
v7 = vec_ld( 0, vPtr );
#endif
v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
v3 = vec_perm( v7, oneVector, vecPermZeroLast );
v1 = vec_sub( v2, v0 );
// store results
UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
vertRemap[i] = outVerts;
outVerts += 2;
}
if ( ! vertRemap[i+1] ) {
vPtr2 = verts[i+1].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
v4 = vec_ld( 0, vPtr2 );
v5 = vec_ld( 15, vPtr2 );
v6 = vec_perm( v4, v5, vecPerm3 );
#else
v6 = vec_ld( 0, vPtr2 );
#endif
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
v6 = vec_sub( v4, v0 );
// store results
UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
vertRemap[i+1] = outVerts;
outVerts += 2;
}
if ( ! vertRemap[i+2] ) {
vPtr3 = verts[i+2].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
v1 = vec_ld( 0, vPtr3 );
v2 = vec_ld( 15, vPtr3 );
v3 = vec_perm( v1, v2, vecPerm4 );
#else
v3 = vec_ld( 0, vPtr3 );
#endif
v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
v2 = vec_perm( v3, oneVector, vecPermZeroLast );
v3 = vec_sub( v1, v0 );
// store results
UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
vertRemap[i+2] = outVerts;
outVerts += 2;
}
if ( ! vertRemap[i+3] ) {
vPtr4 = verts[i+3].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
v4 = vec_ld( 0, vPtr4 );
// offset 15 ( not 16 ) so we load the block containing the last needed
// byte; offset 16 would read one block past the data when vPtr4 is
// 16 byte aligned
v5 = vec_ld( 15, vPtr4 );
v6 = vec_perm( v4, v5, vecPerm5 );
#else
v6 = vec_ld( 0, vPtr4 );
#endif
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
v6 = vec_sub( v4, v0 );
// store results
UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
vertRemap[i+3] = outVerts;
outVerts += 2;
}
}
// cleanup
for (; i < numVerts; i++ ) {
if ( vertRemap[i] ) {
continue;
}
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[outVerts+0][0] = v[0];
vertexCache[outVerts+0][1] = v[1];
vertexCache[outVerts+0][2] = v[2];
vertexCache[outVerts+0][3] = 1.0f;
// R_SetupProjection() builds the projection matrix with a slight crunch
// for depth, which keeps this w=0 division from rasterizing right at the
// wrap around point and causing depth fighting with the rear caps
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
vertexCache[outVerts+1][3] = 0.0f;
vertRemap[i] = outVerts;
outVerts += 2;
}
return outVerts;
}
#endif /* VERTEXCACHE_ALIGNED */
#endif /* 0 to disable shadow cache */
#if 1
#ifdef VERTEXCACHE_ALIGNED
/*
============
idSIMD_AltiVec::CreateVertexProgramShadowCache
============
*/
int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
// writes two idVec4's per input vertex: ( x, y, z, 1 ) followed by
// ( x, y, z, 0 ), and returns the number of idVec4's written ( numVerts * 2 )
// vertexCache aligned
assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
// idDrawVert size
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
// idVec4 size
assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float zeroVector = (vector float)(0.0);
register vector float oneVector = (vector float)(1);
// selects the first three floats of the first operand and the first float of the second
register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
int i = 0;
#ifndef DRAWVERT_PADDED
// every fourth one will have the same alignment. Make sure we've got enough here
// ( assumes sizeof(idDrawVert) * 4 is a multiple of 16 bytes so the lvsl masks
// taken from the first four vertices stay valid for every later group -- TODO confirm )
if ( i+3 < numVerts ) {
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
#endif
for ( ; i+3 < numVerts; i+=4 ) {
const float *vertPtr = verts[i].xyz.ToFloatPtr();
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
// unaligned loads: fetch the two 16 byte blocks covering each xyz and merge
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 15, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v3 = vec_ld( 15, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v5 = vec_ld( 15, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
v7 = vec_ld( 15, vertPtr4 );
v0 = vec_perm( v0, v1, vertPerm1 );
v1 = vec_perm( v2, v3, vertPerm2 );
v2 = vec_perm( v4, v5, vertPerm3 );
v3 = vec_perm( v6, v7, vertPerm4 );
#else
// padded idDrawVert: xyz starts 16 byte aligned, single load suffices
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 0, vertPtr2 );
v2 = vec_ld( 0, vertPtr3 );
v3 = vec_ld( 0, vertPtr4 );
#endif
// build ( xyz, 1 ) in v0..v3 and ( xyz, 0 ) in v4..v7
v0 = vec_perm( v0, oneVector, vecPermThreeOne );
v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
v1 = vec_perm( v1, oneVector, vecPermThreeOne );
v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
v2 = vec_perm( v2, oneVector, vecPermThreeOne );
v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
v3 = vec_perm( v3, oneVector, vecPermThreeOne );
v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
// store results
ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
}
// cleanup for the last 0-3 vertices
for ( ; i < numVerts; i++ ) {
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[i*2+0][0] = v[0];
vertexCache[i*2+1][0] = v[0];
vertexCache[i*2+0][1] = v[1];
vertexCache[i*2+1][1] = v[1];
vertexCache[i*2+0][2] = v[2];
vertexCache[i*2+1][2] = v[2];
vertexCache[i*2+0][3] = 1.0f;
vertexCache[i*2+1][3] = 0.0f;
}
return numVerts * 2;
}
#else
/*
============
idSIMD_AltiVec::CreateVertexProgramShadowCache
============
*/
int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
// unaligned-destination variant: writes two idVec4's per input vertex,
// ( x, y, z, 1 ) followed by ( x, y, z, 0 ), returns numVerts * 2
// idDrawVert size
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
// idVec4 size
assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
register vector float zeroVector = (vector float)(0.0);
register vector float oneVector = (vector float)(1);
// selects the first three floats of the first operand and the first float of the second
register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
int i = 0;
#ifndef DRAWVERT_PADDED
// every fourth one will have the same alignment. Make sure we've got enough here
// ( assumes sizeof(idDrawVert) * 4 is a multiple of 16 bytes so the lvsl masks
// taken from the first four vertices stay valid for every later group -- TODO confirm )
if ( i+3 < numVerts ) {
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
}
#endif
for ( ; i+3 < numVerts; i+=4 ) {
const float *vertPtr = verts[i].xyz.ToFloatPtr();
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
#ifndef DRAWVERT_PADDED
// unaligned loads: fetch the two 16 byte blocks covering each xyz and merge
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 15, vertPtr );
v2 = vec_ld( 0, vertPtr2 );
v3 = vec_ld( 15, vertPtr2 );
v4 = vec_ld( 0, vertPtr3 );
v5 = vec_ld( 15, vertPtr3 );
v6 = vec_ld( 0, vertPtr4 );
v7 = vec_ld( 15, vertPtr4 );
v0 = vec_perm( v0, v1, vertPerm1 );
v1 = vec_perm( v2, v3, vertPerm2 );
v2 = vec_perm( v4, v5, vertPerm3 );
v3 = vec_perm( v6, v7, vertPerm4 );
#else
// padded idDrawVert: xyz starts 16 byte aligned, single load suffices
v0 = vec_ld( 0, vertPtr );
v1 = vec_ld( 0, vertPtr2 );
v2 = vec_ld( 0, vertPtr3 );
v3 = vec_ld( 0, vertPtr4 );
#endif
// build ( xyz, 1 ) in v0..v3 and ( xyz, 0 ) in v4..v7
v0 = vec_perm( v0, oneVector, vecPermThreeOne );
v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
v1 = vec_perm( v1, oneVector, vecPermThreeOne );
v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
v2 = vec_perm( v2, oneVector, vecPermThreeOne );
v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
v3 = vec_perm( v3, oneVector, vecPermThreeOne );
v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
// store results as unaligned
// classic unaligned-store idiom: rotate each vector by the destination
// misalignment, then vec_sel neighboring pairs so each 16 byte block gets
// the correct halves; the edge loads vc1/vc2 preserve the bytes outside
// the 128 byte output window
vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
// right rotate input data
v0 = vec_perm( v0, v0, storePerm );
v4 = vec_perm( v4, v4, storePerm );
v1 = vec_perm( v1, v1, storePerm );
v5 = vec_perm( v5, v5, storePerm );
v2 = vec_perm( v2, v2, storePerm );
v6 = vec_perm( v6, v6, storePerm );
v3 = vec_perm( v3, v3, storePerm );
v7 = vec_perm( v7, v7, storePerm );
vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
}
// cleanup for the last 0-3 vertices
for ( ; i < numVerts; i++ ) {
const float *v = verts[i].xyz.ToFloatPtr();
vertexCache[i*2+0][0] = v[0];
vertexCache[i*2+1][0] = v[0];
vertexCache[i*2+0][1] = v[1];
vertexCache[i*2+1][1] = v[1];
vertexCache[i*2+0][2] = v[2];
vertexCache[i*2+1][2] = v[2];
vertexCache[i*2+0][3] = 1.0f;
vertexCache[i*2+1][3] = 0.0f;
}
return numVerts * 2;
}
#endif /* VERTEXCACHE_ALIGNED */
#endif /* 0 to kill VP shader cache */
#endif /* ENABLE_CREATE */
#ifdef ENABLE_SOUND_ROUTINES
#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::UpSamplePCMTo44kHz
Duplicate samples for 44kHz output.
Assumptions:
Assumes that dest starts at aligned address
============
*/
void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
// dest is aligned
assert( IS_16BYTE_ALIGNED( dest[0] ) );
vector signed short vs0, vs1;
register vector signed int vi0, vi1;
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
// permute vectors
register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
// If this can be assumed true, we can eliminate another conditional that checks to see if we can
// load up a vector before the loop
assert( numSamples >= 12 );
if ( kHz == 11025 ) {
if ( numChannels == 1 ) {
// 8 at a time
int i = 0;
vector signed short vsOld = vec_ld( 0, &src[i] );
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
for ( ; i+7 < numSamples; i+= 8 ) {
// load src
vs1 = vec_ld( 15, &src[i] );
vs0 = vec_perm( vsOld, vs1, permVec );
vsOld = vs1;
// unpack shorts to ints
vi0 = vec_unpackh( vs0 );
vi1 = vec_unpackl( vs0 );
// convert ints to floats
v0 = vec_ctf( vi0, 0 );
v1 = vec_ctf( vi1, 0 );
// permute into vectors in the order to store
v2 = vec_splat( v0, 0 );
v3 = vec_splat( v0, 1 );
v4 = vec_splat( v0, 2 );
v5 = vec_splat( v0, 3 );
v6 = vec_splat( v1, 0 );
v7 = vec_splat( v1, 1 );
v8 = vec_splat( v1, 2 );
v9 = vec_splat( v1, 3 );
// store results
ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
}
// cleanup
for (; i < numSamples; i++ ) {
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
}
} else {
int i = 0;
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
vector signed short vsOld = vec_ld( 0, &src[0] );
for ( ; i+7 < numSamples; i += 8 ) {
// load src
vs1 = vec_ld( 15, &src[i] );
vs0 = vec_perm( vsOld, vs1, permVec );
vsOld = vs1;
// unpack shorts to ints
vi0 = vec_unpackh( vs0 );
vi1 = vec_unpackl( vs0 );
// convert ints to floats
v0 = vec_ctf( vi0, 0 );
v1 = vec_ctf( vi1, 0 );
// put into vectors in order to store
v2 = vec_perm( v0, v0, vecFirstHalf );
v3 = v2;
v4 = vec_perm( v0, v0, vecSecondHalf );
v5 = v4;
v6 = vec_perm( v1, v1, vecFirstHalf );
v7 = v6;
v8 = vec_perm (v1, v1, vecSecondHalf );
v9 = v8;
// store results
ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
}
for ( ; i < numSamples; i += 2 ) {
dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
}
}
} else if ( kHz == 22050 ) {
if ( numChannels == 1 ) {
int i;
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
vector signed short vsOld = vec_ld( 0, &src[0] );
for ( i = 0; i+7 < numSamples; i += 8 ) {
// load src
vs1 = vec_ld( 0, &src[i] );
vs0 = vec_perm( vsOld, vs1, permVec );
vsOld = vs1;
// unpack shorts to ints
vi0 = vec_unpackh( vs0 );
vi1 = vec_unpackl( vs0 );
// convert ints to floats
v0 = vec_ctf( vi0, 0 );
v1 = vec_ctf( vi1, 0 );
// put into vectors in order to store
v2 = vec_perm( v0, v0, vecBottom );
v3 = vec_perm( v0, v0, vecTop );
v4 = vec_perm( v1, v1, vecBottom );
v5 = vec_perm (v1, v1, vecTop );
// store results
ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
}
// cleanup
for ( ; i < numSamples; i++ ) {
dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
}
} else {
int i;
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
vector signed short vsOld = vec_ld( 0, &src[0] );
for ( i = 0; i+7 < numSamples; i += 8 ) {
// load src
vs1 = vec_ld( 15, &src[i] );
vs0 = vec_perm( vsOld, vs1, permVec );
vsOld = vs1;
// unpack shorts to ints
vi0 = vec_unpackh( vs0 );
vi1 = vec_unpackl( vs0 );
// convert ints to floats
v0 = vec_ctf( vi0, 0 );
v1 = vec_ctf( vi1, 0 );
// put into vectors in order to store
v2 = vec_perm( v0, v0, vecFirstHalf );
v3 = vec_perm( v0, v0, vecSecondHalf );
v4 = vec_perm( v1, v1, vecFirstHalf );
v5 = vec_perm (v1, v1, vecSecondHalf );
// store results
ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
}
// cleanup
for ( ; i < numSamples; i += 2 ) {
dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
}
}
} else if ( kHz == 44100 ) {
int i;
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
vector signed short vsOld = vec_ld( 0, &src[0] );
for ( i = 0; i+7 < numSamples; i += 8 ) {
vs1 = vec_ld( 15, &src[i] );
vs0 = vec_perm( vsOld, vs1, permVec );
vsOld = vs1;
//unpack shorts to ints
vi0 = vec_unpackh( vs0 );
vi1 = vec_unpackl( vs0 );
//convert ints to floats
v0 = vec_ctf( vi0, 0 );
v1 = vec_ctf( vi1, 0 );
//store results
ALIGNED_STORE2( &dest[i], v0, v1 );
}
// cleanup
for ( ; i < numSamples; i++ ) {
dest[i] = (float) src[i];
}
} else {
assert( 0 );
}
}
#else
/*
============
idSIMD_AltiVec::UpSamplePCMTo44kHz

	Duplicate 16-bit PCM samples up to the 44kHz mix rate, converting to float.
	11025 Hz input is written 4x per sample, 22050 Hz 2x, 44100 Hz 1x; for
	stereo input the per-channel interleave is preserved.

	Assumptions:
		No assumptions -- neither src nor dest needs to be 16-byte aligned.
		Unaligned source reads use the pipelined vec_lvsl/vec_perm idiom;
		unaligned destination writes use a vec_lvsr permute plus a vec_sel
		edge mask so bytes outside the target range are preserved.

	NOTE(review): the stereo cleanup loops step by 2, which assumes numSamples
	is even for 2-channel input -- consistent with the rest of this file.
============
*/
void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
	vector signed short vs0, vs1;
	register vector signed int vi0, vi1;
	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
	// permute vectors: duplicate halves/pairs of a float vector for the 2x fan-out
	register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
	register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
	register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	// calculate perm vector and masks for stores
	vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
	// original values of dest, preserved at the edges via the select mask
	vector float vecDest = vec_ld( 0, &dest[0] );
	vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			// 8 at a time; each mono sample is splatted across a whole vector (4 copies)
			int i = 0;
			vector signed short vsOld = vec_ld( 0, &src[i] );
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
			for ( ; i+7 < numSamples; i+= 8 ) {
				// load src (offset 15 fetches the quadword after vsOld)
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;
				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
				// unpack shorts to ints
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				// convert ints to floats
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );
				// permute into vectors in the order to store
				v2 = vec_splat( v0, 0 );
				v3 = vec_splat( v0, 1 );
				v4 = vec_splat( v0, 2 );
				v5 = vec_splat( v0, 3 );
				v6 = vec_splat( v1, 0 );
				v7 = vec_splat( v1, 1 );
				v8 = vec_splat( v1, 2 );
				v9 = vec_splat( v1, 3 );
				// rotate for the unaligned destination
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );
				v8 = vec_perm( v8, v8, storePerm );
				v9 = vec_perm( v9, v9, storePerm );
				// store results: each store blends the tail of one vector with the head of the next
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
				vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
				vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
				vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
				vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
				vecDest = vec_sel( v9, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*4] );
			}
			// cleanup
			for (; i < numSamples; i++ ) {
				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
			}
		} else {
			// stereo: each L/R pair is written 4x, keeping the interleave
			int i = 0;
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );
			for ( ; i+7 < numSamples; i += 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;
				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
				// unpack shorts to ints
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				// convert ints to floats
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );
				// put into vectors in order to store
				v2 = vec_perm( v0, v0, vecFirstHalf );
				v3 = v2;
				v4 = vec_perm( v0, v0, vecSecondHalf );
				v5 = v4;
				v6 = vec_perm( v1, v1, vecFirstHalf );
				v7 = v6;
				v8 = vec_perm( v1, v1, vecSecondHalf );
				v9 = v8;
				// rotate for the unaligned destination
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );
				v8 = vec_perm( v8, v8, storePerm );
				v9 = vec_perm( v9, v9, storePerm );
				// store results
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
				vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
				vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
				vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
				vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
				vecDest = vec_sel( v9, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*4] );
			}
			for ( ; i < numSamples; i += 2 ) {
				dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
				dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
			}
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			int i;
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load src -- offset must be 15 (the quadword *after* vsOld), matching the
				// pipelined unaligned-load pattern used by every other rate in this function.
				// The previous offset of 0 re-read the same quadword as vsOld, so an
				// unaligned src permuted in stale bytes and produced wrong samples.
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;
				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
				// unpack shorts to ints
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				// convert ints to floats
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );
				// put into vectors in order to store (each sample duplicated once)
				v2 = vec_perm( v0, v0, vecBottom );
				v3 = vec_perm( v0, v0, vecTop );
				v4 = vec_perm( v1, v1, vecBottom );
				v5 = vec_perm( v1, v1, vecTop );
				// rotate for the unaligned destination
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				// store results
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
				vecDest = vec_sel( v5, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*2] );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
			}
		} else {
			int i;
			vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
			vector signed short vsOld = vec_ld( 0, &src[0] );
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load src
				vs1 = vec_ld( 15, &src[i] );
				vs0 = vec_perm( vsOld, vs1, permVec );
				vsOld = vs1;
				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
				// unpack shorts to ints
				vi0 = vec_unpackh( vs0 );
				vi1 = vec_unpackl( vs0 );
				// convert ints to floats
				v0 = vec_ctf( vi0, 0 );
				v1 = vec_ctf( vi1, 0 );
				// put into vectors in order to store (each L/R pair duplicated once)
				v2 = vec_perm( v0, v0, vecFirstHalf );
				v3 = vec_perm( v0, v0, vecSecondHalf );
				v4 = vec_perm( v1, v1, vecFirstHalf );
				v5 = vec_perm( v1, v1, vecSecondHalf );
				// rotate for the unaligned destination
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				// store results
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
				vecDest = vec_sel( v5, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*2] );
			}
			// cleanup
			for ( ; i < numSamples; i += 2 ) {
				dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
				dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
			}
		}
	} else if ( kHz == 44100 ) {
		// 1:1 -- just convert short to float
		int i;
		vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
		vector signed short vsOld = vec_ld( 0, &src[0] );
		for ( i = 0; i+7 < numSamples; i += 8 ) {
			vs1 = vec_ld( 15, &src[i] );
			vs0 = vec_perm( vsOld, vs1, permVec );
			vsOld = vs1;
			vector float vecDestEnd = vec_ld( 31, &dest[i] );
			//unpack shorts to ints
			vi0 = vec_unpackh( vs0 );
			vi1 = vec_unpackl( vs0 );
			//convert ints to floats
			v0 = vec_ctf( vi0, 0 );
			v1 = vec_ctf( vi1, 0 );
			v0 = vec_perm( v0, v0, storePerm );
			v1 = vec_perm( v1, v1, storePerm );
			// store results
			vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
			vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
			vecDest = vec_sel( v1, vecDestEnd, mask );
			vec_st( vecDest, 31, &dest[i] );
		}
		// cleanup
		for ( ; i < numSamples; i++ ) {
			dest[i] = (float) src[i];
		}
	} else {
		assert( 0 );
	}
}
#endif
#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::UpSampleOGGTo44kHz

	Duplicate decoded OGG float samples for 44kHz output, scaling each sample
	by 32768 (float [-1,1] range up to 16-bit PCM amplitude).  11025 Hz input
	is written 4x per sample, 22050 Hz 2x, 44100 Hz 1x; stereo output is
	interleaved L,R from ogg[0] / ogg[1].

	Assumptions:
		Assumes that dest starts at aligned address.  The ogg channel arrays
		may be unaligned; they are read with the pipelined vec_lvsl/vec_perm
		unaligned-load idiom.

	NOTE(review): for stereo, the loops process numSamples >> 1 entries per
	channel -- numSamples appears to count total samples across both
	channels; confirm against callers.
============
*/
void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
	// dest is aligned
	assert( IS_16BYTE_ALIGNED( dest[0] ) );
	register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
	register vector float constVec, zeroVector;
	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
	vector unsigned char vecPerm1;
	vector unsigned char vecPerm2;
	// duplicate each float pair within a vector (mono 2x fan-out)
	vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	// interleave one left + one right sample, each written twice (stereo 2x fan-out)
	vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
	vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
	vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
	vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
	constVec = (vector float)(32768.0f);
	zeroVector = (vector float)(0.0);
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
				v8 = v10;
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );
				// now we have the elements in a vector, we want
				// to splat them each accross their own vector
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );
				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );
				// scale by 32768 (madd with a zero addend)
				v0 = vec_madd( oggVec1, constVec, zeroVector );
				v1 = vec_madd( oggVec2, constVec, zeroVector );
				v2 = vec_madd( oggVec3, constVec, zeroVector );
				v3 = vec_madd( oggVec4, constVec, zeroVector );
				v4 = vec_madd( oggVec5, constVec, zeroVector );
				v5 = vec_madd( oggVec6, constVec, zeroVector );
				v6 = vec_madd( oggVec7, constVec, zeroVector );
				v7 = vec_madd( oggVec8, constVec, zeroVector );
				//store results
				ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
			}
			//cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
			}
		} else {
			// calculate perm vec for ogg
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
				// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
				v8 = v9;
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				// now we have the elements in a vector, we want
				// to splat them each accross their own vector
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );
				// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
				v6 = v7;
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );
				// now we have the elements in a vector, we want
				// to splat them each accross their own vector
				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );
				oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
				oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
				oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
				oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
				oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
				oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
				oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
				oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
				//merge generates the interleaved pattern that we want and it
				//doesn't require a permute vector, so use that instead
				v0 = vec_mergeh( oggVec1, oggVec5 );
				v1 = vec_mergel( oggVec1, oggVec5 );
				v2 = vec_mergeh( oggVec2, oggVec6 );
				v3 = vec_mergel( oggVec2, oggVec6 );
				v4 = vec_mergeh( oggVec3, oggVec7 );
				v5 = vec_mergel( oggVec3, oggVec7 );
				v6 = vec_mergeh( oggVec4, oggVec8 );
				v10 = vec_mergel( oggVec4, oggVec8 );
				//store results
				ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
			}
			//cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
				dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load values from ogg
				v8 = v10;
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				// permute into results vectors to store (each sample duplicated once)
				v5 = vec_perm( v0, v0, vecOneTwo );
				v6 = vec_perm( v0, v0, vecThreeFour);
				v7 = vec_perm( v1, v1, vecOneTwo );
				v8 = vec_perm( v1, v1, vecThreeFour );
				//store results
				ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
			}
		} else {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				// load ogg[0][i] to ogg[0][i+4]
				v8 = v9;
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				// load ogg[1][i] to ogg[1][i+3]
				v6 = v7;
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				// generate result vectors to store (L,R interleave, each pair twice)
				v2 = vec_perm( v0, v1, vecFirst );
				v3 = vec_perm( v0, v1, vecSecond );
				v4 = vec_perm( v0, v1, vecThird );
				v5 = vec_perm( v0, v1, vecFourth );
				// store results
				ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
				dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 44100 ) {
		if ( numChannels == 1 ) {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load values from ogg
				v8 = v9;
				v7 = vec_ld( 15, &ogg[0][i] );
				v6 = v7;
				v9 = vec_ld( 31, &ogg[0][i] );
				v0 = vec_perm( v8, v7, vecPerm1 );
				v1 = vec_perm( v6, v9, vecPerm1 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				ALIGNED_STORE2( &dest[i], v0, v1 );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*1+0] = ogg[0][i] * 32768.0f;
			}
		} else {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				v8 = v9;
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				// load ogg[1][i] to ogg[1][i+3]
				v6 = v7;
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				// generate result vectors (merge gives the L,R interleave directly)
				v2 = vec_mergeh( v0, v1 );
				v3 = vec_mergel( v0, v1 );
				// store results
				ALIGNED_STORE2( &dest[i*2], v2, v3 );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*2+0] = ogg[0][i] * 32768.0f;
				dest[i*2+1] = ogg[1][i] * 32768.0f;
			}
		}
	} else {
		assert( 0 );
	}
}
#else
/*
============
idSIMD_AltiVec::UpSampleOGGTo44kHz

	Duplicate decoded OGG float samples for 44kHz output, scaling each sample
	by 32768 (float [-1,1] range up to 16-bit PCM amplitude).  11025 Hz input
	is written 4x per sample, 22050 Hz 2x, 44100 Hz 1x; stereo output is
	interleaved L,R from ogg[0] / ogg[1].

	Assumptions:
		No assumptions -- dest may be unaligned.  Unaligned destination writes
		use a vec_lvsr permute plus a vec_sel edge mask so that bytes outside
		the target range keep their original dest values.

	NOTE(review): for stereo, the loops process numSamples >> 1 entries per
	channel -- numSamples appears to count total samples across both
	channels; confirm against callers.
============
*/
void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
	register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
	register vector float constVec, zeroVector;
	register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
	vector unsigned char vecPerm1;
	vector unsigned char vecPerm2;
	// duplicate each float pair within a vector (mono 2x fan-out)
	vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
	vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
	// interleave one left + one right sample, each written twice (stereo 2x fan-out)
	vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
	vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
	vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
	vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
	vector unsigned char storePerm;
	constVec = (vector float)(32768.0f);
	zeroVector = (vector float)(0.0);
	// calculate perm vector and masks for stores
	storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
	// original values of dest, preserved at the edges via the select mask
	vector float vecDest = vec_ld( 0, &dest[0] );
	vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
	if ( kHz == 11025 ) {
		if ( numChannels == 1 ) {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
				v8 = v10;
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );
				// now we have the elements in a vector, we want
				// to splat them each accross their own vector
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );
				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );
				// scale by 32768 (madd with a zero addend)
				v0 = vec_madd( oggVec1, constVec, zeroVector );
				v1 = vec_madd( oggVec2, constVec, zeroVector );
				v2 = vec_madd( oggVec3, constVec, zeroVector );
				v3 = vec_madd( oggVec4, constVec, zeroVector );
				v4 = vec_madd( oggVec5, constVec, zeroVector );
				v5 = vec_madd( oggVec6, constVec, zeroVector );
				v6 = vec_madd( oggVec7, constVec, zeroVector );
				v7 = vec_madd( oggVec8, constVec, zeroVector );
				// rotate input data
				v0 = vec_perm( v0, v0, storePerm );
				v1 = vec_perm( v1, v1, storePerm );
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );
				// store results: each store blends the tail of one vector with the head of the next
				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
				vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
				vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
				vecDest = vec_sel( v7, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*4] );
			}
			//cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
			}
		} else {
			// calculate perm vec for ogg
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
				// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
				v8 = v9;
				v9 = vec_ld( 15, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				// now we have the elements in a vector, we want
				// to splat them each accross their own vector
				oggVec1 = vec_splat( v0, 0 );
				oggVec2 = vec_splat( v0, 1 );
				oggVec3 = vec_splat( v0, 2 );
				oggVec4 = vec_splat( v0, 3 );
				// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
				v6 = v7;
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );
				// now we have the elements in a vector, we want
				// to splat them each accross their own vector
				oggVec5 = vec_splat( v1, 0 );
				oggVec6 = vec_splat( v1, 1 );
				oggVec7 = vec_splat( v1, 2 );
				oggVec8 = vec_splat( v1, 3 );
				oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
				oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
				oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
				oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
				oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
				oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
				oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
				oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
				//merge generates the interleaved pattern that we want and it
				//doesn't require a permute vector, so use that instead
				v0 = vec_mergeh( oggVec1, oggVec5 );
				v1 = vec_mergel( oggVec1, oggVec5 );
				v2 = vec_mergeh( oggVec2, oggVec6 );
				v3 = vec_mergel( oggVec2, oggVec6 );
				v4 = vec_mergeh( oggVec3, oggVec7 );
				v5 = vec_mergel( oggVec3, oggVec7 );
				v6 = vec_mergeh( oggVec4, oggVec8 );
				v10 = vec_mergel( oggVec4, oggVec8 );
				// rotate input data
				v0 = vec_perm( v0, v0, storePerm );
				v1 = vec_perm( v1, v1, storePerm );
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v10 = vec_perm( v10, v10, storePerm );
				// store results
				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
				vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
				vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
				vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
				vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
				vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
				vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
				vecDest = vec_sel( v10, vecDestEnd, mask );
				vec_st( vecDest, 127, &dest[i*8] );
			}
			//cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
				dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 22050 ) {
		if ( numChannels == 1 ) {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v10 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load values from ogg
				v8 = v10;
				v9 = vec_ld( 15, &ogg[0][i] );
				v10 = vec_ld( 31, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				v1 = vec_perm( v9, v10, vecPerm1 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				// permute into results vectors to store (each sample duplicated once)
				v5 = vec_perm( v0, v0, vecOneTwo );
				v6 = vec_perm( v0, v0, vecThreeFour);
				v7 = vec_perm( v1, v1, vecOneTwo );
				v8 = vec_perm( v1, v1, vecThreeFour );
				// rotate input data
				v5 = vec_perm( v5, v5, storePerm );
				v6 = vec_perm( v6, v6, storePerm );
				v7 = vec_perm( v7, v7, storePerm );
				v8 = vec_perm( v8, v8, storePerm );
				// store results
				vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
				vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
				vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
				vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
				vecDest = vec_sel( v8, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*2] );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
			}
		} else {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				// load ogg[0][i] to ogg[0][i+4]
				v8 = v9;
				v9 = vec_ld( 15, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				// load ogg[1][i] to ogg[1][i+3]
				v6 = v7;
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				// generate result vectors to store (L,R interleave, each pair twice)
				v2 = vec_perm( v0, v1, vecFirst );
				v3 = vec_perm( v0, v1, vecSecond );
				v4 = vec_perm( v0, v1, vecThird );
				v5 = vec_perm( v0, v1, vecFourth );
				// rotate input data
				v2 = vec_perm( v2, v2, storePerm );
				v3 = vec_perm( v3, v3, storePerm );
				v4 = vec_perm( v4, v4, storePerm );
				v5 = vec_perm( v5, v5, storePerm );
				// store results
				vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
				vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
				vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
				vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
				vecDest = vec_sel( v5, vecDestEnd, mask );
				vec_st( vecDest, 63, &dest[i*4] );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
				dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
			}
		}
	} else if ( kHz == 44100 ) {
		if ( numChannels == 1 ) {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+7 < numSamples; i += 8 ) {
				// load values from ogg
				v8 = v9;
				v7 = vec_ld( 15, &ogg[0][i] );
				v6 = v7;
				v9 = vec_ld( 31, &ogg[0][i] );
				vector float vecDestEnd = vec_ld( 31, &dest[i] );
				v0 = vec_perm( v8, v7, vecPerm1 );
				v1 = vec_perm( v6, v9, vecPerm1 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				// rotate data
				v0 = vec_perm( v0, v0, storePerm );
				v1 = vec_perm( v1, v1, storePerm );
				// store results
				vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
				vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
				vecDest = vec_sel( v1, vecDestEnd, mask );
				vec_st( vecDest, 31, &dest[i] );
			}
			// cleanup
			for ( ; i < numSamples; i++ ) {
				dest[i*1+0] = ogg[0][i] * 32768.0f;
			}
		} else {
			// calculate perm vector and do first load
			vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
			vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
			v7 = vec_ld( 0, &ogg[1][0] );
			v9 = vec_ld( 0, &ogg[0][0] );
			int i;
			for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
				v8 = v9;
				v9 = vec_ld( 15, &ogg[0][i] );
				v0 = vec_perm( v8, v9, vecPerm1 );
				// load ogg[1][i] to ogg[1][i+3]
				v6 = v7;
				v7 = vec_ld( 15, &ogg[1][i] );
				v1 = vec_perm( v6, v7, vecPerm2 );
				// multiply
				v0 = vec_madd( v0, constVec, zeroVector );
				v1 = vec_madd( v1, constVec, zeroVector );
				// generate result vectors (merge gives the L,R interleave directly)
				v2 = vec_mergeh( v0, v1 );
				v3 = vec_mergel( v0, v1 );
				// store results (this path uses the unaligned-store macro instead of the
				// vec_sel pipeline used above)
				UNALIGNED_STORE2( &dest[i*2], v2, v3 );
			}
			// cleanup
			for ( ; i < numSamples >> 1; i++ ) {
				dest[i*2+0] = ogg[0][i] * 32768.0f;
				dest[i*2+1] = ogg[1][i] * 32768.0f;
			}
		}
	} else {
		assert( 0 );
	}
}
#endif /* SOUND_DEST_ALIGNED */
#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::MixSoundTwoSpeakerMono

	Mix a mono sample stream into an interleaved two-speaker mix buffer,
	ramping each speaker's volume linearly from lastV[ch] to currentV[ch]
	over MIXBUFFER_SAMPLES frames:
		mixBuffer[i*2+ch] += samples[i] * (lastV[ch] + i * incPerFrame[ch])

	Assumptions:
		Assumes that mixBuffer starts at aligned address.
		Asserts numSamples == MIXBUFFER_SAMPLES, and relies on
		MIXBUFFER_SAMPLES being a multiple of 8 (no cleanup loop).
		samples may be unaligned (pipelined vec_lvsl/vec_perm loads).
============
*/
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
	// mixBuffer is aligned
	assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
	int i;
	float inc[2];
	float spkr[4];
	register vector float vecInc;
	register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
	register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
	register vector float vecSamplesLd1, vecSamplesLd2;
	register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
	// each perm duplicates two mono samples into L/R pairs: s,s,s',s'
	register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
	register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
	register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
	register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
	//constants
	vector float fourVec = (vector float)(4.0);
	vector float zeroVec = (vector float)(0.0);
	// per-frame volume increment for each speaker
	inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
	inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
	// volumes for the first two frames (a vector covers two L/R frames)
	spkr[0] = lastV[0];
	spkr[1] = lastV[1];
	spkr[2] = lastV[0] + inc[0];
	spkr[3] = lastV[1] + inc[1];
	assert( numSamples == MIXBUFFER_SAMPLES );
	// doubled because each vector advances two frames at once
	inc[0] *= 2;
	inc[1] *= 2;
	//load data into registers
	vector float v0 = loadSplatUnalignedScalar( &inc[0] );
	vector float v1 = loadSplatUnalignedScalar( &inc[1] );
	vecInc = vec_mergeh( v0, v1 );
	vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
	vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
	vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
	vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
	// load spkr array: vecSpeaker1 = ( spkr[0], spkr[1], spkr[2], spkr[3] )
	v0 = vec_mergeh( v2, v4 );
	v1 = vec_mergeh( v3, v5 );
	vecSpeaker1 = vec_mergeh( v0, v1 );
	vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
	vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
	vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
	// scale the step by 4 since four speaker vectors (8 frames) advance per iteration
	vecInc = vec_madd( vecInc, fourVec, zeroVec );
	vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
	vector float vecSamplesLast = vec_ld( 0, &samples[0] );
	//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
	//need a cleanup loop
	for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
		//load samples and mix buffers (pipelined unaligned load of 8 mono samples)
		vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
		vecSamplesLd2 = vec_ld( 15, &samples[i] );
		vecSamplesLast = vec_ld( 31, &samples[i] );
		vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
		vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
		vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
		vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
		vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
		vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
		// fan each mono sample out to an L/R pair
		vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
		vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
		vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
		vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
		// mixBuffer += sample * speakerVolume
		vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
		vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
		vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
		vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
		// store results
		ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
		//add for next iteration
		vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
		vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
		vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
		vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
	}
}
#else
/*
============
idSIMD_AltiVec::MixSoundTwoSpeakerMono
Assumptions:
No assumptions
============
*/
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
// Mixes a mono sample stream into an interleaved 2-channel (L,R) mix buffer,
// ramping each channel's gain linearly from lastV[] to currentV[] across the
// buffer. This variant assumes nothing about the alignment of mixBuffer or
// samples: both streams go through the vec_lvsl/vec_perm realignment idiom
// and results are written back with UNALIGNED_STORE4.
int i;
float inc[2];
float spkr[4];
register vector float vecInc;
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
register vector float vecSamplesLd1, vecSamplesLd2;
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
// permute masks that duplicate each mono sample into both channels of a frame
register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
//constants
vector float fourVec = (vector float)(4.0);
vector float zeroVec = (vector float)(0.0);
// per-frame gain step for each channel
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
// gains for the first two frames, interleaved as L0,R0,L1,R1
spkr[0] = lastV[0];
spkr[1] = lastV[1];
spkr[2] = lastV[0] + inc[0];
spkr[3] = lastV[1] + inc[1];
assert( numSamples == MIXBUFFER_SAMPLES );
// one 4-float vector covers two frames, so the gain vectors step two frames at a time
inc[0] *= 2;
inc[1] *= 2;
//load data into registers
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
vecInc = vec_mergeh( v0, v1 );
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
// interleave the spkr array into one L,R,L,R gain vector
v0 = vec_mergeh( v2, v4 );
v1 = vec_mergeh( v3, v5 );
vecSpeaker1 = vec_mergeh( v0, v1 );
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
// four gain vectors (8 frames) are consumed per loop pass, so scale the step by 4
vecInc = vec_madd( vecInc, fourVec, zeroVec );
// realignment masks for the potentially misaligned streams
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
//need a cleanup loop
for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
//load samples and mix buffers
vecSamplesLd1 = vecSamplesLast; // reuse the trailing aligned load from the previous pass
vecSamplesLd2 = vec_ld( 15, &samples[i] );
vecSamplesLast = vec_ld( 31, &samples[i] );
// realign the two source vectors to the sample stream's actual offset
vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
// NOTE(review): vecDest is loaded once before the loop and never refreshed,
// so for i >= 8 this appears to feed stale data into vecMixBuffer1 when
// mixBuffer is misaligned — verify against UNALIGNED_STORE4's semantics.
vecMixBuffer1 = vecDest;
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
// expand 8 mono samples into 16 interleaved (L,R) values
vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
// accumulate sample * gain into the mix buffer via fused multiply-add
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
// store results
UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
//advance the interpolated gains for the next 8 frames
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
}
}
#endif /* SOUND_DEST_ALIGNED */
#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::MixSoundTwoSpeakerStereo
Assumptions:
Assumes that mixBuffer starts at aligned address
============
*/
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
// Mixes an interleaved stereo sample stream (accessed at samples[i*2]) into an
// interleaved 2-channel mix buffer, ramping each channel's gain linearly from
// lastV[] to currentV[]. mixBuffer must be 16-byte aligned; samples may be
// misaligned and is realigned with the vec_lvsl/vec_perm idiom.
// mixBuffer is aligned
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
int i, k;
float inc[2];
float spkr[4];
// destination accumulators
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
// sample and gain vectors
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
register vector float vecInc;
vector float fourVec = (vector float)(4.0);
vector float zeroVec = (vector float)(0.0);
assert( numSamples == MIXBUFFER_SAMPLES );
// per-frame gain step for each channel
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
// gains for the first two frames, interleaved as L0,R0,L1,R1
spkr[0] = lastV[0];
spkr[1] = lastV[1];
spkr[2] = lastV[0] + inc[0];
spkr[3] = lastV[1] + inc[1];
// one 4-float vector covers two frames, so the gain vectors step two frames at a time
for ( k = 0; k < 2; k++ ) {
inc[k] *= 2;
}
// load data in vectors
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
vecInc = vec_mergeh( v0, v1 );
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
// interleave the spkr array into one L,R,L,R gain vector
v0 = vec_mergeh( v2, v4 );
v1 = vec_mergeh( v3, v5 );
vecSpeaker1 = vec_mergeh( v0, v1 );
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
// four gain vectors (8 frames) are consumed per loop pass, so scale the step by 4
vecInc = vec_madd( vecInc, fourVec, zeroVec );
// realignment mask for the potentially misaligned sample stream
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
//need a cleanup loop
for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
// load mix buffers (aligned) and samples (realigned)
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
vecSamples1 = vecSamplesLast; // reuse the trailing aligned load from the previous pass
vecSamples2 = vec_ld( 15, &samples[i*2] );
vecSamples3 = vec_ld( 31, &samples[i*2] );
vecSamples4 = vec_ld( 47, &samples[i*2] );
vecSamplesLast = vec_ld( 63, &samples[i*2] );
vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
// accumulate sample * gain into the mix buffer via fused multiply-add
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
// advance the interpolated gains for the next 8 frames
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
//store results
ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
}
}
#else
/*
============
idSIMD_AltiVec::MixSoundTwoSpeakerStereo
Assumptions:
No assumptions
============
*/
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
// Mixes an interleaved stereo sample stream (accessed at samples[i*2]) into an
// interleaved 2-channel mix buffer, ramping each channel's gain linearly from
// lastV[] to currentV[]. This variant assumes nothing about the alignment of
// mixBuffer or samples; both are realigned with the vec_lvsl/vec_perm idiom
// and results are written back with UNALIGNED_STORE4.
int i, k;
float inc[2];
float spkr[4];
// destination accumulators
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
// sample and gain vectors
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
register vector float vecInc;
vector float fourVec = (vector float)(4.0);
vector float zeroVec = (vector float)(0.0);
assert( numSamples == MIXBUFFER_SAMPLES );
// per-frame gain step for each channel
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
// gains for the first two frames, interleaved as L0,R0,L1,R1
spkr[0] = lastV[0];
spkr[1] = lastV[1];
spkr[2] = lastV[0] + inc[0];
spkr[3] = lastV[1] + inc[1];
// one 4-float vector covers two frames, so the gain vectors step two frames at a time
for ( k = 0; k < 2; k++ ) {
inc[k] *= 2;
}
// load data in vectors
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
vecInc = vec_mergeh( v0, v1 );
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
// interleave the spkr array into one L,R,L,R gain vector
v0 = vec_mergeh( v2, v4 );
v1 = vec_mergeh( v3, v5 );
vecSpeaker1 = vec_mergeh( v0, v1 );
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
// four gain vectors (8 frames) are consumed per loop pass, so scale the step by 4
vecInc = vec_madd( vecInc, fourVec, zeroVec );
// realignment masks for the potentially misaligned streams
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
//need a cleanup loop
for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
// NOTE(review): vecDest is loaded once before the loop and never refreshed,
// so for i >= 8 this appears to feed stale data into vecMixBuffer1 when
// mixBuffer is misaligned — verify against UNALIGNED_STORE4's semantics.
vecMixBuffer1 = vecDest;
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
vecSamples1 = vecSamplesLast; // reuse the trailing aligned load from the previous pass
vecSamples2 = vec_ld( 15, &samples[i*2] );
vecSamples3 = vec_ld( 31, &samples[i*2] );
vecSamples4 = vec_ld( 47, &samples[i*2] );
vecSamplesLast = vec_ld( 63, &samples[i*2] );
vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
// accumulate sample * gain into the mix buffer via fused multiply-add
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
// advance the interpolated gains for the next 8 frames
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
// store results
UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
}
}
#endif /* SOUND_DEST_ALIGNED */
#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::MixSoundSixSpeakerMono
Assumptions:
Assumes that mixBuffer starts at aligned address
============
*/
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
// Mixes a mono sample stream into an interleaved 6-channel mix buffer
// (6 floats per frame), ramping each channel's gain linearly from lastV[]
// to currentV[]. Each mono sample is replicated into all 6 channels of its
// frame via splats/permutes. mixBuffer must be 16-byte aligned; samples and
// the local scratch arrays are realigned with the vec_lvsl/vec_perm idiom.
// mixBuffer is aligned
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
float incL[24];	// per-channel gain steps, the 6 values repeated 4x (one frame per repeat)
float sL[24];	// interpolated gains for the first 4 frames, 6 channels each
int i, k;
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
vector float vecSamplesLd;
vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
// permute vectors for the sample lanes that straddle two frames (0,0,1,1 and 2,2,3,3)
vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
// incL array, 6 elements repeated
incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
// sL holds the gains of frames 0..3: frame n = lastV + n * incL
for ( k = 0; k < 6; k++ ) {
sL[k] = lastV[k];
}
for ( k = 6; k < 12; k++ ) {
sL[k] = lastV[k-6] + incL[k];
}
for ( k = 12; k < 18; k++ ) {
sL[k] = lastV[k-12] + incL[k] + incL[k];
}
for ( k = 18; k < 24; k++ ) {
sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
}
// multiply by 4 since the loop advances 4 frames per iteration
for ( k = 0; k < 24; k++ ) {
incL[k] *= 4;
}
// load the scratch arrays, realigning since they are not guaranteed 16-byte aligned
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
vecIncl1 = vec_ld( 0, &incL[0] );
vecIncl2 = vec_ld( 15, &incL[0] );
vecIncl3 = vec_ld( 31, &incL[0] );
vecIncl4 = vec_ld( 47, &incL[0] );
vecIncl5 = vec_ld( 63, &incL[0] );
vecIncl6 = vec_ld( 79, &incL[0] );
vecIncl7 = vec_ld( 95, &incL[0] );
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
vecSL1 = vec_ld( 0, &sL[0] );
vecSL2 = vec_ld( 15, &sL[0] );
vecSL3 = vec_ld( 31, &sL[0] );
vecSL4 = vec_ld( 47, &sL[0] );
vecSL5 = vec_ld( 63, &sL[0] );
vecSL6 = vec_ld( 79, &sL[0] );
vecSL7 = vec_ld( 95, &sL[0] );
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
// realignment mask for the potentially misaligned sample stream
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
//need a cleanup loop
for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
//load mix buffer into vectors, assume aligned (24 floats = 4 frames x 6 channels)
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
//load 4 mono samples into a realigned vector
vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
vecSamplesLast = vecSamplesLd2;
//replicate each mono sample across the 6 channel lanes of its frame
vecSamples1 = vec_splat( vecSamplesLd, 0 );
vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
vecSamples3 = vec_splat( vecSamplesLd, 1 );
vecSamples4 = vec_splat( vecSamplesLd, 2 );
vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
vecSamples6 = vec_splat( vecSamplesLd, 3 );
//accumulate sample * gain into the mix buffer via fused multiply-add
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
//store out results
ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
// advance the interpolated gains for the next 4 frames
vecSL1 = vec_add( vecSL1, vecIncl1 );
vecSL2 = vec_add( vecSL2, vecIncl2 );
vecSL3 = vec_add( vecSL3, vecIncl3 );
vecSL4 = vec_add( vecSL4, vecIncl4 );
vecSL5 = vec_add( vecSL5, vecIncl5 );
vecSL6 = vec_add( vecSL6, vecIncl6 );
}
}
#else
/*
============
idSIMD_AltiVec::MixSoundSixSpeakerMono
Assumptions:
No assumptions
============
*/
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
// Mixes a mono sample stream into an interleaved 6-channel mix buffer
// (6 floats per frame), ramping each channel's gain linearly from lastV[]
// to currentV[]. Each mono sample is replicated into all 6 channels of its
// frame via splats/permutes. This variant assumes nothing about the alignment
// of mixBuffer or samples; both are realigned with the vec_lvsl/vec_perm
// idiom and results are written back with UNALIGNED_STORE6.
float incL[24];	// per-channel gain steps, the 6 values repeated 4x (one frame per repeat)
float sL[24];	// interpolated gains for the first 4 frames, 6 channels each
int i, k;
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
vector float vecSamplesLd;
vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
// permute vectors for the sample lanes that straddle two frames (0,0,1,1 and 2,2,3,3)
register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
// incL array, 6 elements repeated
incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
// sL holds the gains of frames 0..3: frame n = lastV + n * incL
for ( k = 0; k < 6; k++ ) {
sL[k] = lastV[k];
}
for ( k = 6; k < 12; k++ ) {
sL[k] = lastV[k-6] + incL[k];
}
for ( k = 12; k < 18; k++ ) {
sL[k] = lastV[k-12] + incL[k] + incL[k];
}
for ( k = 18; k < 24; k++ ) {
sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
}
// multiply by 4 since the loop advances 4 frames per iteration
for ( k = 0; k < 24; k++ ) {
incL[k] *= 4;
}
// load the scratch arrays, realigning since they are not guaranteed 16-byte aligned
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
vecIncl1 = vec_ld( 0, &incL[0] );
vecIncl2 = vec_ld( 15, &incL[0] );
vecIncl3 = vec_ld( 31, &incL[0] );
vecIncl4 = vec_ld( 47, &incL[0] );
vecIncl5 = vec_ld( 63, &incL[0] );
vecIncl6 = vec_ld( 79, &incL[0] );
vecIncl7 = vec_ld( 95, &incL[0] );
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
vecSL1 = vec_ld( 0, &sL[0] );
vecSL2 = vec_ld( 15, &sL[0] );
vecSL3 = vec_ld( 31, &sL[0] );
vecSL4 = vec_ld( 47, &sL[0] );
vecSL5 = vec_ld( 63, &sL[0] );
vecSL6 = vec_ld( 79, &sL[0] );
vecSL7 = vec_ld( 95, &sL[0] );
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
// realignment masks for the potentially misaligned streams
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
//need a cleanup loop
for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
// NOTE(review): vecDest is loaded once before the loop and never refreshed,
// so for i >= 4 this appears to feed stale data into vecMixBuffer1 when
// mixBuffer is misaligned — verify against UNALIGNED_STORE6's semantics.
vecMixBuffer1 = vecDest;
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );
//load 4 mono samples into a realigned vector
vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
vecSamplesLast = vecSamplesLd2;
//replicate each mono sample across the 6 channel lanes of its frame
vecSamples1 = vec_splat( vecSamplesLd, 0 );
vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
vecSamples3 = vec_splat( vecSamplesLd, 1 );
vecSamples4 = vec_splat( vecSamplesLd, 2 );
vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
vecSamples6 = vec_splat( vecSamplesLd, 3 );
//accumulate sample * gain into the mix buffer via fused multiply-add
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
// store results
UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
// advance the interpolated gains for the next 4 frames
vecSL1 = vec_add( vecSL1, vecIncl1 );
vecSL2 = vec_add( vecSL2, vecIncl2 );
vecSL3 = vec_add( vecSL3, vecIncl3 );
vecSL4 = vec_add( vecSL4, vecIncl4 );
vecSL5 = vec_add( vecSL5, vecIncl5 );
vecSL6 = vec_add( vecSL6, vecIncl6 );
}
}
#endif /* SOUND_DEST_ALIGNED */
#ifdef SOUND_DEST_ALIGNED
/*
============
idSIMD_AltiVec::MixSoundSixSpeakerStereo
Assumptions:
Assumes that mixBuffer starts at aligned address
============
*/
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
// Mixes an interleaved stereo sample stream (accessed at samples[i*2]) into an
// interleaved 6-channel mix buffer, ramping each channel's gain linearly from
// lastV[] to currentV[]. Per the permutes below, the right-channel sample
// feeds the SPEAKER_RIGHT and SPEAKER_BACKRIGHT lanes and every other channel
// receives the left-channel sample. mixBuffer must be 16-byte aligned.
// mixBuffer is aligned
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
float incL[12];	// per-channel gain steps, the 6 values repeated twice (one frame per repeat)
float sL[12];	// interpolated gains for the first 2 frames, 6 channels each
int i;
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
vector float vecSL1, vecSL2, vecSL3, vecSL4;
vector float vecSamplesLd;
vector float vecSamples1, vecSamples2, vecSamples3;
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
// permute vectors selecting L/R samples per channel lane (L0,R0,L0,L0 and L1,L1,L1,R1)
vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
// incL array, 6 elements repeated
incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
// sL holds the gains of frames 0 and 1
sL[0] = lastV[0];
sL[1] = lastV[1];
sL[2] = lastV[2];
sL[3] = lastV[3];
sL[4] = lastV[4];
sL[5] = lastV[5];
sL[6] = lastV[0] + incL[0];
sL[7] = lastV[1] + incL[1];
sL[8] = lastV[2] + incL[2];
sL[9] = lastV[3] + incL[3];
sL[10] = lastV[4] + incL[4];
sL[11] = lastV[5] + incL[5];
// multiply by 2 since the loop advances 2 frames (12 floats) per iteration
incL[0] *= 2;
incL[1] *= 2;
incL[2] *= 2;
incL[3] *= 2;
incL[4] *= 2;
incL[5] *= 2;
incL[6] *= 2;
incL[7] *= 2;
incL[8] *= 2;
incL[9] *= 2;
incL[10] *= 2;
incL[11] *= 2;
// load the scratch arrays, realigning since they are not guaranteed 16-byte aligned
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
vecIncl1 = vec_ld( 0, &incL[0] );
vecIncl2 = vec_ld( 15, &incL[0] );
vecIncl3 = vec_ld( 31, &incL[0] );
vecIncl4 = vec_ld( 47, &incL[0] );
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
vecSL1 = vec_ld( 0, &sL[0] );
vecSL2 = vec_ld( 15, &sL[0] );
vecSL3 = vec_ld( 31, &sL[0] );
vecSL4 = vec_ld( 47, &sL[0] );
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
// realignment mask for the potentially misaligned sample stream
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
//load mix buffer into vectors, assume aligned (12 floats = 2 frames x 6 channels)
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
//load 2 stereo frames (L0,R0,L1,R1) into a realigned vector
vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
vecSamplesLast = vecSamplesLd2;
//permute to get them ordered how we want. For the 2nd vector,
//the order happens to be the same as the order we loaded them
//in, so there's no need to permute that one
vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
vecSamples2 = vecSamplesLd;
vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
//accumulate sample * gain into the mix buffer via fused multiply-add
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
//store out results
ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
// advance the interpolated gains for the next 2 frames
vecSL1 = vec_add( vecSL1, vecIncl1 );
vecSL2 = vec_add( vecSL2, vecIncl2 );
vecSL3 = vec_add( vecSL3, vecIncl3 );
}
}
#else
/*
============
idSIMD_AltiVec::MixSoundSixSpeakerStereo
Assumptions:
No assumptions
============
*/
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
// Mixes an interleaved stereo sample stream (accessed at samples[i*2]) into an
// interleaved 6-channel mix buffer, ramping each channel's gain linearly from
// lastV[] to currentV[]. Per the permutes below, the right-channel sample
// feeds the SPEAKER_RIGHT and SPEAKER_BACKRIGHT lanes and every other channel
// receives the left-channel sample. This variant assumes nothing about the
// alignment of mixBuffer or samples; both are realigned with the
// vec_lvsl/vec_perm idiom and results are written back with UNALIGNED_STORE3.
float incL[12];	// per-channel gain steps, the 6 values repeated twice (one frame per repeat)
float sL[12];	// interpolated gains for the first 2 frames, 6 channels each
int i;
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
vector float vecSL1, vecSL2, vecSL3, vecSL4;
vector float vecSamplesLd;
vector float vecSamples1, vecSamples2, vecSamples3;
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
// permute vectors selecting L/R samples per channel lane (L0,R0,L0,L0 and L1,L1,L1,R1)
vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
assert( numSamples == MIXBUFFER_SAMPLES );
assert( SPEAKER_RIGHT == 1 );
assert( SPEAKER_BACKRIGHT == 5 );
// incL array, 6 elements repeated
incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
// sL holds the gains of frames 0 and 1
sL[0] = lastV[0];
sL[1] = lastV[1];
sL[2] = lastV[2];
sL[3] = lastV[3];
sL[4] = lastV[4];
sL[5] = lastV[5];
sL[6] = lastV[0] + incL[0];
sL[7] = lastV[1] + incL[1];
sL[8] = lastV[2] + incL[2];
sL[9] = lastV[3] + incL[3];
sL[10] = lastV[4] + incL[4];
sL[11] = lastV[5] + incL[5];
// multiply by 2 since the loop advances 2 frames (12 floats) per iteration
incL[0] *= 2;
incL[1] *= 2;
incL[2] *= 2;
incL[3] *= 2;
incL[4] *= 2;
incL[5] *= 2;
incL[6] *= 2;
incL[7] *= 2;
incL[8] *= 2;
incL[9] *= 2;
incL[10] *= 2;
incL[11] *= 2;
// load the scratch arrays, realigning since they are not guaranteed 16-byte aligned
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
vecIncl1 = vec_ld( 0, &incL[0] );
vecIncl2 = vec_ld( 15, &incL[0] );
vecIncl3 = vec_ld( 31, &incL[0] );
vecIncl4 = vec_ld( 47, &incL[0] );
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
vecSL1 = vec_ld( 0, &sL[0] );
vecSL2 = vec_ld( 15, &sL[0] );
vecSL3 = vec_ld( 31, &sL[0] );
vecSL4 = vec_ld( 47, &sL[0] );
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
// realignment masks for the potentially misaligned streams
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
// NOTE(review): vecDest is loaded once before the loop and never refreshed,
// so for i >= 2 this appears to feed stale data into vecMixBuffer1 when
// mixBuffer is misaligned — verify against UNALIGNED_STORE3's semantics.
vecMixBuffer1 = vecDest;
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );
//load 2 stereo frames (L0,R0,L1,R1) into a realigned vector
vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
vecSamplesLast = vecSamplesLd2;
//permute to get them ordered how we want. For the 2nd vector,
//the order happens to be the same as the order we loaded them
//in, so there's no need to permute that one
vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
vecSamples2 = vecSamplesLd;
vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
//accumulate sample * gain into the mix buffer via fused multiply-add
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
// store results
UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
// advance the interpolated gains for the next 2 frames
vecSL1 = vec_add( vecSL1, vecIncl1 );
vecSL2 = vec_add( vecSL2, vecIncl2 );
vecSL3 = vec_add( vecSL3, vecIncl3 );
}
}
#endif
/*
============
idSIMD_AltiVec::MixedSoundToSamples
============
*/
void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
	// Converts a float mix buffer to signed 16-bit samples, clamping every
	// value to the legal short range [-32768, 32767] (truncating toward zero,
	// matching the scalar (short) cast).
	//
	//	samples		destination sample buffer (written)
	//	mixBuffer	source float mix buffer (read only)
	//	numSamples	number of samples to convert
	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector signed int vi0, vi1, vi2, vi3;
	register vector signed short vs0, vs1;
	register vector float minVec, maxVec;
	int i = 0;

	// scalar prologue until the destination pointer is 16-byte aligned,
	// since ALIGNED_STORE2 below requires an aligned store address.
	// bounds-check i before forming &samples[i]
	for ( ; ( i < numSamples ) && NOT_16BYTE_ALIGNED( samples[i] ); i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}

	// splat the clamp bounds into vectors
	minVec = (vector float)(-32768.0f);
	maxVec = (vector float)(32767.0f);

	// mixBuffer may be misaligned, so use the standard lvsl-based unaligned
	// load idiom: keep the trailing aligned block in vecOld across
	// iterations and vec_perm adjacent blocks together
	vector float vecOld = vec_ld( 0, &mixBuffer[i] );
	vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );

	//vectorize! 16 samples (64 bytes of floats) per iteration
	for ( ; i+15 < numSamples; i += 16 ) {
		// load source. 64 unaligned bytes span five aligned 16-byte blocks,
		// so load at offsets 0 (carried over from the previous iteration),
		// 15, 31, 47 and 63, then merge neighboring blocks with vec_perm
		v0 = vecOld;
		v1 = vec_ld( 15, &mixBuffer[i] );
		v2 = vec_ld( 31, &mixBuffer[i] );
		v3 = vec_ld( 47, &mixBuffer[i] );
		vecOld = vec_ld( 63, &mixBuffer[i] );
		v0 = vec_perm( v0, v1, permVec );
		v1 = vec_perm( v1, v2, permVec );
		v2 = vec_perm( v2, v3, permVec );
		v3 = vec_perm( v3, vecOld, permVec );
		//apply minimum
		v4 = vec_max( v0, minVec );
		v5 = vec_max( v1, minVec );
		v6 = vec_max( v2, minVec );
		v7 = vec_max( v3, minVec );
		//apply maximum
		v4 = vec_min( v4, maxVec );
		v5 = vec_min( v5, maxVec );
		v6 = vec_min( v6, maxVec );
		v7 = vec_min( v7, maxVec );
		// convert floats to ints (vec_cts truncates, like the scalar cast)
		vi0 = vec_cts( v4, 0 );
		vi1 = vec_cts( v5, 0 );
		vi2 = vec_cts( v6, 0 );
		vi3 = vec_cts( v7, 0 );
		// pack ints into shorts; values are already clamped so no
		// saturation occurs
		vs0 = vec_pack( vi0, vi1 );
		vs1 = vec_pack( vi2, vi3 );
		ALIGNED_STORE2( &samples[i], vs0, vs1 );
	}

	//handle cleanup of the < 16 trailing samples
	for ( ; i < numSamples ; i++ ) {
		samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
	}
}
#endif /* ENABLE_SOUND_ROUTINES */
#endif /* MACOS_X */