mirror of
https://github.com/dhewm/dhewm3-sdk.git
synced 2024-11-27 23:02:32 +00:00
afebd7e1e5
Don't include the lazy precompiled.h everywhere, only what's required for the compilation unit. platform.h needs to be included instead to provide all essential defines and types. All includes use the relative path to the neo or the game specific root. Move all idlib related includes from idlib/Lib.h to precompiled.h. precompiled.h still exists for the MFC stuff in tools/. Add some missing header guards.
11238 lines
364 KiB
C++
11238 lines
364 KiB
C++
/*
|
|
===========================================================================
|
|
|
|
Doom 3 GPL Source Code
|
|
Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
|
|
|
|
This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
|
|
|
|
Doom 3 Source Code is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
Doom 3 Source Code is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
|
|
|
|
If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
|
|
|
|
===========================================================================
|
|
*/
|
|
|
|
#include <math.h>
|
|
#include <float.h>
|
|
|
|
#include "sys/platform.h"
|
|
|
|
#include "idlib/math/Simd_AltiVec.h"
|
|
|
|
// Doom3 SIMD Library version 0.5
|
|
// Patrick Flanagan (pflanagan@apple.com)
|
|
// Sanjay Patel (spatel@apple.com)
|
|
// Architecture & Performance Group, Apple Computer
|
|
|
|
|
|
//===============================================================
|
|
//
|
|
// AltiVec implementation of idSIMDProcessor
|
|
//
|
|
//===============================================================
|
|
|
|
#if defined(__GNUC__) && defined(__ALTIVEC__)
|
|
|
|
#ifdef PPC_INTRINSICS
|
|
// for square root estimate instruction
|
|
#include <ppc_intrinsics.h>
|
|
#endif
|
|
|
|
// Data struct sizes
|
|
|
|
#ifdef DRAWVERT_PADDED
	// padded layout: 64 bytes, i.e. 16 floats
	#define DRAWVERT_OFFSET 16
#else
	// unpadded layout: 60 bytes, i.e. 15 floats of 4 bytes each
	#define DRAWVERT_OFFSET 15
#endif

// a plane occupies 16 bytes, i.e. 4 floats
#define PLANE_OFFSET 4

// an idVec4 occupies 16 bytes, i.e. 4 floats
#define IDVEC4_OFFSET 4
|
|
|
|
// Alignment tests
|
|
#define IS_16BYTE_ALIGNED( x ) ( ( (unsigned int)&x & 0x0F ) == 0 )
|
|
#define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned int)&x & 0x0F) != 0 )
|
|
|
|
// Aligned storing floats
//
// Stores V0 and V1 with vec_st at byte offsets 0 and 16 from ADDR.
// ADDR is expanded once per store, so it must be free of side effects.
// The body is wrapped in do { } while ( 0 ) so the macro behaves as a single
// statement, e.g. under an unbraced if / for.
#define ALIGNED_STORE2( ADDR, V0, V1 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
	} while ( 0 )
|
|
|
|
// Stores V0..V2 with vec_st at byte offsets 0, 16 and 32 from ADDR.
// ADDR is expanded once per store, so it must be free of side effects.
// Wrapped in do { } while ( 0 ) so the macro behaves as a single statement.
#define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
	} while ( 0 )
|
|
|
|
// Stores V0..V3 with vec_st at byte offsets 0, 16, 32 and 48 from ADDR.
// ADDR is expanded once per store, so it must be free of side effects.
// Wrapped in do { } while ( 0 ) so the macro behaves as a single statement.
#define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
		vec_st( V3, 48, ADDR ); \
	} while ( 0 )
|
|
|
|
// Stores V0..V5 with vec_st at byte offsets 0, 16, ... 80 from ADDR.
// ADDR is expanded once per store, so it must be free of side effects.
// Wrapped in do { } while ( 0 ) so the macro behaves as a single statement.
#define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
		vec_st( V3, 48, ADDR ); \
		vec_st( V4, 64, ADDR ); \
		vec_st( V5, 80, ADDR ); \
	} while ( 0 )
|
|
|
|
// Stores V0..V7 with vec_st at byte offsets 0, 16, ... 112 from ADDR.
// ADDR is expanded once per store, so it must be free of side effects.
// Wrapped in do { } while ( 0 ) so the macro behaves as a single statement.
#define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
	do { \
		vec_st( V0, 0, ADDR ); \
		vec_st( V1, 16, ADDR ); \
		vec_st( V2, 32, ADDR ); \
		vec_st( V3, 48, ADDR ); \
		vec_st( V4, 64, ADDR ); \
		vec_st( V5, 80, ADDR ); \
		vec_st( V6, 96, ADDR ); \
		vec_st( V7, 112, ADDR ); \
	} while ( 0 )
|
|
|
|
// Unaligned storing floats. These macros overwrite their vector arguments,
// so callers must not rely on V0.. after the store.
//
// UNALIGNED_STORE1 rotates V0 with the vec_lvsr permute for ADDR and then
// writes it out one 4-byte element at a time with vec_ste.
#define UNALIGNED_STORE1( ADDR, V0 ) { \
	/* rotation that lines V0 up with ADDR */ \
	vector unsigned char ULStoreRot = vec_lvsr( 0, ADDR ); \
	V0 = vec_perm( V0, V0, ULStoreRot ); \
	/* element stores at byte offsets 0, 4, 8 and 12 */ \
	vec_ste( V0, 0, ADDR ); \
	vec_ste( V0, 4, ADDR ); \
	vec_ste( V0, 8, ADDR ); \
	vec_ste( V0, 12, ADDR ); \
}
|
|
|
|
// Writes V0 and V1 (32 bytes) to ADDR. The quadwords currently at ADDR+0 and
// ADDR+31 are loaded first and blended back in through the select mask, then
// three quadwords are stored. V0 and V1 are overwritten with rotated copies.
#define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
	/* what is in memory now at both ends of the destination */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 31, ADDR ); \
	/* rotation permute for ADDR and the select mask derived from it */ \
	vector unsigned char ULStoreRot = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSel = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRot ); \
	/* rotate the inputs right */ \
	V0 = vec_perm( V0, V0, ULStoreRot ); \
	V1 = vec_perm( V1, V1, ULStoreRot ); \
	/* blend each quadword from its two neighbouring pieces */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSel ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSel ); \
	vector float ULStoreOut3 = vec_sel( V1, ULStoreTail, ULStoreSel ); \
	/* write everything back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); }
|
|
|
|
// Writes V0..V2 (48 bytes) to ADDR. The quadwords currently at ADDR+0 and
// ADDR+47 are loaded first and blended back in through the select mask, then
// four quadwords are stored. V0..V2 are overwritten with rotated copies.
#define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
	/* what is in memory now at both ends of the destination */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 47, ADDR ); \
	/* rotation permute for ADDR and the select mask derived from it */ \
	vector unsigned char ULStoreRot = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSel = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRot ); \
	/* rotate the inputs right */ \
	V0 = vec_perm( V0, V0, ULStoreRot ); \
	V1 = vec_perm( V1, V1, ULStoreRot ); \
	V2 = vec_perm( V2, V2, ULStoreRot ); \
	/* blend each quadword from its two neighbouring pieces */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSel ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSel ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSel ); \
	vector float ULStoreOut4 = vec_sel( V2, ULStoreTail, ULStoreSel ); \
	/* write everything back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); }
|
|
|
|
// Writes V0..V3 (64 bytes) to ADDR. The quadwords currently at ADDR+0 and
// ADDR+63 are loaded first and blended back in through the select mask, then
// five quadwords are stored. V0..V3 are overwritten with rotated copies.
#define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
	/* what is in memory now at both ends of the destination */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 63, ADDR ); \
	/* rotation permute for ADDR and the select mask derived from it */ \
	vector unsigned char ULStoreRot = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSel = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRot ); \
	/* rotate the inputs right */ \
	V0 = vec_perm( V0, V0, ULStoreRot ); \
	V1 = vec_perm( V1, V1, ULStoreRot ); \
	V2 = vec_perm( V2, V2, ULStoreRot ); \
	V3 = vec_perm( V3, V3, ULStoreRot ); \
	/* blend each quadword from its two neighbouring pieces */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSel ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSel ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSel ); \
	vector float ULStoreOut4 = vec_sel( V2, V3, ULStoreSel ); \
	vector float ULStoreOut5 = vec_sel( V3, ULStoreTail, ULStoreSel ); \
	/* write everything back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); \
	vec_st( ULStoreOut5, 63, ADDR ); }
|
|
|
|
// Writes V0..V5 (96 bytes) to ADDR. The quadwords currently at ADDR+0 and
// ADDR+95 are loaded first and blended back in through the select mask, then
// seven quadwords are stored. V0..V5 are overwritten with rotated copies.
#define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
	/* what is in memory now at both ends of the destination */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 95, ADDR ); \
	/* rotation permute for ADDR and the select mask derived from it */ \
	vector unsigned char ULStoreRot = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSel = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRot ); \
	/* rotate the inputs right */ \
	V0 = vec_perm( V0, V0, ULStoreRot ); \
	V1 = vec_perm( V1, V1, ULStoreRot ); \
	V2 = vec_perm( V2, V2, ULStoreRot ); \
	V3 = vec_perm( V3, V3, ULStoreRot ); \
	V4 = vec_perm( V4, V4, ULStoreRot ); \
	V5 = vec_perm( V5, V5, ULStoreRot ); \
	/* blend each quadword from its two neighbouring pieces */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSel ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSel ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSel ); \
	vector float ULStoreOut4 = vec_sel( V2, V3, ULStoreSel ); \
	vector float ULStoreOut5 = vec_sel( V3, V4, ULStoreSel ); \
	vector float ULStoreOut6 = vec_sel( V4, V5, ULStoreSel ); \
	vector float ULStoreOut7 = vec_sel( V5, ULStoreTail, ULStoreSel ); \
	/* write everything back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); \
	vec_st( ULStoreOut5, 63, ADDR ); \
	vec_st( ULStoreOut6, 79, ADDR ); \
	vec_st( ULStoreOut7, 95, ADDR ); }
|
|
|
|
// Writes V0..V8 (144 bytes) to ADDR. The quadwords currently at ADDR+0 and
// ADDR+143 are loaded first and blended back in through the select mask, then
// ten quadwords are stored. V0..V8 are overwritten with rotated copies.
#define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
	/* what is in memory now at both ends of the destination */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 143, ADDR ); \
	/* rotation permute for ADDR and the select mask derived from it */ \
	vector unsigned char ULStoreRot = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSel = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRot ); \
	/* rotate the inputs right */ \
	V0 = vec_perm( V0, V0, ULStoreRot ); \
	V1 = vec_perm( V1, V1, ULStoreRot ); \
	V2 = vec_perm( V2, V2, ULStoreRot ); \
	V3 = vec_perm( V3, V3, ULStoreRot ); \
	V4 = vec_perm( V4, V4, ULStoreRot ); \
	V5 = vec_perm( V5, V5, ULStoreRot ); \
	V6 = vec_perm( V6, V6, ULStoreRot ); \
	V7 = vec_perm( V7, V7, ULStoreRot ); \
	V8 = vec_perm( V8, V8, ULStoreRot ); \
	/* blend each quadword from its two neighbouring pieces */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSel ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSel ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSel ); \
	vector float ULStoreOut4 = vec_sel( V2, V3, ULStoreSel ); \
	vector float ULStoreOut5 = vec_sel( V3, V4, ULStoreSel ); \
	vector float ULStoreOut6 = vec_sel( V4, V5, ULStoreSel ); \
	vector float ULStoreOut7 = vec_sel( V5, V6, ULStoreSel ); \
	vector float ULStoreOut8 = vec_sel( V6, V7, ULStoreSel ); \
	vector float ULStoreOut9 = vec_sel( V7, V8, ULStoreSel ); \
	vector float ULStoreOut10 = vec_sel( V8, ULStoreTail, ULStoreSel ); \
	/* write everything back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); \
	vec_st( ULStoreOut5, 63, ADDR ); \
	vec_st( ULStoreOut6, 79, ADDR ); \
	vec_st( ULStoreOut7, 95, ADDR ); \
	vec_st( ULStoreOut8, 111, ADDR ); \
	vec_st( ULStoreOut9, 127, ADDR ); \
	vec_st( ULStoreOut10, 143, ADDR ); }
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::GetName
|
|
============
|
|
*/
|
|
const char *idSIMD_AltiVec::GetName( void ) const {
|
|
return "AltiVec";
|
|
}
|
|
|
|
/*
|
|
Helper Functions
|
|
*/
|
|
#if 0
|
|
// Prints the values of a vector, useful for debugging but
|
|
// should never be called in real code
|
|
inline void debugPrintVector( vector float v, char *msg ) {
|
|
printf("%s -- %vf\n", msg, v );
|
|
}
|
|
|
|
inline void debugPrintVector( vector unsigned int v, char *msg ) {
|
|
printf("%s -- %vd\n", msg, v );
|
|
}
|
|
|
|
inline void debugPrintVector( vector bool int v, char *msg ) {
|
|
printf("%s -- %vi\n", msg, v );
|
|
}
|
|
|
|
inline void debugPrintVector( vector unsigned char v, char *msg ) {
|
|
printf("%s -- %vuc\n", msg, v );
|
|
}
|
|
|
|
inline void debugPrintVector( vector unsigned short v, char *msg ) {
|
|
printf("%s -- %vs\n", msg, v );
|
|
}
|
|
#endif
|
|
/*
|
|
===============
|
|
Reciprocal
|
|
|
|
For each element in vector:
|
|
n = 1 / n
|
|
===============
|
|
*/
|
|
|
|
// Use Newton-Raphson to calculate reciprocal of a vector
|
|
inline vector float Reciprocal( vector float v ) {
|
|
//Get the reciprocal estimate
|
|
vector float estimate = vec_re( v );
|
|
//One round of Newton-Raphson refinement
|
|
return vec_madd( vec_nmsub( estimate, v, (vector float) (1.0) ), estimate, estimate );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
ReciprocalSquareRoot
|
|
|
|
For each element in vector:
|
|
n = 1 / sqrt(n)
|
|
===============
|
|
*/
|
|
// Reciprocal square root estimate of a vector
|
|
inline vector float ReciprocalSquareRoot( vector float v ) {
|
|
//Get the square root reciprocal estimate
|
|
vector float zero = (vector float)(0);
|
|
vector float oneHalf = (vector float)(0.5);
|
|
vector float one = (vector float)(1.0);
|
|
vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
|
|
|
|
//One round of Newton-Raphson refinement
|
|
vector float estimateSquared = vec_madd( estimate, estimate, zero );
|
|
vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
|
|
return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
|
|
}
|
|
|
|
|
|
/*
|
|
===============
|
|
Divide
|
|
|
|
For each element in vectors:
|
|
n = a / b
|
|
===============
|
|
*/
|
|
// Use reciprocal estimate and multiply to divide a vector
|
|
inline vector float Divide( vector float a, vector float b ) {
|
|
return vec_madd( a, Reciprocal( b ), (vector float)(0) );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
loadSplatUnalignedScalar
|
|
|
|
For each element in vector:
|
|
n = s
|
|
===============
|
|
*/
|
|
inline vector float loadSplatUnalignedScalar( const float *s ) {
|
|
vector unsigned char splatMap = vec_lvsl( 0, s );
|
|
vector float v = vec_ld( 0, s );
|
|
splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
|
|
return vec_perm( v, v, splatMap );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
VectorATan16
|
|
|
|
For each element in vector:
|
|
n = idMath::ATan16( x, y )
|
|
===============
|
|
*/
|
|
// calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
|
|
inline vector float VectorATan16( vector float x, vector float y ) {
|
|
|
|
vector float xDivY = Divide( x, y );
|
|
vector float yDivX = Divide( y, x );
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
|
|
vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
|
|
vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
|
|
vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
|
|
|
|
// do calculation for S
|
|
vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
|
|
|
|
// get the regular S value
|
|
vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
|
|
|
|
// calculate what to return if y > x
|
|
vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
|
|
vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
|
|
vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
|
|
|
|
return vec_sel( modRet, vecS, vecCmp );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
VectorSin16
|
|
|
|
For each element in vector:
|
|
n = idMath::Sin16( v )
|
|
===============
|
|
*/
|
|
inline vector float VectorSin16( vector float v ) {
|
|
vector float zero = (vector float)(0);
|
|
|
|
#if 0
|
|
// load up half PI and use it to calculate the rest of the values. This is
|
|
// sometimes cheaper than loading them from memory
|
|
|
|
vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
|
|
vector float PI = vec_add( halfPI, halfPI );
|
|
vector float oneandhalfPI = vec_add( PI, halfPI );
|
|
vector float twoPI = vec_add( oneandhalfPI, halfPI );
|
|
#else
|
|
vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
|
|
vector float PI = (vector float)(3.14159265358979323846f);
|
|
vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
|
|
vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);
|
|
#endif
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
|
|
|
|
vector float vecMod;
|
|
vector float vecResult;
|
|
|
|
// fix the range if needbe
|
|
vecMod = vec_floor( Divide( v, twoPI ) );
|
|
vecResult = vec_nmsub( vecMod, twoPI, v );
|
|
|
|
vector float vecPIminusA = vec_sub( PI, vecResult );
|
|
vector float vecAminus2PI = vec_sub( vecResult, twoPI );
|
|
|
|
vecCmp1 = vec_cmplt( vecResult, PI );
|
|
vecCmp2 = vec_cmpgt( vecResult, halfPI );
|
|
|
|
// these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
|
|
vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );
|
|
|
|
// we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
|
|
vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false
|
|
|
|
// these are ones where a < PI and a > HALF_PI so we set a = PI - a
|
|
vecCmp1 = vec_and( vecCmp1, vecCmp2 );
|
|
vecCmp1 = vec_or( vecCmp1, vecCmp4 );
|
|
|
|
// put the correct values into place
|
|
vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
|
|
vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );
|
|
|
|
// calculate answer
|
|
vector float vecASquared = vec_madd( vecResult, vecResult, zero );
|
|
vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
|
|
return vec_madd( vecResult, vecEst, zero );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
vecSplatWithRunTime
|
|
|
|
For each element in vector:
|
|
n = v(i)
|
|
===============
|
|
*/
|
|
// splats an element across a vector using a runtime variable
|
|
inline vector float vecSplatWithRunTime( vector float v, int i ) {
|
|
vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
|
|
v = vec_perm( v, v, rotate );
|
|
return vec_splat( v, 0 );
|
|
}
|
|
|
|
|
|
/*
|
|
===============
|
|
FastScalarInvSqrt
|
|
|
|
n = 1 / sqrt( f )
|
|
===============
|
|
*/
|
|
inline float FastScalarInvSqrt( float f ) {
|
|
#ifdef PPC_INTRINSICS
|
|
float estimate;
|
|
const float kSmallestFloat = FLT_MIN;
|
|
|
|
//Calculate a 5 bit starting estimate for the reciprocal sqrt
|
|
estimate = __frsqrte ( f + kSmallestFloat );
|
|
|
|
//if you require less precision, you may reduce the number of loop iterations.
|
|
// This will do 2 rounds of NR
|
|
estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
|
|
estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
|
|
return estimate;
|
|
#else
|
|
return idMath::InvSqrt( f );
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
===============
|
|
FastScalarInvSqrt_x3
|
|
|
|
arg1 = 1 / sqrt( arg1 )
|
|
arg2 = 1 / sqrt( arg2 )
|
|
arg3 = 1 / sqrt( arg3 )
|
|
===============
|
|
*/
|
|
inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
|
|
#ifdef PPC_INTRINSICS
|
|
register float estimate1, estimate2, estimate3;
|
|
const float kSmallestFloat = FLT_MIN;
|
|
|
|
//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
|
|
estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
|
|
estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
|
|
estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
|
|
|
|
// two rounds newton-raphson
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
|
|
*arg1 = estimate1;
|
|
*arg2 = estimate2;
|
|
*arg3 = estimate3;
|
|
#else
|
|
*arg1 = idMath::InvSqrt( *arg1 );
|
|
*arg2 = idMath::InvSqrt( *arg2 );
|
|
*arg3 = idMath::InvSqrt( *arg3 );
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
===============
|
|
FastScalarInvSqrt_x6
|
|
|
|
arg1 = 1 / sqrt( arg1 )
|
|
arg2 = 1 / sqrt( arg2 )
|
|
arg3 = 1 / sqrt( arg3 )
|
|
arg4 = 1 / sqrt( arg4 )
|
|
arg5 = 1 / sqrt( arg5 )
|
|
arg6 = 1 / sqrt( arg6 )
|
|
|
|
On a G5, you've got 2 pipeline stages to fill. (2 FPU's with 6 stages each)
|
|
===============
|
|
*/
|
|
inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
|
|
#ifdef PPC_INTRINSICS
|
|
register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
|
|
const float kSmallestFloat = FLT_MIN;
|
|
|
|
//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
|
|
estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
|
|
estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
|
|
estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
|
|
estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
|
|
estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
|
|
estimate6 = __frsqrte ( *arg6 + kSmallestFloat );
|
|
|
|
// two rounds newton-raphson
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
|
|
estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
|
|
estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
|
|
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
|
|
estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
|
|
estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
|
|
|
|
*arg1 = estimate1;
|
|
*arg2 = estimate2;
|
|
*arg3 = estimate3;
|
|
*arg4 = estimate4;
|
|
*arg5 = estimate5;
|
|
*arg6 = estimate6;
|
|
#else
|
|
*arg1 = idMath::InvSqrt( *arg1 );
|
|
*arg2 = idMath::InvSqrt( *arg2 );
|
|
*arg3 = idMath::InvSqrt( *arg3 );
|
|
*arg4 = idMath::InvSqrt( *arg4 );
|
|
*arg5 = idMath::InvSqrt( *arg5 );
|
|
*arg6 = idMath::InvSqrt( *arg6 );
|
|
#endif
|
|
}
|
|
|
|
|
|
// End Helper Functions
|
|
|
|
#ifdef ENABLE_SIMPLE_MATH
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Add
|
|
|
|
dst[i] = constant + src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
|
|
vector float v0, v1, v2, v3;
|
|
vector float v0_low, v0_hi, v1_hi;
|
|
vector unsigned char permVec;
|
|
vector float constVec;
|
|
int i;
|
|
|
|
// handle unaligned cases at beginning
|
|
for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant + src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v0_hi, v1_hi, permVec );
|
|
|
|
v2 = vec_add( v0, constVec );
|
|
v3 = vec_add( v1, constVec );
|
|
|
|
// store results
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = constant + src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Add
|
|
|
|
dst[i] = src0[i] + src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
|
|
int i;
|
|
|
|
//unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src0[i] + src1[i];
|
|
}
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
v4 = vec_add( v0, v1 );
|
|
v5 = vec_add( v2, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] + src1[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Sub
|
|
|
|
dst[i] = constant - src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
register vector unsigned char permVec;
|
|
register vector float constVec;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant - src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute vector and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
v2 = vec_sub( constVec, v0 );
|
|
v3 = vec_sub( constVec, v1 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = constant - src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Sub
|
|
|
|
dst[i] = src0[i] - src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src0[i] - src1[i];
|
|
}
|
|
|
|
//calculate permute and do first loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
v4 = vec_sub( v0, v1 );
|
|
v5 = vec_sub( v2, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] - src1[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Mul
|
|
|
|
dst[i] = constant * src[i];
|
|
============
|
|
*/
|
|
// Scales an array by a scalar: dst[i] = constant * src[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration are processed with AltiVec, and whatever is left is
// finished with scalar code. src does not need to be 16-byte aligned; it is
// realigned with vec_perm.
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant * src[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming vec_ld touches the 16-byte block that holds src[i]; when
	// i == count that block can lie completely past the end of the array.
	if ( i + 7 < count ) {
		vector float zeroVector = (vector float)(0.0);
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float srcLow, srcMid, srcHigh;
		vector float in0, in1, out0, out1;

		// every lane holds the constant
		vector float constVec = loadSplatUnalignedScalar( &constant );

		// realignment permute for src and the first block of the stream
		vector unsigned char permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
		srcHigh = vec_ld( 0, &src[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			srcLow = srcHigh;
			srcMid = vec_ld( 15, &src[i] );
			srcHigh = vec_ld( 31, &src[i] );

			in0 = vec_perm( srcLow, srcMid, permVec );
			in1 = vec_perm( srcMid, srcHigh, permVec );

			// there is no plain vector float multiply, so multiply-add with zero
			out0 = vec_madd( constVec, in0, zeroVector );
			out1 = vec_madd( constVec, in1, zeroVector );

			// dst[i] is 16-byte aligned here
			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = constant * src[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Mul
|
|
|
|
dst[i] = src0[i] * src1[i];
|
|
============
|
|
*/
|
|
// Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration are processed with AltiVec, and the remainder is done
// with scalar code. Neither source needs to be 16-byte aligned; each gets its
// own realignment permute.
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src0[i] * src1[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming loads touch the 16-byte blocks holding src0[i] / src1[i]; when
	// i == count those blocks can lie completely past the end of the arrays.
	if ( i + 7 < count ) {
		vector float zeroVector = (vector float)(0.0);
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float aLow, aMid, aHigh;		// raw blocks of src0
		vector float bLow, bMid, bHigh;		// raw blocks of src1
		vector float a0, a1, b0, b1, out0, out1;

		// one realignment permute per source
		vector unsigned char permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
		aHigh = vec_ld( 0, &src0[i] );
		bHigh = vec_ld( 0, &src1[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			aLow = aHigh;
			aMid = vec_ld( 15, &src0[i] );
			aHigh = vec_ld( 31, &src0[i] );

			bLow = bHigh;
			bMid = vec_ld( 15, &src1[i] );
			bHigh = vec_ld( 31, &src1[i] );

			a0 = vec_perm( aLow, aMid, permVec1 );
			b0 = vec_perm( bLow, bMid, permVec2 );
			a1 = vec_perm( aMid, aHigh, permVec1 );
			b1 = vec_perm( bMid, bHigh, permVec2 );

			// there is no plain vector float multiply, so multiply-add with zero
			out0 = vec_madd( a0, b0, zeroVector );
			out1 = vec_madd( a1, b1, zeroVector );

			// dst[i] is 16-byte aligned here
			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = src0[i] * src1[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Div
|
|
|
|
dst[i] = constant / divisor[i];
|
|
============
|
|
*/
|
|
// Divides a scalar by every element: dst[i] = constant / divisor[i] for
// i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration go through the vector Divide() helper, and the
// remainder is done with scalar code. divisor does not need to be 16-byte
// aligned; it is realigned with vec_perm.
void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant / divisor[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming vec_ld touches the 16-byte block that holds divisor[i]; when
	// i == count that block can lie completely past the end of the array.
	if ( i + 7 < count ) {
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float divLow, divMid, divHigh;
		vector float d0, d1, out0, out1;

		// every lane holds the constant
		vector float constVec = loadSplatUnalignedScalar( &constant );

		// realignment permute for divisor and the first block of the stream
		vector unsigned char permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
		divHigh = vec_ld( 0, &divisor[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			divLow = divHigh;
			divMid = vec_ld( 15, &divisor[i] );
			divHigh = vec_ld( 31, &divisor[i] );

			d0 = vec_perm( divLow, divMid, permVec );
			d1 = vec_perm( divMid, divHigh, permVec );

			out0 = Divide( constVec, d0 );
			out1 = Divide( constVec, d1 );

			// dst[i] is 16-byte aligned here
			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = constant / divisor[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Div
|
|
|
|
dst[i] = src0[i] / src1[i];
|
|
============
|
|
*/
|
|
// Element-wise quotient: dst[i] = src0[i] / src1[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration go through the vector Divide() helper, and the
// remainder is done with scalar code. Neither source needs to be 16-byte
// aligned; each gets its own realignment permute.
void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src0[i] / src1[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming loads touch the 16-byte blocks holding src0[i] / src1[i]; when
	// i == count those blocks can lie completely past the end of the arrays.
	if ( i + 7 < count ) {
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float numLow, numMid, numHigh;	// raw blocks of src0
		vector float denLow, denMid, denHigh;	// raw blocks of src1
		vector float num0, num1, den0, den1, out0, out1;

		// one realignment permute per source
		vector unsigned char permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
		numHigh = vec_ld( 0, &src0[i] );
		denHigh = vec_ld( 0, &src1[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			numLow = numHigh;
			numMid = vec_ld( 15, &src0[i] );
			numHigh = vec_ld( 31, &src0[i] );

			denLow = denHigh;
			denMid = vec_ld( 15, &src1[i] );
			denHigh = vec_ld( 31, &src1[i] );

			num0 = vec_perm( numLow, numMid, permVec1 );
			den0 = vec_perm( denLow, denMid, permVec2 );
			num1 = vec_perm( numMid, numHigh, permVec1 );
			den1 = vec_perm( denMid, denHigh, permVec2 );

			out0 = Divide( num0, den0 );
			out1 = Divide( num1, den1 );

			// dst[i] is 16-byte aligned here
			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = src0[i] / src1[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulAdd
|
|
|
|
dst[i] += constant * src[i];
|
|
============
|
|
*/
|
|
// Scaled accumulate: dst[i] += constant * src[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration are processed with a fused multiply-add, and the
// remainder is done with scalar code. src does not need to be 16-byte aligned;
// it is realigned with vec_perm.
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] += constant * src[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming vec_ld touches the 16-byte block that holds src[i]; when
	// i == count that block can lie completely past the end of the array.
	if ( i + 7 < count ) {
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float srcLow, srcMid, srcHigh;
		vector float in0, in1, acc0, acc1, out0, out1;

		// every lane holds the constant
		vector float constVec = loadSplatUnalignedScalar( &constant );

		// realignment permute for src and the first block of the stream
		vector unsigned char permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
		srcHigh = vec_ld( 0, &src[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			srcLow = srcHigh;
			srcMid = vec_ld( 15, &src[i] );
			srcHigh = vec_ld( 31, &src[i] );

			in0 = vec_perm( srcLow, srcMid, permVec1 );
			in1 = vec_perm( srcMid, srcHigh, permVec1 );

			// dst[i] is 16-byte aligned here, so it can be loaded directly
			acc0 = vec_ld( 0, &dst[i] );
			acc1 = vec_ld( 16, &dst[i] );

			out0 = vec_madd( constVec, in0, acc0 );
			out1 = vec_madd( constVec, in1, acc1 );

			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] += constant * src[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulAdd
|
|
|
|
dst[i] += src0[i] * src1[i];
|
|
============
|
|
*/
|
|
// Product accumulate: dst[i] += src0[i] * src1[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration are processed with a fused multiply-add, and the
// remainder is done with scalar code. Neither source needs to be 16-byte
// aligned; each gets its own realignment permute.
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] += src0[i] * src1[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming loads touch the 16-byte blocks holding src0[i] / src1[i]; when
	// i == count those blocks can lie completely past the end of the arrays.
	if ( i + 7 < count ) {
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float aLow, aMid, aHigh;		// raw blocks of src0
		vector float bLow, bMid, bHigh;		// raw blocks of src1
		vector float a0, a1, b0, b1, acc0, acc1, out0, out1;

		// one realignment permute per source
		vector unsigned char permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
		aHigh = vec_ld( 0, &src0[i] );
		bHigh = vec_ld( 0, &src1[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			aLow = aHigh;
			aMid = vec_ld( 15, &src0[i] );
			aHigh = vec_ld( 31, &src0[i] );

			bLow = bHigh;
			bMid = vec_ld( 15, &src1[i] );
			bHigh = vec_ld( 31, &src1[i] );

			a0 = vec_perm( aLow, aMid, permVec1 );
			b0 = vec_perm( bLow, bMid, permVec2 );
			a1 = vec_perm( aMid, aHigh, permVec1 );
			b1 = vec_perm( bMid, bHigh, permVec2 );

			// dst[i] is 16-byte aligned here, so it can be loaded directly
			acc0 = vec_ld( 0, &dst[i] );
			acc1 = vec_ld( 16, &dst[i] );

			out0 = vec_madd( a0, b0, acc0 );
			out1 = vec_madd( a1, b1, acc1 );

			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] += src0[i] * src1[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulSub
|
|
|
|
dst[i] -= constant * src[i];
|
|
============
|
|
*/
|
|
// Scaled subtract: dst[i] -= constant * src[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration are processed with vec_nmsub, and the remainder is done
// with scalar code. src does not need to be 16-byte aligned; it is realigned
// with vec_perm.
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] -= constant * src[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming vec_ld touches the 16-byte block that holds src[i]; when
	// i == count that block can lie completely past the end of the array.
	if ( i + 7 < count ) {
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float srcLow, srcMid, srcHigh;
		vector float in0, in1, acc0, acc1, out0, out1;

		// every lane holds the constant
		vector float constVec = loadSplatUnalignedScalar( &constant );

		// realignment permute for src and the first block of the stream
		vector unsigned char permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
		srcHigh = vec_ld( 0, &src[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			srcLow = srcHigh;
			srcMid = vec_ld( 15, &src[i] );
			srcHigh = vec_ld( 31, &src[i] );

			in0 = vec_perm( srcLow, srcMid, permVec1 );
			in1 = vec_perm( srcMid, srcHigh, permVec1 );

			// dst[i] is 16-byte aligned here because the preceding unaligned
			// elements were already handled by the prologue
			acc0 = vec_ld( 0, &dst[i] );
			acc1 = vec_ld( 16, &dst[i] );

			// vec_nmsub( a, b, c ) yields c - a * b
			out0 = vec_nmsub( in0, constVec, acc0 );
			out1 = vec_nmsub( in1, constVec, acc1 );

			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] -= constant * src[i];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulSub
|
|
|
|
dst[i] -= src0[i] * src1[i];
|
|
============
|
|
*/
|
|
// Product subtract: dst[i] -= src0[i] * src1[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// floats per iteration are processed with vec_nmsub, and the remainder is done
// with scalar code. Neither source needs to be 16-byte aligned; each gets its
// own realignment permute.
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] -= src0[i] * src1[i];
	}

	// Only prime the vector pipeline when at least one full iteration will run.
	// The priming loads touch the 16-byte blocks holding src0[i] / src1[i]; when
	// i == count those blocks can lie completely past the end of the arrays.
	if ( i + 7 < count ) {
		vector unsigned char oneCharVector = (vector unsigned char)(1);
		vector float aLow, aMid, aHigh;		// raw blocks of src0
		vector float bLow, bMid, bHigh;		// raw blocks of src1
		vector float a0, a1, b0, b1, acc0, acc1, out0, out1;

		// one realignment permute per source
		vector unsigned char permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
		aHigh = vec_ld( 0, &src0[i] );
		bHigh = vec_ld( 0, &src1[i] );

		for ( ; i + 7 < count; i += 8 ) {
			// the last block of the previous iteration is the first block of this one
			aLow = aHigh;
			aMid = vec_ld( 15, &src0[i] );
			aHigh = vec_ld( 31, &src0[i] );

			bLow = bHigh;
			bMid = vec_ld( 15, &src1[i] );
			bHigh = vec_ld( 31, &src1[i] );

			a0 = vec_perm( aLow, aMid, permVec1 );
			b0 = vec_perm( bLow, bMid, permVec2 );
			a1 = vec_perm( aMid, aHigh, permVec1 );
			b1 = vec_perm( bMid, bHigh, permVec2 );

			// dst[i] is 16-byte aligned here, so it can be loaded directly
			acc0 = vec_ld( 0, &dst[i] );
			acc1 = vec_ld( 16, &dst[i] );

			// vec_nmsub( a, b, c ) yields c - a * b
			out0 = vec_nmsub( a0, b0, acc0 );
			out1 = vec_nmsub( a1, b1, acc1 );

			ALIGNED_STORE2( &dst[i], out0, out1 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] -= src0[i] * src1[i];
	}
}
|
|
|
|
#endif /* ENABLE_SIMPLE_MATH */
|
|
|
|
#ifdef ENABLE_DOT
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i];
|
|
============
|
|
*/
|
|
// Dot product of a fixed vector with an array of vectors:
// dst[i] = constant * src[i] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// idVec3 (24 floats) per iteration are de-interleaved into X/Y/Z vectors and
// combined with multiply-adds, and the remainder is done with scalar code.
//
// The realignment permute and the priming load are derived from the element
// the vector loop actually starts at (src[i] after the prologue). Deriving
// them from src[0] is only correct when the prologue did not advance, because
// each idVec3 is 12 bytes and so shifts the 16-byte alignment of the stream.
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant * src[i];
	}

	// Only set up the vector path when at least one full iteration will run, so
	// the priming load never touches a block past the end of src.
	if ( i + 7 < count ) {
		vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
		vector float vecX, vecY, vecZ;
		vector float vecX2, vecY2, vecZ2;
		vector float vecConstX, vecConstY, vecConstZ;
		vector float zeroVector = (vector float)(0.0);

		// permutes that gather x, y and z out of three consecutive xyzx/yzxy/zxyz vectors
		vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
		vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);

		vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
		vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);

		vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
		vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);

		// load the (possibly unaligned) constant and splat each component
		const float *constPtr = constant.ToFloatPtr();
		vector unsigned char constPerm = vec_lvsl( 0, constPtr );
		vecLd1 = vec_ld( 0, constPtr );
		vecLd2 = vec_ld( 11, constPtr );
		vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );

		vecConstX = vec_splat( vecLd1, 0 );
		vecConstY = vec_splat( vecLd1, 1 );
		vecConstZ = vec_splat( vecLd1, 2 );

		// prime the stream at the first element the vector loop will read
		const float *addr = src[0].ToFloatPtr();
		const float *vecPtr = addr + (i*3);
		vector unsigned char permVec = vec_add( vec_lvsl( -1, vecPtr ), (vector unsigned char)(1) );
		vector float vecOld = vec_ld( 0, vecPtr );

		for ( ; i + 7 < count; i += 8 ) {
			vector float v0, v1, v2, v3, v4, v5;

			vecPtr = addr + (i*3);

			// the last block of the previous iteration is the first block of this one
			v0 = vecOld;
			v1 = vec_ld( 15, vecPtr );
			v2 = vec_ld( 31, vecPtr );
			v3 = vec_ld( 47, vecPtr );
			v4 = vec_ld( 63, vecPtr );
			v5 = vec_ld( 79, vecPtr );
			vecOld = vec_ld( 95, vecPtr );

			// realign into six vectors holding eight consecutive idVec3
			vecLd1 = vec_perm( v0, v1, permVec );
			vecLd2 = vec_perm( v1, v2, permVec );
			vecLd3 = vec_perm( v2, v3, permVec );

			vecLd4 = vec_perm( v3, v4, permVec );
			vecLd5 = vec_perm( v4, v5, permVec );
			vecLd6 = vec_perm( v5, vecOld, permVec );

			// permute into X Y Z vectors
			vecX = vec_perm( vecLd1, vecLd2, permX1 );
			vecY = vec_perm( vecLd1, vecLd2, permY1 );
			vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
			vecX = vec_perm( vecX, vecLd3, permX2 );
			vecY = vec_perm( vecY, vecLd3, permY2 );
			vecZ = vec_perm( vecZ, vecLd3, permZ2 );

			vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
			vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
			vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
			vecX2 = vec_perm( vecX2, vecLd6, permX2 );
			vecY2 = vec_perm( vecY2, vecLd6, permY2 );
			vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );

			// x*cx, then + y*cy, then + z*cz
			vecX = vec_madd( vecX, vecConstX, zeroVector );
			vecY = vec_madd( vecY, vecConstY, vecX );
			vecZ = vec_madd( vecZ, vecConstZ, vecY );

			vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
			vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
			vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );

			// dst[i] is 16-byte aligned here
			ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = constant * src[i];
	}
}
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i].Normal() + src[i][3];
|
|
============
|
|
*/
|
|
// Signed distance style evaluation of a fixed vector against an array of planes:
// dst[i] = constant * src[i].Normal() + src[i][3] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// planes per iteration are transposed into X/Y/Z/D vectors and combined with
// multiply-adds, and the remainder is done with scalar code.
//
// All plane addressing is relative to src[0]. Basing the pointer at src[i]
// after the prologue and then adding i*PLANE_OFFSET again applies the offset
// twice whenever the prologue advanced, which reads the wrong planes and runs
// past the end of the array.
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );

	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant * src[i].Normal() + src[i][3];
	}

	// Only set up the vector path when at least one full iteration will run, so
	// the priming load never touches a block past the end of src.
	if ( i + 7 < count ) {
		vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
		vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
		vector float vecX, vecY, vecZ, vecI3;
		vector float vecX2, vecY2, vecZ2, vecI32;
		vector float vecConstX, vecConstY, vecConstZ;

		// load the (possibly unaligned) constant and splat each component
		const float *constPtr = constant.ToFloatPtr();
		vector unsigned char constPerm = vec_lvsl( 0, constPtr );
		vector float constLow = vec_ld( 0, constPtr );
		vector float constHigh = vec_ld( 11, constPtr );
		vector float vecConst = vec_perm( constLow, constHigh, constPerm );

		vecConstX = vec_splat( vecConst, 0 );
		vecConstY = vec_splat( vecConst, 1 );
		vecConstZ = vec_splat( vecConst, 2 );

		// prime the stream at the first plane the vector loop will read
		const float *addr = src[0].ToFloatPtr();
		const float *planePtr = addr + (i*PLANE_OFFSET);
		vector unsigned char permVec = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
		vector float vecOld = vec_ld( 0, planePtr );

		for ( ; i + 7 < count; i += 8 ) {
			vector float v0, v1, v2, v3, v4, v5, v6, v7;

			planePtr = addr + (i*PLANE_OFFSET);

			// the last block of the previous iteration is the first block of this one
			v0 = vecOld;
			v1 = vec_ld( 15, planePtr );
			v2 = vec_ld( 31, planePtr );
			v3 = vec_ld( 47, planePtr );
			v4 = vec_ld( 63, planePtr );
			v5 = vec_ld( 79, planePtr );
			v6 = vec_ld( 95, planePtr );
			v7 = vec_ld( 111, planePtr );
			vecOld = vec_ld( 127, planePtr );

			// realign: one vector per plane
			vecPlaneLd1 = vec_perm( v0, v1, permVec );
			vecPlaneLd2 = vec_perm( v1, v2, permVec );
			vecPlaneLd3 = vec_perm( v2, v3, permVec );
			vecPlaneLd4 = vec_perm( v3, v4, permVec );

			vecPlaneLd5 = vec_perm( v4, v5, permVec );
			vecPlaneLd6 = vec_perm( v5, v6, permVec );
			vecPlaneLd7 = vec_perm( v6, v7, permVec );
			vecPlaneLd8 = vec_perm( v7, vecOld, permVec );

			// 4x4 transpose of the first four planes into X Y Z D vectors
			v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
			v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
			v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
			v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );

			vecX = vec_mergeh( v0, v1 );
			vecY = vec_mergel( v0, v1 );
			vecZ = vec_mergeh( v2, v3 );
			vecI3 = vec_mergel( v2, v3 );

			// same transpose for the second four planes
			v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
			v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
			v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
			v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );

			vecX2 = vec_mergeh( v4, v5 );
			vecY2 = vec_mergel( v4, v5 );
			vecZ2 = vec_mergeh( v6, v7 );
			vecI32 = vec_mergel( v6, v7 );

			// z*cz + d, then + y*cy, then + x*cx
			v6 = vec_madd( vecZ, vecConstZ, vecI3 );
			v5 = vec_madd( vecY, vecConstY, v6 );
			v4 = vec_madd( vecX, vecConstX, v5 );

			v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
			v1 = vec_madd( vecY2, vecConstY, v0 );
			v2 = vec_madd( vecX2, vecConstX, v1 );

			// dst[i] is 16-byte aligned here
			ALIGNED_STORE2( &dst[i], v4, v2 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = constant * src[i].Normal() + src[i][3];
	}
}
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i].xyz;
|
|
============
|
|
*/
|
|
// Dot product of a fixed vector with the position of every draw vertex:
// dst[i] = constant * src[i].xyz for i in [0, count).
//
// Variant for the unpadded idDrawVert layout (60 bytes per the original
// author's note, checked against DRAWVERT_OFFSET by the assert). dst is walked
// with scalar code until dst[i] is 16-byte aligned, then four vertices per
// iteration are transposed into X/Y/Z vectors, and the remainder is done with
// scalar code.
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );

	vector float zeroVector = (vector float)(0.0);
	vector float constX, constY, constZ;
	int i;

	// load the (possibly unaligned) constant and splat each component
	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
	vector float constLow = vec_ld( 0, constant.ToFloatPtr() );
	vector float constHigh = vec_ld( 11, constant.ToFloatPtr() );
	vector float constXYZ = vec_perm( constLow, constHigh, constPerm );

	constX = vec_splat( constXYZ, 0 );
	constY = vec_splat( constXYZ, 1 );
	constZ = vec_splat( constXYZ, 2 );

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant * src[i].xyz;
	}

	if ( i + 3 < count ) {
		// Every fourth vertex has the same alignment, so the four permutes
		// computed for the first group are reused for all later groups.
		vector unsigned char permA = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vector unsigned char permB = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vector unsigned char permC = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vector unsigned char permD = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );

		while ( i + 3 < count ) {
			const float *posA = src[i].xyz.ToFloatPtr();
			const float *posB = src[i+1].xyz.ToFloatPtr();
			const float *posC = src[i+2].xyz.ToFloatPtr();
			const float *posD = src[i+3].xyz.ToFloatPtr();

			// two loads per vertex cover the 12 bytes of xyz at any alignment
			vector float aLow = vec_ld( 0, posA );
			vector float aHigh = vec_ld( 11, posA );
			vector float bLow = vec_ld( 0, posB );
			vector float bHigh = vec_ld( 11, posB );
			vector float cLow = vec_ld( 0, posC );
			vector float cHigh = vec_ld( 11, posC );
			vector float dLow = vec_ld( 0, posD );
			vector float dHigh = vec_ld( 11, posD );

			// each of these holds x y z plus one junk lane
			vector float vertA = vec_perm( aLow, aHigh, permA );
			vector float vertB = vec_perm( bLow, bHigh, permB );
			vector float vertC = vec_perm( cLow, cHigh, permC );
			vector float vertD = vec_perm( dLow, dHigh, permD );

			// transpose into X Y Z vectors
			vector float mixACHigh = vec_mergeh( vertA, vertC );
			vector float mixBDHigh = vec_mergeh( vertB, vertD );
			vector float mixACLow = vec_mergel( vertA, vertC );
			vector float mixBDLow = vec_mergel( vertB, vertD );

			vector float allX = vec_mergeh( mixACHigh, mixBDHigh );
			vector float allY = vec_mergel( mixACHigh, mixBDHigh );
			vector float allZ = vec_mergeh( mixACLow, mixBDLow );

			// x*cx, then + y*cy, then + z*cz
			vector float sum = vec_madd( allX, constX, zeroVector );
			sum = vec_madd( allY, constY, sum );
			sum = vec_madd( allZ, constZ, sum );

			// dst[i] is 16-byte aligned here
			vec_st( sum, 0, &dst[i] );

			i += 4;
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = constant * src[i].xyz;
	}
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i].xyz;
|
|
============
|
|
*/
|
|
// Dot product of a fixed vector with the position of every draw vertex:
// dst[i] = constant * src[i].xyz for i in [0, count).
//
// Variant for the padded idDrawVert layout (64 bytes per the original author's
// note, checked against DRAWVERT_OFFSET by the assert). dst is walked with
// scalar code until dst[i] is 16-byte aligned, then four vertices per
// iteration are transposed into X/Y/Z vectors, and the remainder is done with
// scalar code.
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );

	vector float zeroVector = (vector float)(0.0);
	vector float constX, constY, constZ;
	int i;

	// load the (possibly unaligned) constant and splat each component
	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
	vector float constLow = vec_ld( 0, constant.ToFloatPtr() );
	vector float constHigh = vec_ld( 11, constant.ToFloatPtr() );
	vector float constXYZ = vec_perm( constLow, constHigh, constPerm );

	constX = vec_splat( constXYZ, 0 );
	constY = vec_splat( constXYZ, 1 );
	constZ = vec_splat( constXYZ, 2 );

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant * src[i].xyz;
	}

	while ( i + 3 < count ) {
		// A single load per vertex, with no realignment permute.
		// NOTE(review): presumably relies on xyz being 16-byte aligned in the
		// padded layout -- confirm against the idDrawVert definition.
		vector float vertA = vec_ld( 0, src[i].xyz.ToFloatPtr() );
		vector float vertB = vec_ld( 0, src[i+1].xyz.ToFloatPtr() );
		vector float vertC = vec_ld( 0, src[i+2].xyz.ToFloatPtr() );
		vector float vertD = vec_ld( 0, src[i+3].xyz.ToFloatPtr() );

		// transpose into X Y Z vectors
		vector float mixACHigh = vec_mergeh( vertA, vertC );
		vector float mixBDHigh = vec_mergeh( vertB, vertD );
		vector float mixACLow = vec_mergel( vertA, vertC );
		vector float mixBDLow = vec_mergel( vertB, vertD );

		vector float allX = vec_mergeh( mixACHigh, mixBDHigh );
		vector float allY = vec_mergel( mixACHigh, mixBDHigh );
		vector float allZ = vec_mergeh( mixACLow, mixBDLow );

		// x*cx, then + y*cy, then + z*cz
		vector float sum = vec_madd( allX, constX, zeroVector );
		sum = vec_madd( allY, constY, sum );
		sum = vec_madd( allZ, constZ, sum );

		// dst[i] is 16-byte aligned here
		vec_st( sum, 0, &dst[i] );

		i += 4;
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = constant * src[i].xyz;
	}
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i] + constant[3];
|
|
============
|
|
*/
|
|
// Evaluates a fixed plane against an array of points:
// dst[i] = constant.Normal() * src[i] + constant[3] for i in [0, count).
//
// dst is walked with scalar code until dst[i] is 16-byte aligned, then eight
// idVec3 (24 floats) per iteration are de-interleaved into X/Y/Z vectors and
// combined with multiply-adds, and the remainder is done with scalar code.
//
// All vector addressing is relative to src[0]. Basing the pointer at src[i]
// after the prologue and then adding i*3 again applies the offset twice
// whenever the prologue advanced: the loop reads the wrong elements and the
// realignment permute no longer matches the pointer it is applied to.
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
	idVec3 constNormal = constant.Normal();
	float const3 = constant[3];
	int i;

	// scalar prologue: advance until dst[i] sits on a 16-byte boundary
	for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant.Normal() * src[i] + constant[3];
	}

	// Only set up the vector path when at least one full iteration will run, so
	// the priming load never touches a block past the end of src.
	if ( i + 7 < count ) {
		vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
		vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
		vector float vecConstX, vecConstY, vecConstZ;
		vector float vecConst3;
		vector float zeroVector = (vector float)(0.0);

		// permutes that gather x, y and z out of three consecutive xyzx/yzxy/zxyz vectors
		vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
		vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);

		vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
		vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);

		vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
		vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);

		// load the (possibly unaligned) plane and splat the normal components
		const float *constPtr = constant.ToFloatPtr();
		vector unsigned char constPerm = vec_lvsl( 0, constPtr );
		vecLd1 = vec_ld( 0, constPtr );
		vecLd2 = vec_ld( 15, constPtr );
		vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );

		vecConstX = vec_splat( vecLd1, 0 );
		vecConstY = vec_splat( vecLd1, 1 );
		vecConstZ = vec_splat( vecLd1, 2 );

		// every lane holds constant[3], added after the dot product
		vecConst3 = loadSplatUnalignedScalar( &const3 );

		// prime the stream at the first element the vector loop will read
		const float *addr = src[0].ToFloatPtr();
		const float *vecPtr = addr + (i*3);
		vector unsigned char permVec = vec_add( vec_lvsl( -1, vecPtr ), (vector unsigned char)(1) );
		vector float vecOld = vec_ld( 0, vecPtr );

		for ( ; i + 7 < count; i += 8 ) {
			vector float v0, v1, v2, v3, v4, v5;

			vecPtr = addr + (i*3);

			// the last block of the previous iteration is the first block of this one
			v0 = vecOld;
			v1 = vec_ld( 15, vecPtr );
			v2 = vec_ld( 31, vecPtr );
			v3 = vec_ld( 47, vecPtr );
			v4 = vec_ld( 63, vecPtr );
			v5 = vec_ld( 79, vecPtr );
			vecOld = vec_ld( 95, vecPtr );

			// realign into six vectors holding eight consecutive idVec3
			vecLd1 = vec_perm( v0, v1, permVec );
			vecLd2 = vec_perm( v1, v2, permVec );
			vecLd3 = vec_perm( v2, v3, permVec );

			vecLd4 = vec_perm( v3, v4, permVec );
			vecLd5 = vec_perm( v4, v5, permVec );
			vecLd6 = vec_perm( v5, vecOld, permVec );

			// permute into X Y Z vectors
			vecX = vec_perm( vecLd1, vecLd2, permX1 );
			vecY = vec_perm( vecLd1, vecLd2, permY1 );
			vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
			vecX = vec_perm( vecX, vecLd3, permX2 );
			vecY = vec_perm( vecY, vecLd3, permY2 );
			vecZ = vec_perm( vecZ, vecLd3, permZ2 );

			vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
			vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
			vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
			vecX2 = vec_perm( vecX2, vecLd6, permX2 );
			vecY2 = vec_perm( vecY2, vecLd6, permY2 );
			vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );

			// x*nx, then + y*ny, then + z*nz
			vecX = vec_madd( vecX, vecConstX, zeroVector );
			vecY = vec_madd( vecY, vecConstY, vecX );
			vecZ = vec_madd( vecZ, vecConstZ, vecY );

			vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
			vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
			vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );

			// add in constant[3]
			vecZ = vec_add( vecZ, vecConst3 );
			vecZ2 = vec_add( vecZ2, vecConst3 );

			// dst[i] is 16-byte aligned here
			ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
		}
	}

	// scalar epilogue
	for ( ; i < count; i++ ) {
		dst[i] = constNormal * src[i] + const3;
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
|
|
|
|
// check plane size
|
|
assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
|
|
|
|
float constVal[4];
|
|
float srcVal[4];
|
|
|
|
int i;
|
|
const float *constPtr = constant.ToFloatPtr();
|
|
|
|
register vector float vecX, vecY, vecZ, vecI3;
|
|
register vector float vecX2, vecY2, vecZ2, vecI32;
|
|
|
|
vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
|
|
vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
|
|
|
|
constVal[0] = *(constPtr);
|
|
constVal[1] = *(constPtr+1);
|
|
constVal[2] = *(constPtr+2);
|
|
constVal[3] = *(constPtr+3);
|
|
|
|
// populate const vector
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
|
|
vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
|
|
vector float vecConst = vec_perm( v0, v1, constPerm );
|
|
|
|
vecConstX = vec_splat( vecConst, 0 );
|
|
vecConstY = vec_splat( vecConst, 1 );
|
|
vecConstZ = vec_splat( vecConst, 2 );
|
|
vecConstI3 = vec_splat( vecConst, 3 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
}
|
|
|
|
const float *srcPtr = src[i].ToFloatPtr();
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, srcPtr );
|
|
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
v0 = vecOld; // vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
v2 = vec_ld( 31, planePtr );
|
|
v3 = vec_ld( 47, planePtr );
|
|
v4 = vec_ld( 63, planePtr );
|
|
v5 = vec_ld( 79, planePtr );
|
|
v6 = vec_ld( 95, planePtr );
|
|
v7 = vec_ld( 111, planePtr );
|
|
vecOld = vec_ld( 127, planePtr );
|
|
|
|
vecPlaneLd1 = vec_perm( v0, v1, permVec );
|
|
vecPlaneLd2 = vec_perm( v1, v2, permVec );
|
|
vecPlaneLd3 = vec_perm( v2, v3, permVec );
|
|
vecPlaneLd4 = vec_perm( v3, v4, permVec );
|
|
|
|
vecPlaneLd5 = vec_perm( v4, v5, permVec );
|
|
vecPlaneLd6 = vec_perm( v5, v6, permVec );
|
|
vecPlaneLd7 = vec_perm( v6, v7, permVec );
|
|
vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
|
|
|
|
// permute into X Y Z vectors, since this is square its basically
|
|
// a matrix transpose
|
|
v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
|
|
v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
|
|
v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
|
|
v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
|
|
|
|
vecX = vec_mergeh( v0, v1 );
|
|
vecY = vec_mergel( v0, v1 );
|
|
vecZ = vec_mergeh( v2, v3 );
|
|
vecI3 = vec_mergel( v2, v3 );
|
|
|
|
v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
|
|
v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
|
|
v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
|
|
v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
|
|
|
|
vecX2 = vec_mergeh( v4, v5 );
|
|
vecY2 = vec_mergel( v4, v5 );
|
|
vecZ2 = vec_mergeh( v6, v7 );
|
|
vecI32 = vec_mergel( v6, v7 );
|
|
|
|
// do calculation
|
|
v4 = vec_madd( vecConstX, vecX, zeroVector );
|
|
v5 = vec_madd( vecConstY, vecY, v4 );
|
|
v6 = vec_madd( vecConstZ, vecZ, v5 );
|
|
v7 = vec_madd( vecConstI3, vecI3, v6 );
|
|
|
|
v0 = vec_madd( vecConstX, vecX2, zeroVector );
|
|
v1 = vec_madd( vecConstY, vecY2, v0 );
|
|
v2 = vec_madd( vecConstZ, vecZ2, v1 );
|
|
v3 = vec_madd( vecConstI3, vecI32, v2 );
|
|
|
|
//store result
|
|
ALIGNED_STORE2( &dst[i], v7, v3 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
//dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
|
|
srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
|
|
srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
|
|
srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
|
|
dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
|
|
}
|
|
}
|
|
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
|
|
|
|
// idDrawVert size is 60 bytes
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
|
|
|
|
int i;
|
|
const float *constPtr = constant.ToFloatPtr();
|
|
const float *srcPtr = src[0].xyz.ToFloatPtr();
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
|
|
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
|
|
register vector float vecDest1;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
|
|
float constVal[4];
|
|
float srcVal[3];
|
|
|
|
constVal[0] = *(constPtr+0);
|
|
constVal[1] = *(constPtr+1);
|
|
constVal[2] = *(constPtr+2);
|
|
constVal[3] = *(constPtr+3);
|
|
|
|
// populate const vec
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
v0 = vec_ld( 0, constant.ToFloatPtr() );
|
|
v1 = vec_ld( 15, constant.ToFloatPtr() );
|
|
v0 = vec_perm( v0, v1, constPerm );
|
|
|
|
vecConstX = vec_splat( v0, 0 );
|
|
vecConstY = vec_splat( v0, 1 );
|
|
vecConstZ = vec_splat( v0, 2 );
|
|
vecConstI3 = vec_splat( v0, 3 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
}
|
|
|
|
// every fourth one will have the same alignment, so can store these. Make sure we
|
|
// have enough so we don't run off the end of the array
|
|
if ( i+3 < count ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
|
|
for ( ; i+3 < count; i+=4 ) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 11, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 11, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 11, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 11, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v2 = vec_perm( v2, v3, vertPerm2 );
|
|
v4 = vec_perm( v4, v5, vertPerm3 );
|
|
v6 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
// transpose into X Y Z vectors
|
|
v1 = vec_mergeh( v0, v4 );
|
|
v3 = vec_mergeh( v2, v6 );
|
|
v5 = vec_mergel( v0, v4 );
|
|
v7 = vec_mergel( v2, v6 );
|
|
|
|
vecSrcX1 = vec_mergeh( v1, v3 );
|
|
vecSrcY1 = vec_mergel( v1, v3 );
|
|
vecSrcZ1 = vec_mergeh( v5, v7 );
|
|
|
|
// now calculate dot product
|
|
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
|
|
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
|
|
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
|
|
vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
|
|
|
|
// store results
|
|
vec_st( vecDest1, 0, &dst[i] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
|
|
srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
|
|
srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
|
|
// dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
|
|
dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
|
|
dst[i] += constVal[3];
|
|
}
|
|
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];
|
|
|
|
// idDrawVert size is 60 bytes
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
|
|
|
|
int i;
|
|
const float *constPtr = constant.ToFloatPtr();
|
|
const float *srcPtr = src[0].xyz.ToFloatPtr();
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
|
|
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
|
|
register vector float vecDest1;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
|
|
float constVal[4];
|
|
float srcVal[3];
|
|
|
|
constVal[0] = *(constPtr+0);
|
|
constVal[1] = *(constPtr+1);
|
|
constVal[2] = *(constPtr+2);
|
|
constVal[3] = *(constPtr+3);
|
|
|
|
// populate const vec
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
v0 = vec_ld( 0, constant.ToFloatPtr() );
|
|
v1 = vec_ld( 15, constant.ToFloatPtr() );
|
|
v0 = vec_perm( v0, v1, constPerm );
|
|
|
|
vecConstX = vec_splat( v0, 0 );
|
|
vecConstY = vec_splat( v0, 1 );
|
|
vecConstZ = vec_splat( v0, 2 );
|
|
vecConstI3 = vec_splat( v0, 3 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
}
|
|
|
|
for ( ; i+3 < count; i+=4 ) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
// transpose into X Y Z vectors
|
|
v1 = vec_mergeh( v0, v4 );
|
|
v3 = vec_mergeh( v2, v6 );
|
|
v5 = vec_mergel( v0, v4 );
|
|
v7 = vec_mergel( v2, v6 );
|
|
|
|
vecSrcX1 = vec_mergeh( v1, v3 );
|
|
vecSrcY1 = vec_mergel( v1, v3 );
|
|
vecSrcZ1 = vec_mergeh( v5, v7 );
|
|
|
|
// now calculate dot product
|
|
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
|
|
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
|
|
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
|
|
vecDest1 = vec_add( vecSrcZ1, vecConstI3 );
|
|
|
|
// store results
|
|
vec_st( vecDest1, 0, &dst[i] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
|
|
srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
|
|
srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
|
|
// dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
|
|
dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
|
|
dst[i] += constVal[3];
|
|
}
|
|
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];
|
|
|
|
int i;
|
|
float src0Val[3];
|
|
float src1Val[3];
|
|
|
|
register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
|
|
vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
|
|
register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
|
|
register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
// permute vectors
|
|
register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
|
|
register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
|
|
register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
|
|
register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
|
|
register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
|
|
register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src0[i] * src1[i];
|
|
}
|
|
|
|
const float *src0Ptr = src0[i].ToFloatPtr();
|
|
const float *src1Ptr = src1[i].ToFloatPtr();
|
|
vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr ), (vector unsigned char)(1) );
|
|
vector float vecOld0 = vec_ld( 0, src0Ptr );
|
|
vector float vecOld1 = vec_ld( 0, src1Ptr );
|
|
|
|
for ( i = 0; i+7 < count; i += 8 ) {
|
|
float *s0Ptr = (float*)( src0Ptr + (i*3) );
|
|
float *s1Ptr = (float*)( src1Ptr + (i*3) );
|
|
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
|
|
v0 = vecOld0;
|
|
v1 = vec_ld( 15, s0Ptr );
|
|
v2 = vec_ld( 31, s0Ptr );
|
|
v3 = vec_ld( 47, s0Ptr );
|
|
v4 = vec_ld( 63, s0Ptr );
|
|
v5 = vec_ld( 79, s0Ptr );
|
|
vecOld0 = vec_ld( 95, s0Ptr );
|
|
|
|
v6 = vecOld1;
|
|
v7 = vec_ld( 15, s1Ptr );
|
|
v8 = vec_ld( 31, s1Ptr );
|
|
v9 = vec_ld( 47, s1Ptr );
|
|
v10 = vec_ld( 63, s1Ptr );
|
|
v11 = vec_ld( 79, s1Ptr );
|
|
vecOld1 = vec_ld( 95, s1Ptr );
|
|
|
|
vecLd1 = vec_perm( v0, v1, permVec1 );
|
|
vecLd2 = vec_perm( v1, v2, permVec1 );
|
|
vecLd3 = vec_perm( v2, v3, permVec1 );
|
|
vecLd4 = vec_perm( v3, v4, permVec1 );
|
|
vecLd5 = vec_perm( v4, v5, permVec1 );
|
|
vecLd6 = vec_perm( v5, vecOld0, permVec1 );
|
|
|
|
vecLd7 = vec_perm( v6, v7, permVec2 );
|
|
vecLd8 = vec_perm( v7, v8, permVec2 );
|
|
vecLd9 = vec_perm( v8, v9, permVec2 );
|
|
vecLd10 = vec_perm( v9, v10, permVec2 );
|
|
vecLd11 = vec_perm( v10, v11, permVec2 );
|
|
vecLd12 = vec_perm( v11, vecOld1, permVec2 );
|
|
|
|
// permute into X Y Z vectors
|
|
vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
|
|
vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
|
|
vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
|
|
vecX0 = vec_perm( vecX0, vecLd3, permX2 );
|
|
vecY0 = vec_perm( vecY0, vecLd3, permY2 );
|
|
vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );
|
|
|
|
vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
|
|
vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
|
|
vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
|
|
vecX02 = vec_perm( vecX02, vecLd6, permX2 );
|
|
vecY02 = vec_perm( vecY02, vecLd6, permY2 );
|
|
vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );
|
|
|
|
vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
|
|
vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
|
|
vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
|
|
vecX1 = vec_perm( vecX1, vecLd9, permX2 );
|
|
vecY1 = vec_perm( vecY1, vecLd9, permY2 );
|
|
vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );
|
|
|
|
vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
|
|
vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
|
|
vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
|
|
vecX12 = vec_perm( vecX12, vecLd12, permX2 );
|
|
vecY12 = vec_perm( vecY12, vecLd12, permY2 );
|
|
vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );
|
|
|
|
// do multiply
|
|
vecX0 = vec_madd( vecX0, vecX1, zeroVector );
|
|
vecY0 = vec_madd( vecY0, vecY1, vecX0 );
|
|
vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
|
|
vecX02 = vec_madd( vecX02, vecX12, zeroVector );
|
|
vecY02 = vec_madd( vecY02, vecY12, vecX02 );
|
|
vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );
|
|
|
|
// store out results
|
|
ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
// dst[i] = src0[i] * src1[i];
|
|
src0Val[0] = *( src0Ptr + (i*3) + 0 );
|
|
src0Val[1] = *( src0Ptr + (i*3) + 1 );
|
|
src0Val[2] = *( src0Ptr + (i*3) + 2 );
|
|
|
|
src1Val[0] = *( src1Ptr + (i*3) + 0 );
|
|
src1Val[1] = *( src1Ptr + (i*3) + 1 );
|
|
src1Val[2] = *( src1Ptr + (i*3) + 2 );
|
|
|
|
dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
|
|
dot = 0.0f;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float zeroVector;
|
|
register vector float runningTotal1, runningTotal2;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
|
|
int i = 0;
|
|
|
|
runningTotal1 = (vector float)(0.0);
|
|
runningTotal2 = (vector float)(0.0);
|
|
zeroVector = (vector float)(0.0);
|
|
|
|
if ( count >= 8 ) {
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src1[i] );
|
|
v3_hi = vec_ld( 0, &src2[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load sources
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src1[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src2[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src2[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
//multiply together and keep running sum
|
|
runningTotal1 = vec_madd( v0, v1, runningTotal1 );
|
|
runningTotal2 = vec_madd( v2, v3, runningTotal2 );
|
|
}
|
|
|
|
runningTotal1 = vec_add( runningTotal1, runningTotal2 );
|
|
|
|
// sum accross vector
|
|
v0 = vec_add( runningTotal1, vec_sld( runningTotal1, runningTotal1, 8 ) );
|
|
v1 = vec_add( v0, vec_sld( v0, v0, 4 ) );
|
|
runningTotal1 = vec_splat( v1, 0 );
|
|
vec_ste( runningTotal1, 0, &dot );
|
|
}
|
|
|
|
//handle cleanup. when profiling the game, we found that most of the counts to this function were small, so it
|
|
// spends a lot of time in this scalar code. It's already really really fast (eg 1 TB tick) for scalar code for
|
|
// counts less than 50, so not much point in trying to get vector code in on the action
|
|
for ( ; i < count ; i++ ) {
|
|
dot += src1[i] * src2[i];
|
|
}
|
|
|
|
}
|
|
#endif /* ENABLE_DOT */
|
|
|
|
#ifdef ENABLE_COMPARES
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGT
|
|
|
|
dst[i] = src0[i] > constant;
|
|
============
|
|
*/
|
|
|
|
void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src0[(X)] > constant;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector bool int vr1, vr2, vr3, vr4;
|
|
register vector bool short vs1, vs2;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
|
|
register vector unsigned char vc1;
|
|
register vector bool char vbc1;
|
|
register vector float constVec;
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
register vector unsigned char permVec;
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = src0[i] > constant;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
v3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
// load values
|
|
v0_low = v3_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src0[i] );
|
|
v2_low = v1_hi;
|
|
v2_hi = vec_ld( 47, &src0[i] );
|
|
v3_low = v2_hi;
|
|
v3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec );
|
|
|
|
//do comparison
|
|
vr1 = vec_cmpgt( v0, constVec );
|
|
vr2 = vec_cmpgt( v1, constVec );
|
|
vr3 = vec_cmpgt( v2, constVec );
|
|
vr4 = vec_cmpgt( v3, constVec );
|
|
|
|
// pack results into shorts
|
|
vs1 = vec_pack(vr1, vr2);
|
|
vs2 = vec_pack(vr3, vr4);
|
|
|
|
// pack results into byte
|
|
vbc1 = vec_pack(vs1, vs2);
|
|
|
|
//AND with 1 to get true=1 not true=255
|
|
vc1 = vec_and( vbc1, oneVector );
|
|
|
|
//store results
|
|
vec_st( vc1, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] > constant;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGT
|
|
|
|
dst[i] |= ( src0[i] > constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] |= ( src0[(X)] > constant ) << bitNum;
|
|
|
|
// Temp vector registers
|
|
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
|
|
register vector bool short vtbs0, vtbs1;
|
|
register vector bool char vtbc0;
|
|
register vector unsigned char vtuc0;
|
|
register vector unsigned char permVec, permVec2;
|
|
|
|
// dest vectors
|
|
register vector unsigned char vd;
|
|
// bitNum vectors
|
|
register vector unsigned char bitNumVec;
|
|
// src0 vectors
|
|
register vector float vs0, vs1, vs2, vs3;
|
|
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
|
|
// constant vector
|
|
register vector float constVec;
|
|
// all one's
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] |= ( src0[i] > constant ) << bitNum;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//bitNum is unaligned.
|
|
permVec2 = vec_lvsl( 0, &bitNum );
|
|
vtuc0 = vec_ld( 0, &bitNum );
|
|
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
|
|
bitNumVec = vec_splat( bitNumVec, 0 );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
vs3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
//load sources (floats)
|
|
vs0_low = vs3_hi;
|
|
vs0_hi = vec_ld( 15, &src0[i] );
|
|
vs1_low = vs0_hi;
|
|
vs1_hi = vec_ld( 31, &src0[i] );
|
|
vs2_low = vs1_hi;
|
|
vs2_hi = vec_ld( 47, &src0[i] );
|
|
vs3_low = vs2_hi;
|
|
vs3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
|
|
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
|
|
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
|
|
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
|
|
|
|
//load dest (bytes) as unsigned char
|
|
vd = vec_ld( 0, &dst[i] );
|
|
|
|
// do comparison and get bool int result
|
|
vtbi0 = vec_cmpgt( vs0, constVec );
|
|
vtbi1 = vec_cmpgt( vs1, constVec );
|
|
vtbi2 = vec_cmpgt( vs2, constVec );
|
|
vtbi3 = vec_cmpgt( vs3, constVec );
|
|
|
|
// pack results into shorts
|
|
vtbs0 = vec_pack(vtbi0, vtbi1);
|
|
vtbs1 = vec_pack(vtbi2, vtbi3);
|
|
|
|
// pack results into byte
|
|
vtbc0 = vec_pack(vtbs0, vtbs1);
|
|
|
|
//and with 1 to get true=1 instead of true=255
|
|
vtuc0 = vec_and(vtbc0, oneVector);
|
|
vtuc0 = vec_sl(vtuc0, bitNumVec );
|
|
|
|
//or with original
|
|
vd = vec_or( vd, vtuc0 );
|
|
|
|
vec_st( vd, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] |= ( src0[i] > constant ) << bitNum;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGE
|
|
|
|
dst[i] = src0[i] >= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector bool int vr1, vr2, vr3, vr4;
|
|
register vector bool short vs1, vs2;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
|
|
register vector unsigned char vc1;
|
|
register vector bool char vbc1;
|
|
register vector float constVec;
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
register vector unsigned char permVec;
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = src0[i] >= constant;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
v3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
// load values
|
|
v0_low = v3_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src0[i] );
|
|
v2_low = v1_hi;
|
|
v2_hi = vec_ld( 47, &src0[i] );
|
|
v3_low = v2_hi;
|
|
v3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec );
|
|
|
|
//do comparison
|
|
vr1 = vec_cmpge( v0, constVec );
|
|
vr2 = vec_cmpge( v1, constVec );
|
|
vr3 = vec_cmpge( v2, constVec );
|
|
vr4 = vec_cmpge( v3, constVec );
|
|
|
|
// pack results into shorts
|
|
vs1 = vec_pack(vr1, vr2);
|
|
vs2 = vec_pack(vr3, vr4);
|
|
|
|
// pack results into byte
|
|
vbc1 = vec_pack(vs1, vs2);
|
|
|
|
//AND with 1 to get true=1 not true=255
|
|
vc1 = vec_and( vbc1, oneVector );
|
|
|
|
//store results
|
|
vec_st( vc1, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] >= constant;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGE
|
|
|
|
dst[i] |= ( src0[i] >= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
|
|
register vector bool short vtbs0, vtbs1;
|
|
register vector bool char vtbc0;
|
|
register vector unsigned char vtuc0;
|
|
register vector unsigned char permVec, permVec2;
|
|
|
|
// dest vectors
|
|
register vector unsigned char vd;
|
|
// bitNum vectors
|
|
register vector unsigned char bitNumVec;
|
|
// src0 vectors
|
|
register vector float vs0, vs1, vs2, vs3;
|
|
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
|
|
// constant vector
|
|
register vector float constVec;
|
|
// all one's
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] |= ( src0[i] >= constant ) << bitNum;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//bitNum is unaligned.
|
|
permVec2 = vec_lvsl( 0, &bitNum );
|
|
vtuc0 = vec_ld( 0, &bitNum );
|
|
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
|
|
bitNumVec = vec_splat( bitNumVec, 0 );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
vs3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
//load sources (floats)
|
|
vs0_low = vs3_hi;
|
|
vs0_hi = vec_ld( 15, &src0[i] );
|
|
vs1_low = vs0_hi;
|
|
vs1_hi = vec_ld( 31, &src0[i] );
|
|
vs2_low = vs1_hi;
|
|
vs2_hi = vec_ld( 47, &src0[i] );
|
|
vs3_low = vs2_hi;
|
|
vs3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
|
|
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
|
|
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
|
|
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
|
|
|
|
//load dest (bytes) as unsigned char
|
|
vd = vec_ld( 0, &dst[i] );
|
|
|
|
// do comparison and get bool int result
|
|
vtbi0 = vec_cmpge( vs0, constVec );
|
|
vtbi1 = vec_cmpge( vs1, constVec );
|
|
vtbi2 = vec_cmpge( vs2, constVec );
|
|
vtbi3 = vec_cmpge( vs3, constVec );
|
|
|
|
// pack results into shorts
|
|
vtbs0 = vec_pack(vtbi0, vtbi1);
|
|
vtbs1 = vec_pack(vtbi2, vtbi3);
|
|
|
|
// pack results into byte
|
|
vtbc0 = vec_pack(vtbs0, vtbs1);
|
|
|
|
//and with 1L to get true=1 instead of true=255
|
|
vtuc0 = vec_and(vtbc0, oneVector);
|
|
vtuc0 = vec_sl(vtuc0, bitNumVec );
|
|
|
|
//or with original
|
|
vd = vec_or( vd, vtuc0 );
|
|
|
|
vec_st( vd, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] |= ( src0[i] >= constant ) << bitNum;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLT
|
|
|
|
dst[i] = src0[i] < constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src0[(X)] < constant;
|
|
register vector float v0, v1, v2, v3;
|
|
register vector bool int vr1, vr2, vr3, vr4;
|
|
register vector bool short vs1, vs2;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
|
|
register vector unsigned char vc1;
|
|
register vector bool char vbc1;
|
|
register vector float constVec;
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
register vector unsigned char permVec;
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = src0[i] < constant;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
v3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
// load values
|
|
v0_low = v3_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src0[i] );
|
|
v2_low = v1_hi;
|
|
v2_hi = vec_ld( 47, &src0[i] );
|
|
v3_low = v2_hi;
|
|
v3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec );
|
|
|
|
//do comparison
|
|
vr1 = vec_cmplt( v0, constVec );
|
|
vr2 = vec_cmplt( v1, constVec );
|
|
vr3 = vec_cmplt( v2, constVec );
|
|
vr4 = vec_cmplt( v3, constVec );
|
|
|
|
// pack results into shorts
|
|
vs1 = vec_pack(vr1, vr2);
|
|
vs2 = vec_pack(vr3, vr4);
|
|
|
|
// pack results into byte
|
|
vbc1 = vec_pack(vs1, vs2);
|
|
|
|
//AND with 1 to get true=1 not true=255
|
|
vc1 = vec_and( vbc1, oneVector );
|
|
|
|
//store results
|
|
vec_st( vc1, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] < constant;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLT
|
|
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] |= ( src0[(X)] < constant ) << bitNum;
|
|
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
|
|
register vector bool short vtbs0, vtbs1;
|
|
register vector bool char vtbc0;
|
|
register vector unsigned char vtuc0;
|
|
register vector unsigned char permVec, permVec2;
|
|
|
|
// dest vectors
|
|
register vector unsigned char vd;
|
|
// bitNum vectors
|
|
register vector unsigned char bitNumVec;
|
|
// src0 vectors
|
|
register vector float vs0, vs1, vs2, vs3;
|
|
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
|
|
// constant vector
|
|
register vector float constVec;
|
|
// all one's
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//bitNum is unaligned.
|
|
permVec2 = vec_lvsl( 0, &bitNum );
|
|
vtuc0 = vec_ld( 0, &bitNum );
|
|
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
|
|
bitNumVec = vec_splat( bitNumVec, 0 );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
vs3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
//load sources (floats)
|
|
vs0_low = vs3_hi;
|
|
vs0_hi = vec_ld( 15, &src0[i] );
|
|
vs1_low = vs0_hi;
|
|
vs1_hi = vec_ld( 31, &src0[i] );
|
|
vs2_low = vs1_hi;
|
|
vs2_hi = vec_ld( 47, &src0[i] );
|
|
vs3_low = vs2_hi;
|
|
vs3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
|
|
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
|
|
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
|
|
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
|
|
|
|
//load dest (bytes) as unsigned char
|
|
vd = vec_ld( 0, &dst[i] );
|
|
|
|
// do comparison and get bool int result
|
|
vtbi0 = vec_cmplt( vs0, constVec );
|
|
vtbi1 = vec_cmplt( vs1, constVec );
|
|
vtbi2 = vec_cmplt( vs2, constVec );
|
|
vtbi3 = vec_cmplt( vs3, constVec );
|
|
|
|
// pack results into shorts
|
|
vtbs0 = vec_pack(vtbi0, vtbi1);
|
|
vtbs1 = vec_pack(vtbi2, vtbi3);
|
|
|
|
// pack results into byte
|
|
vtbc0 = vec_pack(vtbs0, vtbs1);
|
|
|
|
//and with 1L to get true=1 instead of true=255
|
|
vtuc0 = vec_and(vtbc0, oneVector);
|
|
vtuc0 = vec_sl(vtuc0, bitNumVec );
|
|
|
|
//or with original
|
|
vd = vec_or( vd, vtuc0 );
|
|
|
|
vec_st( vd, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
}
|
|
|
|
}
|
|
//#endif
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLE
|
|
|
|
dst[i] = src0[i] <= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src0[(X)] <= constant;
|
|
register vector float v0, v1, v2, v3;
|
|
register vector bool int vr1, vr2, vr3, vr4;
|
|
register vector bool short vs1, vs2;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi, v2_low, v2_hi, v3_low, v3_hi;
|
|
register vector unsigned char vc1;
|
|
register vector bool char vbc1;
|
|
register vector float constVec;
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
register vector unsigned char permVec;
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = src0[i] <= constant;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
v3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
// load values
|
|
v0_low = v3_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src0[i] );
|
|
v2_low = v1_hi;
|
|
v2_hi = vec_ld( 47, &src0[i] );
|
|
v3_low = v2_hi;
|
|
v3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec );
|
|
|
|
//do comparison
|
|
vr1 = vec_cmple( v0, constVec );
|
|
vr2 = vec_cmple( v1, constVec );
|
|
vr3 = vec_cmple( v2, constVec );
|
|
vr4 = vec_cmple( v3, constVec );
|
|
|
|
// pack results into shorts
|
|
vs1 = vec_pack(vr1, vr2);
|
|
vs2 = vec_pack(vr3, vr4);
|
|
|
|
// pack results into byte
|
|
vbc1 = vec_pack(vs1, vs2);
|
|
|
|
//AND with 1 to get true=1 not true=255
|
|
vc1 = vec_and( vbc1, oneVector );
|
|
|
|
//store results
|
|
vec_st( vc1, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] <= constant;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLE
|
|
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
|
|
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
|
|
register vector bool short vtbs0, vtbs1;
|
|
register vector bool char vtbc0;
|
|
register vector unsigned char vtuc0;
|
|
register vector unsigned char permVec, permVec2;
|
|
|
|
// dest vectors
|
|
register vector unsigned char vd;
|
|
// bitNum vectors
|
|
register vector unsigned char bitNumVec;
|
|
// src0 vectors
|
|
register vector float vs0, vs1, vs2, vs3;
|
|
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
|
|
// constant vector
|
|
register vector float constVec;
|
|
// all one's
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//bitNum is unaligned.
|
|
permVec2 = vec_lvsl( 0, &bitNum );
|
|
vtuc0 = vec_ld( 0, &bitNum );
|
|
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
|
|
bitNumVec = vec_splat( bitNumVec, 0 );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
vs3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
//load sources (floats)
|
|
vs0_low = vs3_hi;
|
|
vs0_hi = vec_ld( 15, &src0[i] );
|
|
vs1_low = vs0_hi;
|
|
vs1_hi = vec_ld( 31, &src0[i] );
|
|
vs2_low = vs1_hi;
|
|
vs2_hi = vec_ld( 47, &src0[i] );
|
|
vs3_low = vs2_hi;
|
|
vs3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
|
|
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
|
|
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
|
|
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
|
|
|
|
//load dest (bytes) as unsigned char
|
|
vd = vec_ld( 0, &dst[i] );
|
|
|
|
// do comparison and get bool int result
|
|
vtbi0 = vec_cmple( vs0, constVec );
|
|
vtbi1 = vec_cmple( vs1, constVec );
|
|
vtbi2 = vec_cmple( vs2, constVec );
|
|
vtbi3 = vec_cmple( vs3, constVec );
|
|
|
|
// pack results into shorts
|
|
vtbs0 = vec_pack(vtbi0, vtbi1);
|
|
vtbs1 = vec_pack(vtbi2, vtbi3);
|
|
|
|
// pack results into byte
|
|
vtbc0 = vec_pack(vtbs0, vtbs1);
|
|
|
|
//and with 1L to get true=1 instead of true=255
|
|
vtuc0 = vec_and(vtbc0, oneVector);
|
|
vtuc0 = vec_sl(vtuc0, bitNumVec );
|
|
|
|
//or with original
|
|
vd = vec_or( vd, vtuc0 );
|
|
|
|
vec_st( vd, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
}
|
|
}
|
|
#endif /* ENABLE_COMPARES */
|
|
|
|
#ifdef ENABLE_MINMAX
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
|
|
min = idMath::INFINITY; max = -idMath::INFINITY;
|
|
//#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float maxVec, minVec, tempMin, tempMax;
|
|
register vector unsigned char permVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
//calculate permute and do first load to
|
|
//get a starting point for min and max
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
|
|
v1_hi = vec_ld( 0, &src[0] );
|
|
|
|
maxVec = loadSplatUnalignedScalar( &max );
|
|
minVec = loadSplatUnalignedScalar( &min );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load sources
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
// minimum
|
|
v2 = vec_min( v0, v1 );
|
|
minVec = vec_min( minVec, v2 );
|
|
// maximum
|
|
v3 = vec_max( v0, v1 );
|
|
maxVec = vec_max( maxVec, v3 );
|
|
}
|
|
|
|
//minVec and maxVec hold the min/max elements from the array, but now
|
|
//we need to figure out which particular element it is
|
|
|
|
tempMin = minVec;
|
|
tempMax = maxVec;
|
|
|
|
// rotate vector around and compare to itself to find the real min/max
|
|
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
|
|
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
|
|
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
|
|
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
|
|
minVec = vec_splat( tempMin, 0 );
|
|
maxVec = vec_splat( tempMax, 0 );
|
|
vec_ste( minVec, 0, &min );
|
|
vec_ste( maxVec, 0, &max );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < count; i++ ) {
|
|
if ( src[i] < min ) {
|
|
min = src[i];
|
|
}
|
|
if ( src[i] > max ) {
|
|
max = src[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
|
|
min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
|
|
//#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
|
|
|
|
idVec2 v;
|
|
int i = 0;
|
|
int j;
|
|
|
|
const float *srcPtr = src[0].ToFloatPtr();
|
|
register vector float vecLd1, vecLd2, vecLd3, vecLd4;
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
|
|
if ( count > 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, srcPtr );
|
|
|
|
for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
|
|
// load data
|
|
float *vecPtr = (float*)( srcPtr + (j*4) );
|
|
vector float v0, v1, v2, v3;
|
|
|
|
v0 = vecOld;
|
|
v1 = vec_ld( 15, vecPtr );
|
|
v2 = vec_ld( 31, vecPtr );
|
|
v3 = vec_ld( 47, vecPtr );
|
|
vecOld = vec_ld( 63, vecPtr );
|
|
|
|
vecLd1 = vec_perm( v0, v1, permVec );
|
|
vecLd2 = vec_perm( v1, v2, permVec );
|
|
vecLd3 = vec_perm( v2, v3, permVec );
|
|
vecLd4 = vec_perm( v3, vecOld, permVec );
|
|
|
|
// each of these vectors contains 2 elements
|
|
// looks like | X Y X Y | X Y X Y
|
|
v0 = vec_min( vecLd1, vecLd2 );
|
|
v1 = vec_min( vecLd3, vecLd4 );
|
|
v0 = vec_min( v0, v1 );
|
|
|
|
v2 = vec_max( vecLd1, vecLd2 );
|
|
v3 = vec_max( vecLd3, vecLd4 );
|
|
v2 = vec_max( v2, v3 );
|
|
|
|
// since its always X Y X Y we don't have to re-merge each time. we can wait
|
|
// until the end
|
|
vecMin = vec_min( v0, vecMin );
|
|
vecMax = vec_max( v2, vecMax );
|
|
}
|
|
|
|
vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
|
|
vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMax, 0 );
|
|
v3 = vec_splat( vecMax, 1 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &max[0] );
|
|
vec_ste( v3, 0, &max[1] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[i];
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
//#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
|
|
|
|
int i = 0;
|
|
const float *srcPtr = src[0].ToFloatPtr();
|
|
idVec3 v;
|
|
|
|
register vector float vecLd1, vecLd2, vecLd3;
|
|
register vector float vecMin, vecMax;
|
|
register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, srcPtr );
|
|
|
|
// 4 elements at a time
|
|
for ( ; i+3 < count; i += 4 ) {
|
|
float *vecPtr = (float*)( srcPtr + (i*3) );
|
|
vector float v0, v1, v2;
|
|
|
|
v0 = vecOld;
|
|
v1 = vec_ld( 15, vecPtr );
|
|
v2 = vec_ld( 31, vecPtr );
|
|
vecOld = vec_ld( 47, vecPtr );
|
|
|
|
vecLd1 = vec_perm( v0, v1, permVec );
|
|
vecLd2 = vec_perm( v1, v2, permVec );
|
|
vecLd3 = vec_perm( v2, vecOld, permVec );
|
|
|
|
// put each idVec3 into its own vector as X Y Z (crap)
|
|
vecSrc1 = vecLd1;
|
|
vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
|
|
vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
|
|
vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
|
|
|
|
// do min and max
|
|
vecMin1 = vec_min( vecSrc1, vecSrc2 );
|
|
vecMin2 = vec_min( vecSrc3, vecSrc4 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( vecSrc1, vecSrc2 );
|
|
vecMax2 = vec_max( vecSrc3, vecSrc4 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax1, vecMax );
|
|
}
|
|
|
|
// store results
|
|
vector float v0, v1, v2, v3, v4, v5;
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i ++ ) {
|
|
v = src[i];
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
|
|
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
idVec3 v;
|
|
int i = 0;
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 11, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 11, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 11, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 11, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v2 = vec_perm( v2, v3, vertPerm2 );
|
|
v4 = vec_perm( v4, v5, vertPerm3 );
|
|
v6 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[i].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
|
|
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
idVec3 v;
|
|
int i = 0;
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[i].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
|
|
idVec3 v;
|
|
int i = 0;
|
|
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char vertPerm1;
|
|
vector unsigned char vertPerm2;
|
|
vector unsigned char vertPerm3;
|
|
vector unsigned char vertPerm4;
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
|
|
|
|
vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v2 = vec_perm( v2, v3, vertPerm2 );
|
|
v4 = vec_perm( v4, v5, vertPerm3 );
|
|
v6 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[indexes[i]].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
|
|
idVec3 v;
|
|
int i = 0;
|
|
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char vertPerm1;
|
|
vector unsigned char vertPerm2;
|
|
vector unsigned char vertPerm3;
|
|
vector unsigned char vertPerm4;
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[indexes[i]].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#endif /* ENABLE_MINMAX */
|
|
|
|
#ifdef ENABLE_CLAMP
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Clamp
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
register vector unsigned char permVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
vector unsigned char oneVector = (vector unsigned char)(1);
|
|
register vector float minVec, maxVec;
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
|
|
}
|
|
|
|
//splat min/max into a vector
|
|
minVec = loadSplatUnalignedScalar( &min );
|
|
maxVec = loadSplatUnalignedScalar( &max );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
//apply minimum
|
|
v2 = vec_max( v0, minVec );
|
|
v3 = vec_max( v1, minVec );
|
|
|
|
//apply maximum
|
|
v4 = vec_min( v2, maxVec );
|
|
v5 = vec_min( v3, maxVec );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ClampMin
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
|
|
register vector float v0, v1, v2, v3;
|
|
register vector unsigned char permVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
register vector float constVec;
|
|
vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &min );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
v2 = vec_max( v0, constVec );
|
|
v3 = vec_max( v1, constVec );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ClampMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
|
|
register vector float v0, v1, v2, v3;
|
|
register vector unsigned char permVec;
|
|
register vector float constVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src[i] < max ? max : src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &max );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
v2 = vec_min( v0, constVec );
|
|
v3 = vec_min( v1, constVec );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src[i] < max ? max : src[i];
|
|
}
|
|
}
|
|
|
|
#endif /* ENABLE_CLAMP */
|
|
|
|
#ifdef ENABLE_16ROUTINES
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Zero16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
|
|
memset( dst, 0, count * sizeof( float ) );
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Negate16
|
|
|
|
Assumptions:
|
|
dst is aligned
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
|
|
//#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
int i = 0;
|
|
vector float v0, v1, v2, v3;
|
|
|
|
//know its 16-byte aligned
|
|
for ( ; i + 7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_ld( 16, &dst[i] );
|
|
|
|
v2 = vec_sub( (vector float)(0), v0 );
|
|
v3 = vec_sub( (vector float)(0), v1 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_sub( (vector float)(0), v0 );
|
|
vec_st( v1, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Copy16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)]
|
|
memcpy( dst, src, sizeof(float) * count );
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Add16
|
|
|
|
Assumptions:
|
|
Assumes dst, src1, src2 all start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src1 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src1[0] ) );
|
|
// src2 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src2[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i = 0;
|
|
|
|
//know all data is 16-byte aligned, so vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
//load sources
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 16, &src1[i] );
|
|
v2 = vec_ld( 0, &src2[i] );
|
|
v3 = vec_ld( 16, &src2[i] );
|
|
v4 = vec_add( v0, v2 );
|
|
v5 = vec_add( v1, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 0, &src2[i] );
|
|
v2 = vec_add( v0, v1 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Sub16
|
|
|
|
Assumptions:
|
|
Assumes that dst, src1, and src2 all start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src1 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src1[0] ) );
|
|
// src2 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src2[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i = 0;
|
|
|
|
//know data is aligned, so vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
//load sources
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 16, &src1[i] );
|
|
v2 = vec_ld( 0, &src2[i] );
|
|
v3 = vec_ld( 16, &src2[i] );
|
|
v4 = vec_sub( v0, v2 );
|
|
v5 = vec_sub( v1, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 0, &src2[i] );
|
|
v2 = vec_sub( v0, v1 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Mul16
|
|
|
|
Assumptions:
|
|
Assumes that dst and src1 start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src1[(X)] * constant
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src1 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src1[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float constVec;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
int i = 0;
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//know data is aligned, so vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
//load source
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 16, &src1[i] );
|
|
v2 = vec_madd( constVec, v0, zeroVector );
|
|
v3 = vec_madd( constVec, v1, zeroVector );
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_madd( constVec, v0, zeroVector );
|
|
vec_st( v1, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::AddAssign16
|
|
|
|
Assumptions:
|
|
Assumes that dst and src start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] += src[(X)]
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src is aligned
|
|
assert( IS_16BYTE_ALIGNED( src[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i = 0;
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 16, &src[i] );
|
|
v2 = vec_ld( 0, &dst[i] );
|
|
v3 = vec_ld( 16, &dst[i] );
|
|
v4 = vec_add( v0, v2 );
|
|
v5 = vec_add( v1, v3 );
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 0, &dst[i] );
|
|
v2 = vec_add( v0, v1 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::SubAssign16
|
|
|
|
Assumptions:
|
|
Assumes that dst and src start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] -= src[(X)]
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i=0;
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src is aligned
|
|
assert( IS_16BYTE_ALIGNED( src[0] ) );
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 16, &src[i] );
|
|
v2 = vec_ld( 0, &dst[i] );
|
|
v3 = vec_ld( 16, &dst[i] );
|
|
v4 = vec_sub( v2, v0 );
|
|
v5 = vec_sub( v3, v1 );
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 0, &dst[i] );
|
|
v2 = vec_sub( v1, v0 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulAssign16
|
|
|
|
Assumptions:
|
|
Assumes that dst starts at aligned address and count is multiple of 4
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] *= constant
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float constVec;
|
|
int i = 0;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_ld( 16, &dst[i] );
|
|
v2 = vec_madd( v0, constVec, zeroVector );
|
|
v3 = vec_madd( v1, constVec, zeroVector );
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_madd( v0, constVec, zeroVector );
|
|
vec_st( v1, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
#endif /* ENABLE_16ROUTINES */
|
|
|
|
#ifdef ENABLE_LOWER_TRIANGULAR
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MatX_LowerTriangularSolve
|
|
|
|
solves x in L * x = b for the first n rows of L
|
|
if skip > 0 the first skip elements of x are assumed to be valid already
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
|
|
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
|
|
|
|
int i, j;
|
|
const float *lptr;
|
|
const float *lptr2;
|
|
const float *lptr3;
|
|
const float *lptr4;
|
|
float sum;
|
|
float sum2;
|
|
float sum3;
|
|
float sum4;
|
|
float tempSum;
|
|
float tempSum2;
|
|
float tempSum3;
|
|
float tempSum4;
|
|
vector float vecSum1 = (vector float)(0.0);
|
|
vector float vecSum2 = (vector float)(0.0);
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
|
|
vector float zeroVector = (vector float)(0.0);
|
|
vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
|
|
|
|
vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
|
|
|
|
// unrolled this loop a bit
|
|
for ( i = skip; i+3 < n; i+=4 ) {
|
|
sum = b[i];
|
|
sum2 = b[i+1];
|
|
sum3 = b[i+2];
|
|
sum4 = b[i+3];
|
|
|
|
vecSum1 = zeroVector;
|
|
vecSum2 = zeroVector;
|
|
vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
|
|
lptr = L[i];
|
|
lptr2 = L[i+1];
|
|
lptr3 = L[i+2];
|
|
lptr4 = L[i+3];
|
|
|
|
vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
|
|
vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
|
|
vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
|
|
vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
|
|
|
|
for ( j = 0 ; j+7 < i; j+=8 ) {
|
|
|
|
v0 = vec_ld( 0, &x[j] );
|
|
v1 = vec_ld( 15, &x[j] );
|
|
vector float vecExtraX = vec_ld( 31, &x[j] );
|
|
v0 = vec_perm( v0, v1, vecPermX );
|
|
v1 = vec_perm( v1, vecExtraX, vecPermX );
|
|
|
|
v2 = vec_ld( 0, lptr + j );
|
|
v3 = vec_ld( 15, lptr + j );
|
|
vector float vecExtra1 = vec_ld( 31, lptr + j );
|
|
v2 = vec_perm( v2, v3, vecPermLptr1 );
|
|
v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
|
|
|
|
v4 = vec_ld( 0, lptr2 + j );
|
|
v5 = vec_ld( 15, lptr2 + j );
|
|
vector float vecExtra2 = vec_ld( 31, lptr2 + j );
|
|
v4 = vec_perm( v4, v5, vecPermLptr2 );
|
|
v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
|
|
|
|
v6 = vec_ld( 0, lptr3 + j );
|
|
v7 = vec_ld( 15, lptr3 + j );
|
|
vector float vecExtra3 = vec_ld( 31, lptr3 + j );
|
|
v6 = vec_perm( v6, v7, vecPermLptr3 );
|
|
v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
|
|
|
|
v8 = vec_ld( 0, lptr4 + j );
|
|
v9 = vec_ld( 15, lptr4 + j );
|
|
vector float vecExtra4 = vec_ld( 31, lptr4 + j );
|
|
v8 = vec_perm( v8, v9, vecPermLptr4 );
|
|
v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
|
|
|
|
vecSum1 = vec_madd( v2, v0, vecSum1 );
|
|
vecSum2 = vec_madd( v3, v1, vecSum2 );
|
|
|
|
vecSum3 = vec_madd( v4, v0, vecSum3 );
|
|
vecSum4 = vec_madd( v5, v1, vecSum4 );
|
|
|
|
vecSum5 = vec_madd( v6, v0, vecSum5 );
|
|
vecSum6 = vec_madd( v7, v1, vecSum6 );
|
|
|
|
vecSum7 = vec_madd( v8, v0, vecSum7 );
|
|
vecSum8 = vec_madd( v9, v1, vecSum8 );
|
|
}
|
|
|
|
// if we ran the unrolled code, we need to sum accross the vectors
|
|
// to find out how much to subtract from sum
|
|
if ( j > 0 ) {
|
|
vecSum1 = vec_add( vecSum1, vecSum2 );
|
|
vecSum3 = vec_add( vecSum3, vecSum4 );
|
|
vecSum5 = vec_add( vecSum5, vecSum6 );
|
|
vecSum7 = vec_add( vecSum7, vecSum8 );
|
|
//sum accross the vectors
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
|
|
|
|
vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
|
|
vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
|
|
|
|
vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
|
|
vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
|
|
|
|
vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
|
|
vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
|
|
|
|
//move the result to the FPU
|
|
vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
|
|
vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
|
|
vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
|
|
vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
|
|
|
|
sum -= tempSum;
|
|
sum2 -= tempSum2;
|
|
sum3 -= tempSum3;
|
|
sum4 -= tempSum4;
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; j < i; j++ ) {
|
|
sum -= lptr[j] * x[j];
|
|
sum2 -= lptr2[j] * x[j];
|
|
sum3 -= lptr3[j] * x[j];
|
|
sum4 -= lptr4[j] * x[j];
|
|
}
|
|
|
|
// store the 4 results at a time
|
|
sum2 -= ( lptr2[i] * sum );
|
|
sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
|
|
sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
|
|
|
|
x[i] = sum;
|
|
x[i+1] = sum2;
|
|
x[i+2] = sum3;
|
|
x[i+3] = sum4;
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < n; i++ ) {
|
|
sum = b[i];
|
|
vecSum1 = zeroVector;
|
|
vecSum2 = zeroVector;
|
|
lptr = L[i];
|
|
vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
|
|
|
|
for ( j = 0 ; j+7 < i; j+=8 ) {
|
|
|
|
v0 = vec_ld( 0, &x[j] );
|
|
v2 = vec_ld( 15, &x[j] );
|
|
vector float vecExtraX = vec_ld( 31, &x[j] );
|
|
v0 = vec_perm( v0, v2, vecPermX );
|
|
v2 = vec_perm( v2, vecExtraX, vecPermX );
|
|
|
|
v1 = vec_ld( 0, lptr + j );
|
|
v3 = vec_ld( 15, lptr + j );
|
|
vector float vecExtra = vec_ld( 31, lptr + j );
|
|
v1 = vec_perm( v1, v3, vecPermLptr );
|
|
v3 = vec_perm( v3, vecExtra, vecPermLptr );
|
|
|
|
vecSum1 = vec_madd( v1, v0, vecSum1 );
|
|
vecSum2 = vec_madd( v3, v2, vecSum2 );
|
|
}
|
|
|
|
// if we ran the unrolled code, we need to sum accross the vectors
|
|
// to find out how much to subtract from sum
|
|
if ( j > 0 ) {
|
|
//sum accross the vectors
|
|
vecSum1 = vec_add( vecSum1, vecSum2 );
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
|
|
|
|
//move the result to the FPU
|
|
vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
|
|
sum -= tempSum;
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; j < i; j++ ) {
|
|
sum -= lptr[j] * x[j];
|
|
}
|
|
x[i] = sum;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
|
|
|
|
solves x in L.Transpose() * x = b for the first n rows of L
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
|
|
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
lptr = L.ToFloatPtr();
|
|
nc = L.GetNumColumns();
|
|
|
|
float x0, x1, x2, x3, x4, x5, x6;
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
switch( n ) {
|
|
// using local variables to avoid aliasing issues
|
|
case 0:
|
|
return;
|
|
case 1:
|
|
x[0] = b[0];
|
|
return;
|
|
case 2:
|
|
x1 = b[1];
|
|
x0 = b[0] - lptr[1*nc+0] * x1;
|
|
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
case 3:
|
|
x2 = b[2];
|
|
x1 = b[1] - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
case 4:
|
|
x3 = b[3];
|
|
x2 = b[2] - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
|
|
return;
|
|
case 5:
|
|
x4 = b[4];
|
|
x3 = b[3] - lptr[4*nc+3] * x4;
|
|
x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[4] = x4;
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
case 6:
|
|
x5 = b[5];
|
|
x4 = b[4] - lptr[5*nc+4] * x5;
|
|
x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
|
|
x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[5] = x5;
|
|
x[4] = x4;
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
|
|
return;
|
|
case 7:
|
|
x6 = b[6];
|
|
x5 = b[5] - lptr[6*nc+5] * x6;
|
|
x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
|
|
x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
|
|
x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[6] = x6;
|
|
x[5] = x5;
|
|
x[4] = x4;
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
int i, j;
|
|
register float s0, s1, s2, s3;
|
|
float *xptr;
|
|
|
|
lptr = L.ToFloatPtr() + n * nc + n - 4;
|
|
xptr = x + n;
|
|
|
|
// process 4 rows at a time
|
|
for ( i = n; i >= 4; i -= 4 ) {
|
|
s0 = b[i-4];
|
|
s1 = b[i-3];
|
|
s2 = b[i-2];
|
|
s3 = b[i-1];
|
|
// process 4x4 blocks
|
|
for ( j = 0; j < n-i; j += 4 ) {
|
|
s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
|
|
s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
|
|
s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
|
|
s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
|
|
s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
|
|
s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
|
|
s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
|
|
s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
|
|
s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
|
|
s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
|
|
s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
|
|
s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
|
|
s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
|
|
s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
|
|
s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
|
|
s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
|
|
}
|
|
// process left over of the 4 rows
|
|
s0 -= lptr[0-1*nc] * s3;
|
|
s1 -= lptr[1-1*nc] * s3;
|
|
s2 -= lptr[2-1*nc] * s3;
|
|
s0 -= lptr[0-2*nc] * s2;
|
|
s1 -= lptr[1-2*nc] * s2;
|
|
s0 -= lptr[0-3*nc] * s1;
|
|
// store result
|
|
xptr[-4] = s0;
|
|
xptr[-3] = s1;
|
|
xptr[-2] = s2;
|
|
xptr[-1] = s3;
|
|
// update pointers for next four rows
|
|
lptr -= 4 + 4 * nc;
|
|
xptr -= 4;
|
|
}
|
|
// process left over rows
|
|
for ( i--; i >= 0; i-- ) {
|
|
s0 = b[i];
|
|
lptr = L[0] + i;
|
|
for ( j = i + 1; j < n; j++ ) {
|
|
s0 -= lptr[j*nc] * x[j];
|
|
}
|
|
x[i] = s0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MatX_LDLTFactor
|
|
============
|
|
*/
|
|
bool VPCALL idSIMD_AltiVec::MatX_LDLTFactor( idMatX &mat, idVecX &invDiag, const int n ) {
|
|
int i, j, k, nc;
|
|
float *v, *diag, *mptr;
|
|
float s0, s1, s2, s3, sum, d;
|
|
float s0_2, s1_2, s2_2, s3_2, sum_2;
|
|
float *mptr2;
|
|
|
|
v = (float *) _alloca16( n * sizeof( float ) );
|
|
diag = (float *) _alloca16( n * sizeof( float ) );
|
|
|
|
nc = mat.GetNumColumns();
|
|
|
|
if ( n <= 0 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
|
|
sum = mptr[0];
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
diag[0] = sum;
|
|
invDiag[0] = d = 1.0f / sum;
|
|
|
|
if ( n <= 1 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 1; j < n; j++ ) {
|
|
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[1];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
sum = mptr[1] - s0;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[1][1] = sum;
|
|
diag[1] = sum;
|
|
invDiag[1] = d = 1.0f / sum;
|
|
|
|
if ( n <= 2 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 2; j < n; j++ ) {
|
|
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[2];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
sum = mptr[2] - s0 - s1;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[2][2] = sum;
|
|
diag[2] = sum;
|
|
invDiag[2] = d = 1.0f / sum;
|
|
|
|
if ( n <= 3 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 3; j < n; j++ ) {
|
|
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
|
|
}
|
|
|
|
mptr = mat[3];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
sum = mptr[3] - s0 - s1 - s2;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[3][3] = sum;
|
|
diag[3] = sum;
|
|
invDiag[3] = d = 1.0f / sum;
|
|
|
|
if ( n <= 4 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 4; j < n; j++ ) {
|
|
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
|
|
}
|
|
|
|
for ( i = 4; i < n; i++ ) {
|
|
|
|
mptr = mat[i];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
|
|
for ( k = 4; k < i-3; k += 4 ) {
|
|
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
|
|
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
|
|
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
|
|
}
|
|
switch( i - k ) {
|
|
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
|
|
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
|
|
}
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
sum = mptr[i] - sum;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[i][i] = sum;
|
|
diag[i] = sum;
|
|
invDiag[i] = d = 1.0f / sum;
|
|
|
|
if ( i + 1 >= n ) {
|
|
return true;
|
|
}
|
|
|
|
// unrolling madness!
|
|
mptr = mat[i+1];
|
|
mptr2 = mat[i+1] + nc;
|
|
|
|
for ( j = i+1; j+1 < n; j+=2 ) {
|
|
s0 = mptr[0] * v[0];
|
|
s1 = mptr[1] * v[1];
|
|
s2 = mptr[2] * v[2];
|
|
s3 = mptr[3] * v[3];
|
|
|
|
s0_2 = mptr2[0] * v[0];
|
|
s1_2 = mptr2[1] * v[1];
|
|
s2_2 = mptr2[2] * v[2];
|
|
s3_2 = mptr2[3] * v[3];
|
|
|
|
for ( k = 4; k < i-7; k += 8 ) {
|
|
s0 += mptr[k+0] * v[k+0];
|
|
s1 += mptr[k+1] * v[k+1];
|
|
s2 += mptr[k+2] * v[k+2];
|
|
s3 += mptr[k+3] * v[k+3];
|
|
s0 += mptr[k+4] * v[k+4];
|
|
s1 += mptr[k+5] * v[k+5];
|
|
s2 += mptr[k+6] * v[k+6];
|
|
s3 += mptr[k+7] * v[k+7];
|
|
|
|
s0_2 += mptr2[k+0] * v[k+0];
|
|
s1_2 += mptr2[k+1] * v[k+1];
|
|
s2_2 += mptr2[k+2] * v[k+2];
|
|
s3_2 += mptr2[k+3] * v[k+3];
|
|
s0_2 += mptr2[k+4] * v[k+4];
|
|
s1_2 += mptr2[k+5] * v[k+5];
|
|
s2_2 += mptr2[k+6] * v[k+6];
|
|
s3_2 += mptr2[k+7] * v[k+7];
|
|
}
|
|
|
|
switch( i - k ) {
|
|
case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
|
|
case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
|
|
case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
|
|
case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
|
|
case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
|
|
case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
|
|
case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
|
|
}
|
|
// disassociate these adds
|
|
s3 += s2;
|
|
s1 += s0;
|
|
sum = s1 + s3;
|
|
|
|
s3_2 += s2_2;
|
|
s1_2 += s0_2;
|
|
sum_2 = s1_2 + s3_2;
|
|
|
|
mptr[i] = ( mptr[i] - sum ) * d;
|
|
mptr2[i] = ( mptr2[i] - sum_2 ) * d;
|
|
|
|
mptr += nc*2;
|
|
mptr2 += nc*2;
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; j < n; j++ ) {
|
|
s0 = mptr[0] * v[0];
|
|
s1 = mptr[1] * v[1];
|
|
s2 = mptr[2] * v[2];
|
|
s3 = mptr[3] * v[3];
|
|
for ( k = 4; k < i-7; k += 8 ) {
|
|
s0 += mptr[k+0] * v[k+0];
|
|
s1 += mptr[k+1] * v[k+1];
|
|
s2 += mptr[k+2] * v[k+2];
|
|
s3 += mptr[k+3] * v[k+3];
|
|
s0 += mptr[k+4] * v[k+4];
|
|
s1 += mptr[k+5] * v[k+5];
|
|
s2 += mptr[k+6] * v[k+6];
|
|
s3 += mptr[k+7] * v[k+7];
|
|
}
|
|
switch( i - k ) {
|
|
case 7: s0 += mptr[k+6] * v[k+6];
|
|
case 6: s1 += mptr[k+5] * v[k+5];
|
|
case 5: s2 += mptr[k+4] * v[k+4];
|
|
case 4: s3 += mptr[k+3] * v[k+3];
|
|
case 3: s0 += mptr[k+2] * v[k+2];
|
|
case 2: s1 += mptr[k+1] * v[k+1];
|
|
case 1: s2 += mptr[k+0] * v[k+0];
|
|
}
|
|
// disassociate these adds
|
|
s3 += s2;
|
|
s1 += s0;
|
|
sum = s1 + s3;
|
|
mptr[i] = ( mptr[i] - sum ) * d;
|
|
mptr += nc;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
#endif /* ENABLE_LOWER_TRIANGULAR */
|
|
|
|
|
|
#ifdef LIVE_VICARIOUSLY
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::BlendJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
|
|
int i;
|
|
|
|
// since lerp is a constant, we can special case the two cases if they're true
|
|
if ( lerp <= 0.0f ) {
|
|
// this sets joints back to joints. No sense in doing no work, so just return
|
|
return;
|
|
}
|
|
|
|
if ( lerp >= 1.0f ) {
|
|
// this copies each q from blendJoints to joints and copies each t from blendJoints to joints
|
|
memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
|
|
return;
|
|
}
|
|
|
|
vector float vecLerp = loadSplatUnalignedScalar( &lerp );
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
for ( i = 0; i+3 < numJoints; i+=4 ) {
|
|
int j = index[i];
|
|
int j2 = index[i+1];
|
|
int j3 = index[i+2];
|
|
int j4 = index[i+3];
|
|
|
|
// slerp
|
|
const float *jointPtr = joints[j].q.ToFloatPtr();
|
|
const float *blendPtr = blendJoints[j].q.ToFloatPtr();
|
|
const float *jointPtr2 = joints[j2].q.ToFloatPtr();
|
|
const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
|
|
const float *jointPtr3 = joints[j3].q.ToFloatPtr();
|
|
const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
|
|
const float *jointPtr4 = joints[j4].q.ToFloatPtr();
|
|
const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
|
|
|
|
vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
|
|
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
|
|
vector float v12, v13, v14, v15, v16;
|
|
vector float vecFromX, vecFromY, vecFromZ, vecFromW;
|
|
vector float vecToX, vecToY, vecToZ, vecToW;
|
|
|
|
// load up the the idJointQuats from joints
|
|
v0 = vec_ld( 0, jointPtr );
|
|
v1 = vec_ld( 15, jointPtr );
|
|
v2 = vec_perm( v0, v1, permVec );
|
|
|
|
v3 = vec_ld( 0, jointPtr2 );
|
|
v4 = vec_ld( 15, jointPtr2 );
|
|
v5 = vec_perm( v3, v4, permVec2 );
|
|
|
|
v6 = vec_ld( 0, jointPtr3 );
|
|
v7 = vec_ld( 15, jointPtr3 );
|
|
v8 = vec_perm( v6, v7, permVec3 );
|
|
|
|
v9 = vec_ld( 0, jointPtr4 );
|
|
v10 = vec_ld( 15, jointPtr4 );
|
|
v11 = vec_perm( v9, v10, permVec4 );
|
|
|
|
// planarizing, so put each x y z w into its own vector
|
|
v0 = vec_mergeh( v2, v8 );
|
|
v1 = vec_mergeh( v5, v11 );
|
|
v3 = vec_mergel( v2, v8 );
|
|
v4 = vec_mergel( v5, v11 );
|
|
|
|
vecFromX = vec_mergeh( v0, v1 );
|
|
vecFromY = vec_mergel( v0, v1 );
|
|
vecFromZ = vec_mergeh( v3, v4 );
|
|
vecFromW = vec_mergel( v3, v4 );
|
|
|
|
// load up idJointQuats from blendJoints
|
|
v5 = vec_ld( 0, blendPtr );
|
|
v6 = vec_ld( 15, blendPtr );
|
|
v7 = vec_perm( v5, v6, permVec5 );
|
|
|
|
v8 = vec_ld( 0, blendPtr2 );
|
|
v9 = vec_ld( 15, blendPtr2 );
|
|
v10 = vec_perm( v8, v9, permVec6 );
|
|
|
|
v11 = vec_ld( 0, blendPtr3 );
|
|
v12 = vec_ld( 15, blendPtr3 );
|
|
v13 = vec_perm( v11, v12, permVec7 );
|
|
|
|
v14 = vec_ld( 0, blendPtr4 );
|
|
v15 = vec_ld( 15, blendPtr4 );
|
|
v16 = vec_perm( v14, v15, permVec8 );
|
|
|
|
// put these into their own vectors too
|
|
v5 = vec_mergeh( v7, v13 );
|
|
v6 = vec_mergeh( v10, v16 );
|
|
v8 = vec_mergel( v7, v13 );
|
|
v9 = vec_mergel( v10, v16 );
|
|
|
|
vecToX = vec_mergeh( v5, v6 );
|
|
vecToY = vec_mergel( v5, v6 );
|
|
vecToZ = vec_mergeh( v8, v9 );
|
|
vecToW = vec_mergel( v8, v9 );
|
|
|
|
// calculate cosom
|
|
vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
|
|
vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
|
|
vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
|
|
vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
|
|
|
|
// if cosom is < 0, negate it and set temp to negated elements in to. otherwise, set temp to
|
|
// to
|
|
vector bool int vecCmp, vecCmp2;
|
|
vecCmp = vec_cmplt( vecCosom, zeroVector );
|
|
|
|
// negate if needed
|
|
vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
|
|
|
|
// check if we need to calculate scale
|
|
vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
|
|
vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
|
|
vector float vecScale1 = vec_splat( vecLerp, 0 );
|
|
|
|
vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
|
|
vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
|
|
vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
|
|
|
|
vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
|
|
vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
|
|
|
|
// see which ones we have to insert into our scale0 and scale1 vectors
|
|
vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
|
|
vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
|
|
|
|
// multiply each element by the scale
|
|
vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
|
|
vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
|
|
vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
|
|
vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
|
|
|
|
// multiply temp by scale and add to result
|
|
vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
|
|
vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
|
|
vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
|
|
vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
|
|
|
|
// do a transform again to get the results back to vectors we can store out
|
|
v5 = vec_mergeh( vecFromX, vecFromZ );
|
|
v6 = vec_mergeh( vecFromY, vecFromW );
|
|
v8 = vec_mergel( vecFromX, vecFromZ );
|
|
v9 = vec_mergel( vecFromY, vecFromW );
|
|
|
|
vecToX = vec_mergeh( v5, v6 );
|
|
vecToY = vec_mergel( v5, v6 );
|
|
vecToZ = vec_mergeh( v8, v9 );
|
|
vecToW = vec_mergel( v8, v9 );
|
|
|
|
vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
|
|
vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
|
|
vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
|
|
vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
|
|
|
|
// right rotate the input data
|
|
vecToX = vec_perm( vecToX, vecToX, storePerm1 );
|
|
vecToY = vec_perm( vecToY, vecToY, storePerm2 );
|
|
vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
|
|
vecToW = vec_perm( vecToW, vecToW, storePerm4 );
|
|
|
|
vec_ste( vecToX, 0, (float*) jointPtr );
|
|
vec_ste( vecToX, 4, (float*) jointPtr );
|
|
vec_ste( vecToX, 8, (float*) jointPtr );
|
|
vec_ste( vecToX, 12, (float*) jointPtr );
|
|
|
|
vec_ste( vecToY, 0, (float*) jointPtr2 );
|
|
vec_ste( vecToY, 4, (float*) jointPtr2 );
|
|
vec_ste( vecToY, 8, (float*) jointPtr2 );
|
|
vec_ste( vecToY, 12, (float*) jointPtr2 );
|
|
|
|
vec_ste( vecToZ, 0, (float*) jointPtr3 );
|
|
vec_ste( vecToZ, 4, (float*) jointPtr3 );
|
|
vec_ste( vecToZ, 8, (float*) jointPtr3 );
|
|
vec_ste( vecToZ, 12, (float*) jointPtr3 );
|
|
|
|
vec_ste( vecToW, 0, (float*) jointPtr4 );
|
|
vec_ste( vecToW, 4, (float*) jointPtr4 );
|
|
vec_ste( vecToW, 8, (float*) jointPtr4 );
|
|
vec_ste( vecToW, 12, (float*) jointPtr4 );
|
|
|
|
// lerp is v1 + l * ( v2 - v1 );
|
|
// the idVec3 T is going to be 12 bytes after the Q, so we can do this without calling ToFloatPtr() again. since its
|
|
float *jointVecPtr = (float*)( jointPtr + 4 );
|
|
float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
|
|
float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
|
|
float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
|
|
|
|
v0 = vec_ld( 0, jointVecPtr );
|
|
v1 = vec_ld( 11, jointVecPtr );
|
|
vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
|
|
|
|
v2 = vec_ld( 0, jointVecPtr2 );
|
|
v3 = vec_ld( 11, jointVecPtr2 );
|
|
vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
|
|
|
|
v4 = vec_ld( 0, jointVecPtr3 );
|
|
v5 = vec_ld( 11, jointVecPtr3 );
|
|
vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
|
|
|
|
v6 = vec_ld( 0, jointVecPtr4 );
|
|
v7 = vec_ld( 11, jointVecPtr4 );
|
|
vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
|
|
|
|
vector float vecVecX, vecVecY, vecVecZ;
|
|
vecVecX = vecVecY = vecVecZ = zeroVector;
|
|
|
|
// planarize
|
|
v0 = vec_mergeh( vecLd1, vecLd3 );
|
|
v1 = vec_mergeh( vecLd2, vecLd4 );
|
|
v3 = vec_mergel( vecLd1, vecLd3 );
|
|
v4 = vec_mergel( vecLd2, vecLd4 );
|
|
|
|
vecVecX = vec_mergeh( v0, v1 );
|
|
vecVecY = vec_mergel( v0, v1 );
|
|
vecVecZ = vec_mergeh( v3, v4 );
|
|
|
|
// load blend joint idvec3's
|
|
float *blendVecPtr = (float*)( blendPtr + 4 );
|
|
float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
|
|
float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
|
|
float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
|
|
|
|
v0 = vec_ld( 0, blendVecPtr );
|
|
v1 = vec_ld( 11, blendVecPtr );
|
|
vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
|
|
|
|
v2 = vec_ld( 0, blendVecPtr2 );
|
|
v3 = vec_ld( 11, blendVecPtr2 );
|
|
vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
|
|
|
|
v4 = vec_ld( 0, blendVecPtr3 );
|
|
v5 = vec_ld( 11, blendVecPtr3 );
|
|
vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
|
|
|
|
v6 = vec_ld( 0, blendVecPtr4 );
|
|
v7 = vec_ld( 11, blendVecPtr4 );
|
|
vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
|
|
|
|
vector float vecBlendX, vecBlendY, vecBlendZ;
|
|
vecBlendX = vecBlendY = vecBlendZ = zeroVector;
|
|
|
|
// planarize
|
|
v0 = vec_mergeh( vecLd5, vecLd7 );
|
|
v1 = vec_mergeh( vecLd6, vecLd8 );
|
|
v3 = vec_mergel( vecLd5, vecLd7 );
|
|
v4 = vec_mergel( vecLd6, vecLd8 );
|
|
|
|
vecBlendX = vec_mergeh( v0, v1 );
|
|
vecBlendY = vec_mergel( v0, v1 );
|
|
vecBlendZ = vec_mergeh( v3, v4 );
|
|
|
|
// do subtraction
|
|
vecWork1 = vec_sub( vecBlendX, vecVecX );
|
|
vecWork2 = vec_sub( vecBlendY, vecVecY );
|
|
vecWork3 = vec_sub( vecBlendZ, vecVecZ );
|
|
|
|
// multiply by lerp and add to v1
|
|
vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
|
|
vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
|
|
vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
|
|
|
|
// put it back in original form
|
|
v0 = vec_mergeh( vecVecX, vecVecZ );
|
|
v1 = vec_mergeh( vecVecY, zeroVector );
|
|
v3 = vec_mergel( vecVecX, vecVecZ );
|
|
v4 = vec_mergel( vecVecY, zeroVector );
|
|
|
|
// generate vectors to store
|
|
vecWork1 = vec_mergeh( v0, v1 );
|
|
vecWork2 = vec_mergel( v0, v1 );
|
|
vecWork3 = vec_mergeh( v3, v4 );
|
|
vector float vecWork4 = vec_mergel( v3, v4 );
|
|
|
|
// store the T values
|
|
storePerm1 = vec_lvsr( 0, jointVecPtr );
|
|
storePerm2 = vec_lvsr( 0, jointVecPtr2 );
|
|
storePerm3 = vec_lvsr( 0, jointVecPtr3 );
|
|
storePerm4 = vec_lvsr( 0, jointVecPtr4 );
|
|
|
|
// right rotate the input data
|
|
vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
|
|
vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
|
|
vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
|
|
vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
|
|
|
|
vec_ste( vecWork1, 0, (float*) jointVecPtr );
|
|
vec_ste( vecWork1, 4, (float*) jointVecPtr );
|
|
vec_ste( vecWork1, 8, (float*) jointVecPtr );
|
|
|
|
vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
|
|
vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
|
|
vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
|
|
|
|
vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
|
|
vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
|
|
vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
|
|
|
|
vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
|
|
vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
|
|
vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numJoints; i++ ) {
|
|
int j = index[i];
|
|
joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
|
|
joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ConvertJointQuatsToJointMats
|
|
============
|
|
*/
|
|
|
|
// SSE doesn't vectorize this, and I don't think we should either. It's mainly just copying data; there's very little math involved and
|
|
// it's not easily parallelizable
|
|
void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
	// Scalar conversion of numJoints joint quaternions into joint matrices.
	//
	// Each destination matrix is addressed as 12 floats laid out as three rows
	// of four: the first three entries of a row receive the rotation terms and
	// the fourth entry receives one of the three floats that follow the
	// quaternion in the source.
	//
	// NOTE(review): src[4..6] are read past the four quaternion components; this
	// relies on idJointQuat storing its translation directly after q. The type
	// is declared elsewhere -- confirm the layout there.
	int joint = 0;

	while ( joint < numJoints ) {
		const float *src = jointQuats[joint].q.ToFloatPtr();
		float *row0 = jointMats[joint].ToFloatPtr();
		float *row1 = row0 + 4;
		float *row2 = row0 + 8;

		// fourth column: the three floats stored after the quaternion
		row0[3] = src[4];
		row1[3] = src[5];
		row2[3] = src[6];

		const float qx = src[0];
		const float qy = src[1];
		const float qz = src[2];
		const float qw = src[3];

		const float twoX = qx + qx;
		const float twoY = qy + qy;
		const float twoZ = qz + qz;

		// diagonal
		const float xx = qx * twoX;
		const float yy = qy * twoY;
		const float zz = qz * twoZ;

		row0[0] = 1.0f - yy - zz;
		row1[1] = 1.0f - xx - zz;
		row2[2] = 1.0f - xx - yy;

		// off-diagonal pairs; each pair shares the same two products
		const float yz = qy * twoZ;
		const float wx = qw * twoX;

		row2[1] = yz - wx;
		row1[2] = yz + wx;

		const float xy = qx * twoY;
		const float wz = qw * twoZ;

		row1[0] = xy - wz;
		row0[1] = xy + wz;

		const float xz = qx * twoZ;
		const float wy = qw * twoY;

		row0[2] = xz - wy;
		row2[0] = xz + wy;

		joint++;
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ConvertJointMatsToJointQuats
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
	// Scalar conversion of numJoints joint matrices into joint quaternions.
	//
	// The original author's note: an AltiVec version would load far more data
	// than it uses and waste most of its math, so this stays scalar and relies
	// on FastScalarInvSqrt (an InvSqrt tuned for the G5) instead of
	// idMath::InvSqrt, which was reported to give a bit more than a 50% speedup.

	// successor[a] is the axis that follows axis a in the cycle 0 -> 1 -> 2 -> 0
	static const int successor[3] = { 1, 2, 0 };

	for ( int joint = 0; joint < numJoints; joint++ ) {
		const float *m = jointMats[joint].ToFloatPtr();
		idJointQuat result;

		const float diagSum = m[0 * 4 + 0] + m[1 * 4 + 1] + m[2 * 4 + 2];

		if ( diagSum > 0.0f ) {
			// positive trace: w is taken from the trace itself
			float t = diagSum + 1.0f;
			const float s = FastScalarInvSqrt( t ) * 0.5f;

			result.q[3] = s * t;
			result.q[0] = ( m[1 * 4 + 2] - m[2 * 4 + 1] ) * s;
			result.q[1] = ( m[2 * 4 + 0] - m[0 * 4 + 2] ) * s;
			result.q[2] = ( m[0 * 4 + 1] - m[1 * 4 + 0] ) * s;
		} else {
			// otherwise pivot on the largest diagonal entry
			int a = ( m[1 * 4 + 1] > m[0 * 4 + 0] ) ? 1 : 0;
			if ( m[2 * 4 + 2] > m[a * 4 + a] ) {
				a = 2;
			}
			const int b = successor[a];
			const int c = successor[b];

			float t = ( m[a * 4 + a] - ( m[b * 4 + b] + m[c * 4 + c] ) ) + 1.0f;
			const float s = FastScalarInvSqrt( t ) * 0.5f;

			result.q[a] = s * t;
			result.q[3] = ( m[b * 4 + c] - m[c * 4 + b] ) * s;
			result.q[b] = ( m[a * 4 + b] + m[b * 4 + a] ) * s;
			result.q[c] = ( m[a * 4 + c] + m[c * 4 + a] ) * s;
		}

		// the fourth entry of each matrix row becomes the translation
		result.t[0] = m[0 * 4 + 3];
		result.t[1] = m[1 * 4 + 3];
		result.t[2] = m[2 * 4 + 3];

		jointQuats[joint] = result;
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::TransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	// Vectorized form of, for each joint i in [firstJoint, lastJoint] ascending:
	//
	//     assert( parents[i] < i );
	//     jointMats[i] *= jointMats[parents[i]];
	//
	// The loop is deliberately not unrolled: depending on the contents of
	// parents, an iteration may consume the matrix written by the previous one.

	vector float zeroVec = (vector float)(0);
	// words 0-2 from the first operand, word 3 from the second operand
	vector unsigned char firstThreeThenLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);

	for ( int joint = firstJoint; joint <= lastJoint; joint++ ) {
		assert( parents[joint] < joint );

		float *childPtr = jointMats[joint].ToFloatPtr();
		float *parentPtr = jointMats[parents[joint]].ToFloatPtr();

		// unaligned load of the child's 12 floats as three rows of four
		vector unsigned char childAlign = vec_add( vec_lvsl( -1, childPtr ), (vector unsigned char)(1) );
		vector float childRaw0 = vec_ld( 0, childPtr );
		vector float childRaw1 = vec_ld( 15, childPtr );
		vector float childRaw2 = vec_ld( 31, childPtr );
		vector float childRaw3 = vec_ld( 47, childPtr );
		vector float childRow0 = vec_perm( childRaw0, childRaw1, childAlign );
		vector float childRow1 = vec_perm( childRaw1, childRaw2, childAlign );
		vector float childRow2 = vec_perm( childRaw2, childRaw3, childAlign );

		// same for the parent
		vector unsigned char parentAlign = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
		vector float parentRaw0 = vec_ld( 0, parentPtr );
		vector float parentRaw1 = vec_ld( 15, parentPtr );
		vector float parentRaw2 = vec_ld( 31, parentPtr );
		vector float parentRaw3 = vec_ld( 47, parentPtr );
		vector float parentRow0 = vec_perm( parentRaw0, parentRaw1, parentAlign );
		vector float parentRow1 = vec_perm( parentRaw1, parentRaw2, parentAlign );
		vector float parentRow2 = vec_perm( parentRaw2, parentRaw3, parentAlign );

		// output row r = sum over k of childRow_k * parentRow_r[k], then the
		// parent's fourth element of row r is added into the fourth lane only
		vector float outRow0 = vec_madd( childRow0, vec_splat( parentRow0, 0 ), zeroVec );
		outRow0 = vec_madd( childRow1, vec_splat( parentRow0, 1 ), outRow0 );
		outRow0 = vec_madd( childRow2, vec_splat( parentRow0, 2 ), outRow0 );
		outRow0 = vec_add( outRow0, vec_perm( zeroVec, parentRow0, firstThreeThenLast ) );

		vector float outRow1 = vec_madd( childRow0, vec_splat( parentRow1, 0 ), zeroVec );
		outRow1 = vec_madd( childRow1, vec_splat( parentRow1, 1 ), outRow1 );
		outRow1 = vec_madd( childRow2, vec_splat( parentRow1, 2 ), outRow1 );
		outRow1 = vec_add( outRow1, vec_perm( zeroVec, parentRow1, firstThreeThenLast ) );

		vector float outRow2 = vec_madd( childRow0, vec_splat( parentRow2, 0 ), zeroVec );
		outRow2 = vec_madd( childRow1, vec_splat( parentRow2, 1 ), outRow2 );
		outRow2 = vec_madd( childRow2, vec_splat( parentRow2, 2 ), outRow2 );
		outRow2 = vec_add( outRow2, vec_perm( zeroVec, parentRow2, firstThreeThenLast ) );

		// write the three rows back over the child matrix
		UNALIGNED_STORE3( (float*) childPtr, outRow0, outRow1, outRow2 );
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UntransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
	// Vectorized form of, for each joint i from lastJoint down to firstJoint:
	//
	//     assert( parents[i] < i );
	//     jointMats[i] /= jointMats[parents[i]];
	//
	// The loop is deliberately not unrolled: depending on the contents of
	// parents, iterations may depend on one another.

	vector float zeroVec = (vector float)(0);
	// words 0-2 from the first operand, word 3 from the second operand
	vector unsigned char firstThreeThenLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);

	for ( int joint = lastJoint; joint >= firstJoint; joint-- ) {
		assert( parents[joint] < joint );

		float *childPtr = jointMats[joint].ToFloatPtr();
		float *parentPtr = jointMats[parents[joint]].ToFloatPtr();

		// unaligned load of the child's 12 floats as three rows of four
		vector unsigned char childAlign = vec_add( vec_lvsl( -1, childPtr ), (vector unsigned char)(1) );
		vector float childRaw0 = vec_ld( 0, childPtr );
		vector float childRaw1 = vec_ld( 15, childPtr );
		vector float childRaw2 = vec_ld( 31, childPtr );
		vector float childRaw3 = vec_ld( 47, childPtr );
		vector float childRow0 = vec_perm( childRaw0, childRaw1, childAlign );
		vector float childRow1 = vec_perm( childRaw1, childRaw2, childAlign );
		vector float childRow2 = vec_perm( childRaw2, childRaw3, childAlign );

		// same for the parent
		vector unsigned char parentAlign = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
		vector float parentRaw0 = vec_ld( 0, parentPtr );
		vector float parentRaw1 = vec_ld( 15, parentPtr );
		vector float parentRaw2 = vec_ld( 31, parentPtr );
		vector float parentRaw3 = vec_ld( 47, parentPtr );
		vector float parentRow0 = vec_perm( parentRaw0, parentRaw1, parentAlign );
		vector float parentRow1 = vec_perm( parentRaw1, parentRaw2, parentAlign );
		vector float parentRow2 = vec_perm( parentRaw2, parentRaw3, parentAlign );

		// first subtract the parent's fourth element of each row from the
		// fourth lane of the matching child row
		childRow0 = vec_sub( childRow0, vec_perm( zeroVec, parentRow0, firstThreeThenLast ) );
		childRow1 = vec_sub( childRow1, vec_perm( zeroVec, parentRow1, firstThreeThenLast ) );
		childRow2 = vec_sub( childRow2, vec_perm( zeroVec, parentRow2, firstThreeThenLast ) );

		// output row r = sum over k of childRow_k * parentRow_k[r]
		vector float outRow0 = vec_madd( childRow0, vec_splat( parentRow0, 0 ), zeroVec );
		outRow0 = vec_madd( childRow1, vec_splat( parentRow1, 0 ), outRow0 );
		outRow0 = vec_madd( childRow2, vec_splat( parentRow2, 0 ), outRow0 );

		vector float outRow1 = vec_madd( childRow0, vec_splat( parentRow0, 1 ), zeroVec );
		outRow1 = vec_madd( childRow1, vec_splat( parentRow1, 1 ), outRow1 );
		outRow1 = vec_madd( childRow2, vec_splat( parentRow2, 1 ), outRow1 );

		vector float outRow2 = vec_madd( childRow0, vec_splat( parentRow0, 2 ), zeroVec );
		outRow2 = vec_madd( childRow1, vec_splat( parentRow1, 2 ), outRow2 );
		outRow2 = vec_madd( childRow2, vec_splat( parentRow2, 2 ), outRow2 );

		// unaligned store: rotate each row right by the destination's
		// misalignment, then write it back one float element at a time
		vector unsigned char storeAlign = vec_lvsr( 0, childPtr );
		outRow0 = vec_perm( outRow0, outRow0, storeAlign );
		outRow1 = vec_perm( outRow1, outRow1, storeAlign );
		outRow2 = vec_perm( outRow2, outRow2, storeAlign );

		vec_ste( outRow0, 0, (float*) childPtr );
		vec_ste( outRow0, 4, (float*) childPtr );
		vec_ste( outRow0, 8, (float*) childPtr );
		vec_ste( outRow0, 12, (float*) childPtr );

		vec_ste( outRow1, 16, (float*) childPtr );
		vec_ste( outRow1, 20, (float*) childPtr );
		vec_ste( outRow1, 24, (float*) childPtr );
		vec_ste( outRow1, 28, (float*) childPtr );

		vec_ste( outRow2, 32, (float*) childPtr );
		vec_ste( outRow2, 36, (float*) childPtr );
		vec_ste( outRow2, 40, (float*) childPtr );
		vec_ste( outRow2, 44, (float*) childPtr );
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::TransformVerts
|
|
============
|
|
*/
|
|
|
|
// Here we don't have much for the vector unit to do, and the gain we get from doing the math
|
|
// in parallel is eaten by doing unaligned stores.
|
|
void VPCALL idSIMD_AltiVec::TransformVerts( idDrawVert *verts, const int numVerts, const idJointMat *joints, const idVec4 *weights, const int *index, int numWeights ) {
	// Scalar skinning: writes verts[i].xyz for i in [0, numVerts) as the sum of
	// (joint matrix * weight vector) over that vertex's run of weights.
	//
	// index holds two ints per weight:
	//   index[w*2]   - BYTE offset of the joint matrix from the start of joints
	//   index[w*2+1] - zero while further weights follow for the same vertex,
	//                  non-zero on the vertex's last weight
	// numWeights is not read by this implementation.
	const byte *jointBytes = (const byte *)joints;
	int w = 0;

	for ( int vert = 0; vert < numVerts; vert++ ) {
		const float *mat = ( (const idJointMat *)( jointBytes + index[w*2] ) )->ToFloatPtr();
		const float *wt = weights[w].ToFloatPtr();

		// the first weight initializes the accumulators
		float x = mat[0] * wt[0];
		x += mat[1] * wt[1];
		x += mat[2] * wt[2];
		x += mat[3] * wt[3];

		float y = mat[4] * wt[0];
		y += mat[5] * wt[1];
		y += mat[6] * wt[2];
		y += mat[7] * wt[3];

		float z = mat[8] * wt[0];
		z += mat[9] * wt[1];
		z += mat[10] * wt[2];
		z += mat[11] * wt[3];

		// remaining weights of this vertex, added term by term
		while ( index[w*2+1] == 0 ) {
			w++;
			mat = ( (const idJointMat *)( jointBytes + index[w*2] ) )->ToFloatPtr();
			wt = weights[w].ToFloatPtr();

			x += mat[0] * wt[0];
			x += mat[1] * wt[1];
			x += mat[2] * wt[2];
			x += mat[3] * wt[3];

			y += mat[4] * wt[0];
			y += mat[5] * wt[1];
			y += mat[6] * wt[2];
			y += mat[7] * wt[3];

			z += mat[8] * wt[0];
			z += mat[9] * wt[1];
			z += mat[10] * wt[2];
			z += mat[11] * wt[3];
		}
		// step past the last weight of this vertex
		w++;

		idVec3 blended;
		blended[0] = x;
		blended[1] = y;
		blended[2] = z;
		verts[vert].xyz = blended;
	}
}
|
|
#endif /* LIVE_VICARIOUSLY */
|
|
|
|
#ifdef ENABLE_CULL
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::TracePointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// For every vertex, computes the distance d[p] to each of planes[0..3] and
	// writes one byte to cullBits:
	//   bit p     (p = 0..3) : set when d[p] + radius is NOT below zero
	//   bit 4 + p (p = 0..3) : set when d[p] - radius IS below zero
	// (the low four bits are computed as "below zero" flags and then flipped).
	// totalOr receives the OR of all bytes written.
	//
	// Vertices are processed four at a time with AltiVec; the remainder goes
	// through the scalar loop at the end.

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	byte accumulatedOr = 0;

	// constants
	vector float zeros = (vector float)(0);
	vector bool int lowBit = (vector bool int)(1);
	vector unsigned int plusShifts = (vector unsigned int)(0,1,2,3);
	vector unsigned int minusShifts = (vector unsigned int)(4,5,6,7);
	vector unsigned int flipLowNibble = (vector unsigned int)(0x0F);
	// words 0-1 of the first operand followed by words 0-1 of the second
	vector unsigned char firstHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
	// words 0-2 of the first operand followed by word 0 of the second
	vector unsigned char threeThenOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
	vector float radiusVec = loadSplatUnalignedScalar( &radius );

	// unaligned load of the four planes; one permute vector serves all four
	// loads, which are 4 floats (16 bytes) apart
	const float *planeFloats = planes[0].ToFloatPtr();
	vector unsigned char planeAlign = vec_add( vec_lvsl( -1, planeFloats ), (vector unsigned char)(1) );

	vector float planeRow0 = vec_perm( vec_ld( 0, planeFloats ), vec_ld( 15, planeFloats ), planeAlign );
	vector float planeRow1 = vec_perm( vec_ld( 0, planeFloats + 4 ), vec_ld( 15, planeFloats + 4 ), planeAlign );
	vector float planeRow2 = vec_perm( vec_ld( 0, planeFloats + 8 ), vec_ld( 15, planeFloats + 8 ), planeAlign );
	vector float planeRow3 = vec_perm( vec_ld( 0, planeFloats + 12 ), vec_ld( 15, planeFloats + 12 ), planeAlign );

	// transpose so each vector holds one coefficient of all four planes
	vector float mixHi02 = vec_mergeh( planeRow0, planeRow2 );
	vector float mixHi13 = vec_mergeh( planeRow1, planeRow3 );
	vector float mixLo02 = vec_mergel( planeRow0, planeRow2 );
	vector float mixLo13 = vec_mergel( planeRow1, planeRow3 );

	vector float coeffA = vec_mergeh( mixHi02, mixHi13 );
	vector float coeffB = vec_mergel( mixHi02, mixHi13 );
	vector float coeffC = vec_mergeh( mixLo02, mixLo13 );
	vector float coeffD = vec_mergel( mixLo02, mixLo13 );

	// scratch used to move the four results from a vector register to scalars
	unsigned int laneBits[4];
	vector unsigned char laneBitsAlign = vec_lvsr( 0, &laneBits[0] );
	int i = 0;

	// The permute vectors are computed once from the first four vertices and
	// reused for every later group; the original author notes that every
	// fourth vertex has the same alignment.
	vector unsigned char vertAlign0, vertAlign1, vertAlign2, vertAlign3;
	if ( i+3 < numVerts ) {
		vertAlign0 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertAlign1 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertAlign2 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		vertAlign3 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}

	for ( ; i+3 < numVerts; i += 4 ) {
		const float *xyzPtr0 = verts[i].xyz.ToFloatPtr();
		const float *xyzPtr1 = verts[i+1].xyz.ToFloatPtr();
		const float *xyzPtr2 = verts[i+2].xyz.ToFloatPtr();
		const float *xyzPtr3 = verts[i+3].xyz.ToFloatPtr();

		// unaligned loads of the four positions
		vector float xyz0 = vec_perm( vec_ld( 0, xyzPtr0 ), vec_ld( 15, xyzPtr0 ), vertAlign0 );
		vector float xyz1 = vec_perm( vec_ld( 0, xyzPtr1 ), vec_ld( 15, xyzPtr1 ), vertAlign1 );
		vector float xyz2 = vec_perm( vec_ld( 0, xyzPtr2 ), vec_ld( 15, xyzPtr2 ), vertAlign2 );
		vector float xyz3 = vec_perm( vec_ld( 0, xyzPtr3 ), vec_ld( 15, xyzPtr3 ), vertAlign3 );

		// distN = x*A + y*B + z*C + D : distances of vertex N to the four planes
		vector float dist0 = vec_madd( vec_splat( xyz0, 0 ), coeffA, zeros );
		dist0 = vec_madd( vec_splat( xyz0, 1 ), coeffB, dist0 );
		dist0 = vec_madd( vec_splat( xyz0, 2 ), coeffC, dist0 );
		dist0 = vec_add( dist0, coeffD );

		vector float dist1 = vec_madd( vec_splat( xyz1, 0 ), coeffA, zeros );
		dist1 = vec_madd( vec_splat( xyz1, 1 ), coeffB, dist1 );
		dist1 = vec_madd( vec_splat( xyz1, 2 ), coeffC, dist1 );
		dist1 = vec_add( dist1, coeffD );

		vector float dist2 = vec_madd( vec_splat( xyz2, 0 ), coeffA, zeros );
		dist2 = vec_madd( vec_splat( xyz2, 1 ), coeffB, dist2 );
		dist2 = vec_madd( vec_splat( xyz2, 2 ), coeffC, dist2 );
		dist2 = vec_add( dist2, coeffD );

		vector float dist3 = vec_madd( vec_splat( xyz3, 0 ), coeffA, zeros );
		dist3 = vec_madd( vec_splat( xyz3, 1 ), coeffB, dist3 );
		dist3 = vec_madd( vec_splat( xyz3, 2 ), coeffC, dist3 );
		dist3 = vec_add( dist3, coeffD );

		// compare d + radius and d - radius against zero; masking with 1 turns
		// the all-ones compare result into a single bit that can be shifted
		vector bool int plusNeg0 = vec_and( vec_cmplt( vec_add( dist0, radiusVec ), zeros ), lowBit );
		vector bool int minusNeg0 = vec_and( vec_cmplt( vec_sub( dist0, radiusVec ), zeros ), lowBit );
		vector bool int plusNeg1 = vec_and( vec_cmplt( vec_add( dist1, radiusVec ), zeros ), lowBit );
		vector bool int minusNeg1 = vec_and( vec_cmplt( vec_sub( dist1, radiusVec ), zeros ), lowBit );
		vector bool int plusNeg2 = vec_and( vec_cmplt( vec_add( dist2, radiusVec ), zeros ), lowBit );
		vector bool int minusNeg2 = vec_and( vec_cmplt( vec_sub( dist2, radiusVec ), zeros ), lowBit );
		vector bool int plusNeg3 = vec_and( vec_cmplt( vec_add( dist3, radiusVec ), zeros ), lowBit );
		vector bool int minusNeg3 = vec_and( vec_cmplt( vec_sub( dist3, radiusVec ), zeros ), lowBit );

		// move each flag to its bit position; the bit positions are disjoint,
		// so the additions below act as ORs
		vector unsigned int flags0 = vec_add( vec_sl( (vector unsigned int)plusNeg0, plusShifts ), vec_sl( (vector unsigned int)minusNeg0, minusShifts ) );
		vector unsigned int flags1 = vec_add( vec_sl( (vector unsigned int)plusNeg1, plusShifts ), vec_sl( (vector unsigned int)minusNeg1, minusShifts ) );
		vector unsigned int flags2 = vec_add( vec_sl( (vector unsigned int)plusNeg2, plusShifts ), vec_sl( (vector unsigned int)minusNeg2, minusShifts ) );
		vector unsigned int flags3 = vec_add( vec_sl( (vector unsigned int)plusNeg3, plusShifts ), vec_sl( (vector unsigned int)minusNeg3, minusShifts ) );

		// sum across the four lanes of each vertex, then gather the four
		// per-vertex totals into one vector, one word per vertex
		vector unsigned int packed = vec_add( flags0, vec_sld( flags0, flags0, 8 ) );
		packed = vec_add( packed, vec_sld( packed, packed, 4 ) );

		vector unsigned int laneSum = vec_add( flags1, vec_sld( flags1, flags1, 8 ) );
		laneSum = vec_add( laneSum, vec_sld( laneSum, laneSum, 4 ) );
		packed = vec_mergeh( packed, laneSum );

		laneSum = vec_add( flags2, vec_sld( flags2, flags2, 8 ) );
		laneSum = vec_add( laneSum, vec_sld( laneSum, laneSum, 4 ) );
		packed = vec_perm( packed, laneSum, firstHalves );

		laneSum = vec_add( flags3, vec_sld( flags3, flags3, 8 ) );
		laneSum = vec_add( laneSum, vec_sld( laneSum, laneSum, 4 ) );
		packed = vec_perm( packed, laneSum, threeThenOne );

		// flip the low four bits, then spill to the scalar scratch array
		vector unsigned int spill = vec_xor( packed, flipLowNibble );
		spill = vec_perm( spill, spill, laneBitsAlign );
		vec_ste( spill, 0, &laneBits[0] );
		vec_ste( spill, 4, &laneBits[0] );
		vec_ste( spill, 8, &laneBits[0] );
		vec_ste( spill, 12, &laneBits[0] );

		for ( int lane = 0; lane < 4; lane++ ) {
			accumulatedOr |= laneBits[lane];
			cullBits[i+lane] = laneBits[lane];
		}
	}

	// cleanup: scalar path for the vertices left over
	for ( ; i < numVerts; i++ ) {
		const idVec3 &point = verts[i].xyz;
		byte bits = 0;

		for ( int p = 0; p < 4; p++ ) {
			const float dist = planes[p].Distance( point );
			float t;

			t = dist + radius;
			bits |= FLOATSIGNBITSET( t ) << p;
			t = dist - radius;
			bits |= FLOATSIGNBITSET( t ) << ( 4 + p );
		}

		bits ^= 0x0F;		// flip lower four bits

		accumulatedOr |= bits;
		cullBits[i] = bits;
	}

	totalOr = accumulatedOr;
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::TracePointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
byte tOr;
|
|
tOr = 0;
|
|
|
|
// pointers
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
|
|
vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
|
|
vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
|
|
vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
|
|
vector unsigned char vecPerm;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
vector float zeroVector = (vector float)(0);
|
|
vector float vecRadius;
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
|
|
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
|
|
vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
|
|
vector unsigned int vecTotals;
|
|
vector unsigned int tempIntSum;
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
|
|
// populate planes
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 4 );
|
|
v3 = vec_ld( 15, planePtr + 4 );
|
|
vecPlane1 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 8 );
|
|
v1 = vec_ld( 15, planePtr + 8 );
|
|
vecPlane2 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 12 );
|
|
v3 = vec_ld( 15, planePtr + 12 );
|
|
vecPlane3 = vec_perm( v2, v3, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane2 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane3 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane2 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane3 );
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
// load constants
|
|
vecRadius = loadSplatUnalignedScalar( &radius );
|
|
|
|
unsigned int cullBitVal[4];
|
|
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
|
|
int i = 0;
|
|
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
vecXYZ1 = vec_ld( 0, vertPtr );
|
|
vecXYZ2 = vec_ld( 0, vertPtr2 );
|
|
vecXYZ3 = vec_ld( 0, vertPtr3 );
|
|
vecXYZ4 = vec_ld( 0, vertPtr4 );
|
|
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
|
|
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
|
|
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
|
|
vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
|
|
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
|
|
vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
|
|
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
|
|
vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
|
|
|
|
// vec1Sum1 now holds d0, d1, d2, d3. calculate the
|
|
// difference with +radius and -radius
|
|
vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
|
|
vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
|
|
vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
|
|
vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
|
|
vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
|
|
vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
|
|
vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
|
|
vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
|
|
|
|
// do compare
|
|
vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
|
|
vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
|
|
vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
|
|
vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
|
|
vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
|
|
vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
|
|
vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
vecCmp3 = vec_and( vecCmp3, oneIntVector );
|
|
vecCmp4 = vec_and( vecCmp4, oneIntVector );
|
|
vecCmp5 = vec_and( vecCmp5, oneIntVector );
|
|
vecCmp6 = vec_and( vecCmp6, oneIntVector );
|
|
vecCmp7 = vec_and( vecCmp7, oneIntVector );
|
|
vecCmp8 = vec_and( vecCmp8, oneIntVector );
|
|
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
|
|
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
|
|
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
|
|
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
|
|
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
|
|
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
|
|
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
|
|
|
|
// OR (add) them all together
|
|
vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
|
|
vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
|
|
vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
|
|
vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
|
|
|
|
vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
|
|
vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
|
|
tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_mergeh( vecTotals, tempIntSum );
|
|
tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
|
|
tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
|
|
|
|
// store out results
|
|
vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
|
|
tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
|
|
vec_ste( tempSt, 0, &cullBitVal[0] );
|
|
vec_ste( tempSt, 4, &cullBitVal[0] );
|
|
vec_ste( tempSt, 8, &cullBitVal[0] );
|
|
vec_ste( tempSt, 12, &cullBitVal[0] );
|
|
|
|
tOr |= cullBitVal[0];
|
|
tOr |= cullBitVal[1];
|
|
tOr |= cullBitVal[2];
|
|
tOr |= cullBitVal[3];
|
|
|
|
cullBits[i] = cullBitVal[0];
|
|
cullBits[i+1] = cullBitVal[1];
|
|
cullBits[i+2] = cullBitVal[2];
|
|
cullBits[i+3] = cullBitVal[3];
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, t;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
|
|
t = d0 + radius;
|
|
bits = FLOATSIGNBITSET( t ) << 0;
|
|
t = d1 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 1;
|
|
t = d2 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 2;
|
|
t = d3 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 3;
|
|
|
|
t = d0 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 4;
|
|
t = d1 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 5;
|
|
t = d2 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 6;
|
|
t = d3 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 7;
|
|
|
|
bits ^= 0x0F; // flip lower four bits
|
|
|
|
tOr |= bits;
|
|
cullBits[i] = bits;
|
|
}
|
|
|
|
totalOr = tOr;
|
|
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DecalPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
|
|
vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vecPerm;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
|
|
// populate planes
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 4 );
|
|
v3 = vec_ld( 15, planePtr + 4 );
|
|
vecPlane1 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 8 );
|
|
v1 = vec_ld( 15, planePtr + 8 );
|
|
vecPlane2 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 12 );
|
|
v3 = vec_ld( 15, planePtr + 12 );
|
|
vecPlane3 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 16 );
|
|
v1 = vec_ld( 15, planePtr + 16 );
|
|
vecPlane4 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 20 );
|
|
v3 = vec_ld( 15, planePtr + 20 );
|
|
vecPlane5 = vec_perm( v2, v3, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane2 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane3 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane2 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane3 );
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
v0 = vec_mergeh( vecPlane4, zeroVector );
|
|
v1 = vec_mergeh( vecPlane5, zeroVector );
|
|
v2 = vec_mergel( vecPlane4, zeroVector );
|
|
v3 = vec_mergel( vecPlane5, zeroVector );
|
|
|
|
vecPlane4 = vec_mergeh( v0, v1 );
|
|
vecPlane5 = vec_mergel( v0, v1 );
|
|
vecPlane6 = vec_mergeh( v2, v3 );
|
|
vecPlane7 = vec_mergel( v2, v3 );
|
|
|
|
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
|
|
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
|
|
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
|
|
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
|
|
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
|
|
vector unsigned int vecR1, vecR2, vecR3, vecR4;
|
|
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
unsigned int vBits[4];
|
|
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
|
|
|
|
i = 0;
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < numVerts ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
|
|
vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
|
|
vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
|
|
vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
|
|
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
|
|
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
|
|
vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
|
|
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
|
|
vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
|
|
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
|
|
vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
|
|
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
|
|
vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
|
|
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
|
|
vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
|
|
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
|
|
vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
|
|
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
|
|
vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
|
|
|
|
vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
|
|
vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
|
|
vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
|
|
vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
|
|
vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
|
|
vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
|
|
vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
vecCmp3 = vec_and( vecCmp3, oneIntVector );
|
|
vecCmp4 = vec_and( vecCmp4, oneIntVector );
|
|
vecCmp5 = vec_and( vecCmp5, oneIntVector );
|
|
vecCmp6 = vec_and( vecCmp6, oneIntVector );
|
|
vecCmp7 = vec_and( vecCmp7, oneIntVector );
|
|
vecCmp8 = vec_and( vecCmp8, oneIntVector );
|
|
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
|
|
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
|
|
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
|
|
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
|
|
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
|
|
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
|
|
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
|
|
|
|
//OR them all together (this is the same as adding them, since they're all only 1 bit set)
|
|
vecR1 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
|
|
vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
|
|
vecR1 = vec_add(vecR1, vecBitShifted2 );
|
|
vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
|
|
|
|
vecR2 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
|
|
vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
|
|
vecR2 = vec_add(vecR2, vecBitShifted4 );
|
|
vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
|
|
|
|
vecR3 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
|
|
vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
|
|
vecR3 = vec_add(vecR3, vecBitShifted6 );
|
|
vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
|
|
|
|
vecR4 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
|
|
vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
|
|
vecR4 = vec_add(vecR4, vecBitShifted8 );
|
|
vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
|
|
|
|
// take the first element from each vector and put them into vecR1
|
|
vecR1 = vec_mergeh( vecR1, vecR2 );
|
|
vecR3 = vec_mergeh( vecR3, vecR4 );
|
|
vecR1 = vec_perm( vecR1, vecR3, permHalves );
|
|
|
|
// XOR with 0x3F to flip lower 6 bits
|
|
vecR1 = vec_xor( vecR1, vecFlipBits );
|
|
|
|
// store out results. don't have 16 at a time so let's just
|
|
// do this and avoid alignment concerns
|
|
vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
|
|
vec_ste( vecR1, 0, &vBits[0] );
|
|
vec_ste( vecR1, 4, &vBits[0] );
|
|
vec_ste( vecR1, 8, &vBits[0] );
|
|
vec_ste( vecR1, 12, &vBits[0] );
|
|
|
|
cullBits[i] = vBits[0];
|
|
cullBits[i+1] = vBits[1];
|
|
cullBits[i+2] = vBits[2];
|
|
cullBits[i+3] = vBits[3];
|
|
}
|
|
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, d4, d5;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
d4 = planes[4].Distance( v );
|
|
d5 = planes[5].Distance( v );
|
|
|
|
// they check if the sign bit is set by casting as long and shifting right 31 places.
|
|
bits = FLOATSIGNBITSET( d0 ) << 0;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 1;
|
|
bits |= FLOATSIGNBITSET( d2 ) << 2;
|
|
bits |= FLOATSIGNBITSET( d3 ) << 3;
|
|
bits |= FLOATSIGNBITSET( d4 ) << 4;
|
|
bits |= FLOATSIGNBITSET( d5 ) << 5;
|
|
|
|
cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DecalPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
|
|
vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vecPerm;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
|
|
// populate planes
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 4 );
|
|
v3 = vec_ld( 15, planePtr + 4 );
|
|
vecPlane1 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 8 );
|
|
v1 = vec_ld( 15, planePtr + 8 );
|
|
vecPlane2 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 12 );
|
|
v3 = vec_ld( 15, planePtr + 12 );
|
|
vecPlane3 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 16 );
|
|
v1 = vec_ld( 15, planePtr + 16 );
|
|
vecPlane4 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 20 );
|
|
v3 = vec_ld( 15, planePtr + 20 );
|
|
vecPlane5 = vec_perm( v2, v3, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane2 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane3 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane2 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane3 );
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
v0 = vec_mergeh( vecPlane4, zeroVector );
|
|
v1 = vec_mergeh( vecPlane5, zeroVector );
|
|
v2 = vec_mergel( vecPlane4, zeroVector );
|
|
v3 = vec_mergel( vecPlane5, zeroVector );
|
|
|
|
vecPlane4 = vec_mergeh( v0, v1 );
|
|
vecPlane5 = vec_mergel( v0, v1 );
|
|
vecPlane6 = vec_mergeh( v2, v3 );
|
|
vecPlane7 = vec_mergel( v2, v3 );
|
|
|
|
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
|
|
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
|
|
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
|
|
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
|
|
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
|
|
vector unsigned int vecR1, vecR2, vecR3, vecR4;
|
|
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
unsigned int vBits[4];
|
|
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
|
|
|
|
i = 0;
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
|
|
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
|
|
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
|
|
vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
|
|
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
|
|
vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
|
|
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
|
|
vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
|
|
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
|
|
vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
|
|
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
|
|
vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
|
|
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
|
|
vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
|
|
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
|
|
vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
|
|
|
|
vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
|
|
vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
|
|
vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
|
|
vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
|
|
vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
|
|
vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
|
|
vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
vecCmp3 = vec_and( vecCmp3, oneIntVector );
|
|
vecCmp4 = vec_and( vecCmp4, oneIntVector );
|
|
vecCmp5 = vec_and( vecCmp5, oneIntVector );
|
|
vecCmp6 = vec_and( vecCmp6, oneIntVector );
|
|
vecCmp7 = vec_and( vecCmp7, oneIntVector );
|
|
vecCmp8 = vec_and( vecCmp8, oneIntVector );
|
|
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
|
|
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
|
|
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
|
|
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
|
|
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
|
|
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
|
|
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
|
|
|
|
//OR them all together (this is the same as adding them, since they're all only 1 bit set)
|
|
vecR1 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
|
|
vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
|
|
vecR1 = vec_add(vecR1, vecBitShifted2 );
|
|
vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
|
|
|
|
vecR2 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
|
|
vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
|
|
vecR2 = vec_add(vecR2, vecBitShifted4 );
|
|
vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
|
|
|
|
vecR3 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
|
|
vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
|
|
vecR3 = vec_add(vecR3, vecBitShifted6 );
|
|
vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
|
|
|
|
vecR4 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
|
|
vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
|
|
vecR4 = vec_add(vecR4, vecBitShifted8 );
|
|
vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
|
|
|
|
// take the first element from each vector and put them into vecR1
|
|
vecR1 = vec_mergeh( vecR1, vecR2 );
|
|
vecR3 = vec_mergeh( vecR3, vecR4 );
|
|
vecR1 = vec_perm( vecR1, vecR3, permHalves );
|
|
|
|
// XOR with 0x3F to flip lower 6 bits
|
|
vecR1 = vec_xor( vecR1, vecFlipBits );
|
|
|
|
// store out results. don't have 16 at a time so let's just
|
|
// do this and avoid alignment concerns
|
|
vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
|
|
vec_ste( vecR1, 0, &vBits[0] );
|
|
vec_ste( vecR1, 4, &vBits[0] );
|
|
vec_ste( vecR1, 8, &vBits[0] );
|
|
vec_ste( vecR1, 12, &vBits[0] );
|
|
|
|
cullBits[i] = vBits[0];
|
|
cullBits[i+1] = vBits[1];
|
|
cullBits[i+2] = vBits[2];
|
|
cullBits[i+3] = vBits[3];
|
|
}
|
|
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, d4, d5;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
d4 = planes[4].Distance( v );
|
|
d5 = planes[5].Distance( v );
|
|
|
|
// they check if the sign bit is set by casting as long and shifting right 31 places.
|
|
bits = FLOATSIGNBITSET( d0 ) << 0;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 1;
|
|
bits |= FLOATSIGNBITSET( d2 ) << 2;
|
|
bits |= FLOATSIGNBITSET( d3 ) << 3;
|
|
bits |= FLOATSIGNBITSET( d4 ) << 4;
|
|
bits |= FLOATSIGNBITSET( d5 ) << 5;
|
|
|
|
cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
|
|
}
|
|
}
|
|
|
|
|
|
#endif /*DRAWVERT_PADDED */
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::OverlayPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
|
|
float p0x, p0y, p0z, p0d;
|
|
float p1x, p1y, p1z, p1d;
|
|
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
const float *vertPtr = verts[0].xyz.ToFloatPtr();
|
|
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
vector unsigned char vecPerm;
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
p0x = *(planePtr + 0);
|
|
p0y = *(planePtr + 1);
|
|
p0z = *(planePtr + 2);
|
|
p0d = *(planePtr + 3);
|
|
p1x = *(planePtr + 4);
|
|
p1y = *(planePtr + 5);
|
|
p1z = *(planePtr + 6);
|
|
p1d = *(planePtr + 7);
|
|
|
|
// populate the planes
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 31, planePtr );
|
|
vecPlane1 = vec_perm( v1, v2, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane0 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane1 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane0 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane1);
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector float oneVector = (vector float)(1);
|
|
|
|
vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
|
|
vector float negTwoVector = (vector float)(-2);
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
|
|
vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
|
|
vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
|
|
vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
unsigned int cullBitVal[4];
|
|
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
|
|
|
|
i = 0;
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < numVerts ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
|
|
vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
|
|
vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
|
|
vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
// like a splat, but only doing halves
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
|
|
vecSum1 = vec_add( vecSum1, vecPlane3 );
|
|
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
|
|
vecSum2 = vec_add( vecSum2, vecPlane3 );
|
|
|
|
// store out results
|
|
UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
|
|
|
|
// bit manipulation
|
|
vecCmp1 = vec_cmplt( vecSum1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vecSum2, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
|
|
// store out and write to cullBits
|
|
// finally, a use for algebra! 1-x = x + 1 - 2x
|
|
vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
|
|
vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
|
|
vecSum1Inv = vec_add( vecSum1Inv, oneVector );
|
|
vecSum2Inv = vec_add( vecSum2Inv, oneVector );
|
|
|
|
// do the same comparisons for the inverted d0/d1
|
|
vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
|
|
vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
|
|
vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
|
|
|
|
// shift them as needed
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
|
|
vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
|
|
vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
|
|
|
|
// OR them all together. since only 1 bit is set for each value, thats
|
|
// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
|
|
vector unsigned int vecResult;
|
|
vector unsigned int vecResult2;
|
|
vector unsigned int vecResult3;
|
|
vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
|
|
|
|
vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
|
|
|
|
// vecResult now holds the values without the inverses yet, so add those
|
|
vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
|
|
vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
|
|
vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
|
|
vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
|
|
|
|
vecResult = vec_add( vecResult, vecResult2 );
|
|
|
|
//store out results
|
|
vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
|
|
vec_ste( vecResult, 0, &cullBitVal[0] );
|
|
vec_ste( vecResult, 4, &cullBitVal[0] );
|
|
vec_ste( vecResult, 8, &cullBitVal[0] );
|
|
vec_ste( vecResult, 12, &cullBitVal[0] );
|
|
|
|
cullBits[i] = cullBitVal[0];
|
|
cullBits[i+1] = cullBitVal[1];
|
|
cullBits[i+2] = cullBitVal[2];
|
|
cullBits[i+3] = cullBitVal[3];
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1;
|
|
float vx, vy, vz;
|
|
|
|
vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
|
|
vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
|
|
vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
|
|
|
|
d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
|
|
d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
|
|
texCoords[i][0] = d0;
|
|
texCoords[i][1] = d1;
|
|
|
|
bits = ( d0 >= 0 ) ? 0 : 1;
|
|
d0 = 1.0f - d0;
|
|
bits |= ( d1 >= 0 ) ? 0 : 1*2;
|
|
d1 = 1.0f - d1;
|
|
|
|
bits |= ( d0 >= 0 ) ? 0: 1*4;
|
|
bits |= ( d1 >= 0 ) ? 0: 1*8;
|
|
|
|
cullBits[i] = bits;
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::OverlayPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
	// For every vertex this computes the distances d0 and d1 to planes[0] and
	// planes[1], writes them to texCoords[i][0] / texCoords[i][1], and packs
	// four cull bits into cullBits[i]:
	//   bit 0: d0 < 0        bit 1: d1 < 0
	//   bit 2: 1 - d0 < 0    bit 3: 1 - d1 < 0
	// Four vertices are processed per iteration with AltiVec; the remaining
	// 0-3 vertices are handled by the scalar cleanup loop at the end.
	//
	// NOTE(review): the vector loop loads each vertex with a plain
	// vec_ld( 0, ... ) and no alignment permute, the same way the
	// DRAWVERT_PADDED path of DeriveTriPlanes does, so it presumably relies on
	// every idDrawVert::xyz being 16-byte aligned -- confirm against the
	// preprocessor condition that selects this variant.

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );

	int i;

	float p0x, p0y, p0z, p0d;
	float p1x, p1y, p1z, p1d;

	const float *planePtr = planes[0].ToFloatPtr();
	// base pointer used by the scalar cleanup loop (indexed by i*DRAWVERT_OFFSET)
	const float *vertPtr = verts[0].xyz.ToFloatPtr();

	vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
	vector float v0, v1, v2, v3;
	vector unsigned char vecPerm;
	vector float zeroVector = (vector float)(0);

	// scalar copies of both planes for the cleanup loop
	p0x = *(planePtr + 0);
	p0y = *(planePtr + 1);
	p0z = *(planePtr + 2);
	p0d = *(planePtr + 3);
	p1x = *(planePtr + 4);
	p1y = *(planePtr + 5);
	p1z = *(planePtr + 6);
	p1d = *(planePtr + 7);

	// populate the planes (unaligned load of the 8 floats of planes[0] and planes[1])
	vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
	v0 = vec_ld( 0, planePtr );
	v1 = vec_ld( 15, planePtr );
	vecPlane0 = vec_perm( v0, v1, vecPerm );

	v2 = vec_ld( 31, planePtr );
	vecPlane1 = vec_perm( v1, v2, vecPerm );

	// transpose so that each vector holds one component for both planes:
	// vecPlane0 = ( p0x, p1x, p0x, p1x ), vecPlane1 = ( p0y, p1y, p0y, p1y ),
	// vecPlane2 = ( p0z, p1z, p0z, p1z ), vecPlane3 = ( p0d, p1d, p0d, p1d )
	v0 = vec_mergeh( vecPlane0, vecPlane0 );
	v1 = vec_mergeh( vecPlane1, vecPlane1 );
	v2 = vec_mergel( vecPlane0, vecPlane0 );
	v3 = vec_mergel( vecPlane1, vecPlane1 );

	vecPlane0 = vec_mergeh( v0, v1 );
	vecPlane1 = vec_mergel( v0, v1 );
	vecPlane2 = vec_mergeh( v2, v3 );
	vecPlane3 = vec_mergel( v2, v3 );

	vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
	vector float oneVector = (vector float)(1);

	vector float vecSum1, vecSum2, vecSum1Inv, vecSum2Inv;

	vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
	vector float negTwoVector = (vector float)(-2);
	vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
	// per-lane shift amounts: d0 -> bit 0, d1 -> bit 1, 1-d0 -> bit 2, 1-d1 -> bit 3
	vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
	vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
	// picks words 0 and 2 of the first operand followed by words 0 and 2 of the second
	vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
	vector bool int oneIntVector = (vector bool int)(1);

	// like a splat, but only doing halves: ( a.c, a.c, b.c, b.c ) for component c of two vertices
	vector unsigned char vecPermSplatX = (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19);
	vector unsigned char vecPermSplatY = (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23);
	vector unsigned char vecPermSplatZ = (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27);

	unsigned int cullBitVal[4];
	// rotation that lines the result lanes up with cullBitVal[] for the vec_ste stores below
	vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );

	i = 0;

	for ( ; i+3 < numVerts; i+=4 ) {
		const float *vertPtr1 = verts[i].xyz.ToFloatPtr();
		const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();

		vecXYZ1 = vec_ld( 0, vertPtr1 );
		vecXYZ2 = vec_ld( 0, vertPtr2 );
		vecXYZ3 = vec_ld( 0, vertPtr3 );
		vecXYZ4 = vec_ld( 0, vertPtr4 );

		// vecSum1 = ( d0, d1 ) of vertex i followed by ( d0, d1 ) of vertex i+1
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, vecPermSplatX ), vecPlane0, zeroVector );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, vecPermSplatY ), vecPlane1, vecSum1 );
		vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, vecPermSplatZ ), vecPlane2, vecSum1 );
		vecSum1 = vec_add( vecSum1, vecPlane3 );

		// vecSum2 = the same for vertices i+2 and i+3
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, vecPermSplatX ), vecPlane0, zeroVector );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, vecPermSplatY ), vecPlane1, vecSum2 );
		vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, vecPermSplatZ ), vecPlane2, vecSum2 );
		vecSum2 = vec_add( vecSum2, vecPlane3 );

		// store out results
		UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );

		// bit manipulation
		vecCmp1 = vec_cmplt( vecSum1, zeroVector );
		vecCmp2 = vec_cmplt( vecSum2, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1 = vec_and( vecCmp1, oneIntVector );
		vecCmp2 = vec_and( vecCmp2, oneIntVector );

		// store out and write to cullBits
		// finally, a use for algebra! 1-x = x + 1 - 2x
		vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
		vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
		vecSum1Inv = vec_add( vecSum1Inv, oneVector );
		vecSum2Inv = vec_add( vecSum2Inv, oneVector );

		// do the same comparisons for the inverted d0/d1
		vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
		vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );

		// and it with 1 so we multiply by 1 not 1111's
		vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
		vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );

		// shift them as needed
		vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
		vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
		vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
		vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );

		// OR them all together. since only 1 bit is set for each value, that's
		// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
		vector unsigned int vecResult;
		vector unsigned int vecResult2;
		vector unsigned int vecResult3;
		vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );

		vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );

		// vecResult now holds the values without the inverses yet, so add those
		vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
		vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
		vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
		vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );

		vecResult = vec_add( vecResult, vecResult2 );

		// store out results
		vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
		vec_ste( vecResult, 0, &cullBitVal[0] );
		vec_ste( vecResult, 4, &cullBitVal[0] );
		vec_ste( vecResult, 8, &cullBitVal[0] );
		vec_ste( vecResult, 12, &cullBitVal[0] );

		cullBits[i] = cullBitVal[0];
		cullBits[i+1] = cullBitVal[1];
		cullBits[i+2] = cullBitVal[2];
		cullBits[i+3] = cullBitVal[3];
	}

	// cleanup
	for ( ; i < numVerts; i++ ) {
		byte bits;
		float d0, d1;
		float vx, vy, vz;

		vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
		vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
		vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );

		d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
		d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
		texCoords[i][0] = d0;
		texCoords[i][1] = d1;

		bits = ( d0 >= 0 ) ? 0 : 1;
		d0 = 1.0f - d0;
		bits |= ( d1 >= 0 ) ? 0 : 1*2;
		d1 = 1.0f - d1;

		bits |= ( d0 >= 0 ) ? 0: 1*4;
		bits |= ( d1 >= 0 ) ? 0: 1*8;

		cullBits[i] = bits;
	}
}
|
|
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#endif /* ENABLE_CULL */
|
|
|
|
#ifdef ENABLE_DERIVE
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DeriveTriPlanes
|
|
|
|
Derives a plane equation for each triangle.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	// Writes one plane per triangle: planes[j] is derived from the vertices
	// verts[indexes[3*j+0]], verts[indexes[3*j+1]] and verts[indexes[3*j+2]].
	// The normal is the normalized cross product of the two triangle edges
	// leaving the first vertex, and the plane is fitted through that first
	// vertex. numVerts is not referenced by this implementation.
	//
	// The main loop handles four triangles (12 indexes) per iteration with
	// AltiVec; the scalar cleanup loop handles the remaining triangles one at
	// a time.

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
	// idPlane size
	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
	int i;

	vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
	vector float vecVertA, vecVertB, vecVertC;
	vector float vecVertA2, vecVertB2, vecVertC2;
	vector float vecVertA3, vecVertB3, vecVertC3;
	vector float vecVertA4, vecVertB4, vecVertC4;

	vector float vecN, vecN2, vecN3, vecN4;
	vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
	// ( x, y, z, w ) -> ( y, z, x, w )
	vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
	// ( x, y, z, w ) -> ( z, x, y, w )
	vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
	vector float vecF;
	vector float vecF1, vecF2, vecF3, vecF4;
	vector float zeroVector = (vector float)(0);
	vector float vecNegOne = (vector float)(-1);
	vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;

	vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
	vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
	vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;

	vector unsigned char oneVector = (vector unsigned char)(1);
	vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
	// keeps words 0-2 of the first operand and takes word 3 from the second operand
	vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);

	const float *xyzPtr = verts[0].xyz.ToFloatPtr();
	float *planePtr = planes[0].ToFloatPtr();

	int j;
	for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {

#ifndef DRAWVERT_PADDED
		// calculate permute vectors to load as needed. these are all
		// triangle indexes and are usually pretty close together but
		// not guaranteed to be in any particular order
		vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );
#endif

#ifndef DRAWVERT_PADDED
		// load first A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

		vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
		vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
		vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );

		// set the last element to 0
		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

		// load second A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

		vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
		vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
		vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );

		// set the last element to 0
		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );

		// load third A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

		vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
		vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
		vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );

		// set the last element to 0
		// (this must clear the third triangle's vectors, exactly like the
		// DRAWVERT_PADDED path below; clearing A2/B2/C2 a second time would
		// leave the w lanes of A3/B3/C3 holding whatever follows xyz in memory)
		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );

		// load the fourth A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );

		vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
		vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
		vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );

		// set the last element to 0
		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#else
		// load first A B C
		vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

		// load second A B C
		vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );

		// load third A B C
		vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );

		// load the fourth A B C
		vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#endif
		// calculate d0 and d1 for each
		vecD0 = vec_sub( vecVertB, vecVertA );
		vecD1 = vec_sub( vecVertC, vecVertA );

		vecD2 = vec_sub( vecVertB2, vecVertA2 );
		vecD3 = vec_sub( vecVertC2, vecVertA2 );

		vecD4 = vec_sub( vecVertB3, vecVertA3 );
		vecD5 = vec_sub( vecVertC3, vecVertA3 );

		vecD6 = vec_sub( vecVertB4, vecVertA4 );
		vecD7 = vec_sub( vecVertC4, vecVertA4 );

		// cross product, second half: ( d0.y*d1.z, d0.z*d1.x, d0.x*d1.y )
		vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
		vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
		vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
		vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
		vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
		vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
		vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
		vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );

		vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
		vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
		vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
		vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

		// cross product, first half: ( d1.y*d0.z, d1.z*d0.x, d1.x*d0.y )
		vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
		vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
		vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
		vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
		vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
		vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
		vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
		vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );

		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

		// n = firstHalf - secondHalf, the same expression the cleanup loop uses
		vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
		vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
		vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
		vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );

		// transpose vecNs
		// afterwards vecN / vecN2 / vecN3 / vecN4 hold the x / y / z / w
		// components of the four normals
		vector float v0, v1, v2, v3;
		v0 = vec_mergeh( vecN, vecN3 );
		v1 = vec_mergeh( vecN2, vecN4 );
		v2 = vec_mergel( vecN, vecN3 );
		v3 = vec_mergel( vecN2, vecN4 );

		vecN = vec_mergeh( v0, v1 );
		vecN2 = vec_mergel( v0, v1 );
		vecN3 = vec_mergeh( v2, v3 );
		vecN4 = vec_mergel( v2, v3 );

		// squared length of each normal
		vecF = vec_madd( vecN, vecN, zeroVector );
		vecF = vec_madd( vecN2, vecN2, vecF );
		vecF = vec_madd( vecN3, vecN3, vecF );

		vecF = ReciprocalSquareRoot( vecF );

		// scale the normals
		vecF1 = vec_madd( vecF, vecN, zeroVector );
		vecF2 = vec_madd( vecF, vecN2, zeroVector );
		vecF3 = vec_madd( vecF, vecN3, zeroVector );
		vecF4 = vec_madd( vecF, vecN4, zeroVector );

		vector float v8, v9, v10, v11;
		v8 = vecF1;
		v9 = vecF2;
		v10 = vecF3;
		v11 = vecF4;

		// transpose vecVerts
		v0 = vec_mergeh( vecVertA, vecVertA3 );
		v1 = vec_mergeh( vecVertA2, vecVertA4 );
		v2 = vec_mergel( vecVertA, vecVertA3 );
		v3 = vec_mergel( vecVertA2, vecVertA4 );

		vecVertA = vec_mergeh( v0, v1 );
		vecVertA2 = vec_mergel( v0, v1 );
		vecVertA3 = vec_mergeh( v2, v3 );
		vecVertA4 = vec_mergel( v2, v3 );

		// plane distance: negated dot product of the scaled normal and vertex A
		vector float vecTotals;
		vecTotals = vec_madd( vecVertA, v8, zeroVector );
		vecTotals = vec_madd( vecVertA2, v9, vecTotals );
		vecTotals = vec_madd( vecVertA3, v10, vecTotals );
		vecTotals = vec_madd( vecVertA4, v11, vecTotals );
		vecF = vec_madd( vecTotals, vecNegOne, zeroVector );

		// transpose vecFs
		// back to one ( nx, ny, nz, d ) vector per triangle
		v0 = vec_mergeh( vecF1, vecF3 );
		v1 = vec_mergeh( vecF2, vecF );
		v2 = vec_mergel( vecF1, vecF3 );
		v3 = vec_mergel( vecF2, vecF );

		vecF1 = vec_mergeh( v0, v1 );
		vecF2 = vec_mergel( v0, v1 );
		vecF3 = vec_mergeh( v2, v3 );
		vecF4 = vec_mergel( v2, v3 );

		// store results
		UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
	}

	// cleanup
	for ( ; i < numIndexes; i += 3, j++ ) {
		const idDrawVert *a, *b, *c;
		float d0[3], d1[3], f;
		idVec3 n;

		a = verts + indexes[i + 0];
		b = verts + indexes[i + 1];
		c = verts + indexes[i + 2];

		d0[0] = b->xyz[0] - a->xyz[0];
		d0[1] = b->xyz[1] - a->xyz[1];
		d0[2] = b->xyz[2] - a->xyz[2];

		d1[0] = c->xyz[0] - a->xyz[0];
		d1[1] = c->xyz[1] - a->xyz[1];
		d1[2] = c->xyz[2] - a->xyz[2];

		n[0] = d1[1] * d0[2] - d1[2] * d0[1];
		n[1] = d1[2] * d0[0] - d1[0] * d0[2];
		n[2] = d1[0] * d0[1] - d1[1] * d0[0];

		f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
		//idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );

		n.x *= f;
		n.y *= f;
		n.z *= f;

		planes[j].SetNormal( n );
		planes[j].FitThroughPoint( a->xyz );
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DeriveTangents
|
|
|
|
Derives the normal and orthogonal tangent vectors for the triangle vertices.
|
|
For each vertex the normal and tangent vectors are derived from all triangles
|
|
using the vertex which results in smooth tangents across the mesh.
|
|
In the process the triangle planes are calculated as well.
|
|
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::DeriveTangents( idPlane *planes, idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	// Walks the index list one triangle at a time. For each triangle it
	// writes the triangle plane to the next entry of planes[] and then either
	// assigns (first visit) or accumulates (later visits) the triangle's
	// normal and two tangents on each of its three vertices.

	// one flag per vertex: has this vertex been written by an earlier triangle?
	bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
	memset( used, 0, numVerts * sizeof( used[0] ) );

	idPlane *triPlane = planes;
	for ( int tri = 0; tri < numIndexes; tri += 3, triPlane++ ) {
		const int vertIndex[3] = { indexes[tri + 0], indexes[tri + 1], indexes[tri + 2] };
		idDrawVert *corner[3] = { verts + vertIndex[0], verts + vertIndex[1], verts + vertIndex[2] };
		idDrawVert *a = corner[0];
		idDrawVert *b = corner[1];
		idDrawVert *c = corner[2];

		// edge deltas from corner a: elements 0-2 are xyz, 3-4 are st
		float edgeAB[5], edgeAC[5];
		for ( int k = 0; k < 3; k++ ) {
			edgeAB[k] = b->xyz[k] - a->xyz[k];
			edgeAC[k] = c->xyz[k] - a->xyz[k];
		}
		edgeAB[3] = b->st[0] - a->st[0];
		edgeAB[4] = b->st[1] - a->st[1];
		edgeAC[3] = c->st[0] - a->st[0];
		edgeAC[4] = c->st[1] - a->st[1];

		// normal
		idVec3 n;
		n[0] = edgeAC[1] * edgeAB[2] - edgeAC[2] * edgeAB[1];
		n[1] = edgeAC[2] * edgeAB[0] - edgeAC[0] * edgeAB[2];
		n[2] = edgeAC[0] * edgeAB[1] - edgeAC[1] * edgeAB[0];
		float nScale = n.x * n.x + n.y * n.y + n.z * n.z;

		// the sign of the texture-space area decides the sign of both tangents
		const float area = edgeAB[3] * edgeAC[4] - edgeAB[4] * edgeAC[3];

		// first and second tangent
		idVec3 t0, t1;
		for ( int k = 0; k < 3; k++ ) {
			t0[k] = edgeAB[k] * edgeAC[4] - edgeAB[4] * edgeAC[k];
			t1[k] = edgeAB[3] * edgeAC[k] - edgeAB[k] * edgeAC[3];
		}
		float t0Scale = t0.x * t0.x + t0.y * t0.y + t0.z * t0.z;
		float t1Scale = t1.x * t1.x + t1.y * t1.y + t1.z * t1.z;

		// replace the three squared lengths with their inverse square roots in one call
		FastScalarInvSqrt_x3( &nScale, &t0Scale, &t1Scale );
#ifdef PPC_INTRINSICS
		t0Scale = __fsel( area, t0Scale, -t0Scale );
		t1Scale = __fsel( area, t1Scale, -t1Scale );
#else
		t0Scale = ( area < 0.0f ) ? -t0Scale : t0Scale;
		t1Scale = ( area < 0.0f ) ? -t1Scale : t1Scale;
#endif

		n.x *= nScale;
		n.y *= nScale;
		n.z *= nScale;

		t0.x *= t0Scale;
		t0.y *= t0Scale;
		t0.z *= t0Scale;

		t1.x *= t1Scale;
		t1.y *= t1Scale;
		t1.z *= t1Scale;

		// triangle plane through corner a
		triPlane->SetNormal( n );
		triPlane->FitThroughPoint( a->xyz );

		// corners are visited in a, b, c order so a repeated index within one
		// triangle is assigned first and accumulated afterwards
		for ( int k = 0; k < 3; k++ ) {
			idDrawVert *dv = corner[k];
			if ( !used[vertIndex[k]] ) {
				dv->normal = n;
				dv->tangents[0] = t0;
				dv->tangents[1] = t1;
				used[vertIndex[k]] = true;
				continue;
			}
			dv->normal += n;
			dv->tangents[0] += t0;
			dv->tangents[1] += t1;
		}
	}
}
|
|
|
|
|
|
#ifdef DERIVE_UNSMOOTH_DRAWVERT_ALIGNED
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DeriveUnsmoothedTangents
|
|
|
|
Derives the normal and orthogonal tangent vectors for the triangle vertices.
|
|
For each vertex the normal and tangent vectors are derived from a single dominant triangle.
|
|
============
|
|
*/
|
|
#define DERIVE_UNSMOOTHED_BITANGENT
|
|
void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
|
|
|
|
int i;
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
// drawverts aligned
|
|
assert( IS_16BYTE_ALIGNED( verts[0] ) );
|
|
|
|
vector float vecVertA, vecVertB, vecVertC;
|
|
vector float vecVertA2, vecVertB2, vecVertC2;
|
|
vector float vecVertA3, vecVertB3, vecVertC3;
|
|
vector float vecVertA4, vecVertB4, vecVertC4;
|
|
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8;
|
|
vector float vecS0, vecS1, vecS2;
|
|
vector float vecS0_2, vecS1_2, vecS2_2;
|
|
vector float vecS0_3, vecS1_3, vecS2_3;
|
|
vector float vecS0_4, vecS1_4, vecS2_4;
|
|
|
|
vector float vecD1, vecD2, vecD3, vecD4, vecD5, vecD6;
|
|
vector float vecD7, vecD8, vecD9, vecD10, vecD11, vecD12;
|
|
vector float vecT1, vecT1_2, vecT1_3, vecT1_4, vecT2, vecT2_2, vecT2_3, vecT2_4;
|
|
vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
|
|
vector float vecN, vecN2, vecN3, vecN4;
|
|
|
|
vector unsigned char vecPermN0 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
|
|
vector unsigned char vecPermN1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
|
|
vector unsigned char vecPermT0 = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3);
|
|
vector unsigned char vecPermT1 = (vector unsigned char)(8,9,10,11,8,9,10,11,8,9,10,11,8,9,10,11);
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
vector float vecNegOne = (vector float)(-1.0);
|
|
|
|
vector float vecStore1, vecStore2, vecStore3;
|
|
vector unsigned char vecPermFirstThreeLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
vector unsigned char vecPermStoreSecond = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
|
|
vector unsigned char vecPermLeadAndThree = (vector unsigned char)(0,1,2,3,16,17,18,19,20,21,22,23,24,25,26,27);
|
|
vector unsigned char vecPermStore2 = (vector unsigned char)(4,5,6,7,8,9,10,11,24,25,26,27,28,29,30,31);
|
|
vector unsigned char vecPermStore3 = (vector unsigned char)(4,5,6,7,8,9,10,11,16,17,18,19,20,21,22,23);
|
|
vector unsigned char vecPermStore4 = (vector unsigned char)(8,9,10,11,16,17,18,19,20,21,22,23,24,25,26,27);
|
|
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
|
|
vector float vecLd1, vecLd2, vecLd3;
|
|
vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3, vecPerm4;
|
|
|
|
float *normalPtr = verts[0].normal.ToFloatPtr();
|
|
float *xyzPtr = verts[0].xyz.ToFloatPtr();
|
|
|
|
vector float vecFirstHalf, vecSecondHalf;
|
|
vector float vecFirstHalf2, vecSecondHalf2;
|
|
vector float vecFirstHalf3, vecSecondHalf3;
|
|
vector float vecFirstHalf4, vecSecondHalf4;
|
|
|
|
for ( i = 0; i+3 < numVerts; i+=4 ) {
|
|
int bOffset1, bOffset2, bOffset3, bOffset4;
|
|
int cOffset1, cOffset2, cOffset3, cOffset4;
|
|
|
|
bOffset1 = dominantTris[i].v2;
|
|
cOffset1 = dominantTris[i].v3;
|
|
bOffset2 = dominantTris[i+1].v2;
|
|
cOffset2 = dominantTris[i+1].v3;
|
|
bOffset3 = dominantTris[i+2].v2;
|
|
cOffset3 = dominantTris[i+2].v3;
|
|
bOffset4 = dominantTris[i+3].v2;
|
|
cOffset4 = dominantTris[i+3].v3;
|
|
|
|
vecPerm0 = vec_lvsl( 0, xyzPtr + ( i * DRAWVERT_OFFSET ) );
|
|
v0 = vec_ld( 0, xyzPtr + (i * DRAWVERT_OFFSET ) );
|
|
v1 = vec_ld( 16, xyzPtr + (i * DRAWVERT_OFFSET ) );
|
|
vecVertA = vec_perm( v0, v1, vecPerm0 );
|
|
|
|
vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset1 * DRAWVERT_OFFSET ) );
|
|
v2 = vec_ld( 0, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
|
|
v3 = vec_ld( 16, xyzPtr + ( bOffset1 * DRAWVERT_OFFSET ) );
|
|
vecVertB = vec_perm( v2, v3, vecPerm1 );
|
|
|
|
vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
|
|
v4 = vec_ld( 0, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
|
|
v5 = vec_ld( 16, xyzPtr + ( cOffset1 * DRAWVERT_OFFSET ) );
|
|
vecVertC = vec_perm( v4, v5, vecPerm2 );
|
|
|
|
// put remainder into v2
|
|
v1 = vec_perm( v1, v1, vecPerm0 );
|
|
v3 = vec_perm( v3, v3, vecPerm1 );
|
|
v5 = vec_perm( v5, v5, vecPerm2 );
|
|
|
|
v1 = vec_mergeh( v1, v5 );
|
|
v2 = vec_mergeh( v3, zeroVector );
|
|
v2 = vec_mergeh( v1, v2 );
|
|
v2 = vec_perm( v2, v2, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
|
|
|
|
// load second one
|
|
vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
v0 = vec_ld( 0, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
v1 = vec_ld( 16, xyzPtr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
vecVertA2 = vec_perm( v0, v1, vecPerm0 );
|
|
|
|
vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset2 * DRAWVERT_OFFSET ) );
|
|
v3 = vec_ld( 0, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
|
|
v4 = vec_ld( 16, xyzPtr + ( bOffset2 * DRAWVERT_OFFSET ) );
|
|
vecVertB2 = vec_perm( v3, v4, vecPerm3 );
|
|
|
|
vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
|
|
v5 = vec_ld( 0, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
|
|
v6 = vec_ld( 16, xyzPtr + ( cOffset2 * DRAWVERT_OFFSET ) );
|
|
vecVertC2 = vec_perm( v5, v6, vecPerm4 );
|
|
|
|
// put remainder into v3
|
|
v1 = vec_perm( v1, v1, vecPerm0 );
|
|
v4 = vec_perm( v4, v4, vecPerm3 );
|
|
v5 = vec_perm( v6, v6, vecPerm4 );
|
|
|
|
v1 = vec_mergeh( v1, v5 );
|
|
v3 = vec_mergeh( v4, zeroVector );
|
|
v3 = vec_mergeh( v1, v3 );
|
|
v3 = vec_perm( v3, v3, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
|
|
|
|
// load third one
|
|
vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
v0 = vec_ld( 0, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
v1 = vec_ld( 16, xyzPtr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
vecVertA3 = vec_perm( v0, v1, vecPerm0 );
|
|
|
|
vecPerm1 = vec_lvsl( 0, xyzPtr + (bOffset3 * DRAWVERT_OFFSET ) );
|
|
v4 = vec_ld( 0, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
|
|
v5 = vec_ld( 16, xyzPtr + ( bOffset3 * DRAWVERT_OFFSET ) );
|
|
vecVertB3 = vec_perm( v4, v5, vecPerm1 );
|
|
|
|
vecPerm2 = vec_lvsl( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
|
|
v6 = vec_ld( 0, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
|
|
v7 = vec_ld( 16, xyzPtr + ( cOffset3 * DRAWVERT_OFFSET ) );
|
|
vecVertC3 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// put remainder into v4
|
|
v1 = vec_perm( v1, v1, vecPerm0 );
|
|
v5 = vec_perm( v5, v5, vecPerm1 );
|
|
v7 = vec_perm( v7, v7, vecPerm2 );
|
|
|
|
v1 = vec_mergeh( v1, v7 );
|
|
v4 = vec_mergeh( v5, zeroVector );
|
|
v4 = vec_mergeh( v1, v4 );
|
|
v4 = vec_perm( v4, v4, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
|
|
|
|
// load fourth one
|
|
vecPerm0 = vec_lvsl( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
v0 = vec_ld( 0, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
v1 = vec_ld( 16, xyzPtr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vecVertA4 = vec_perm( v0, v1, vecPerm0 );
|
|
|
|
vecPerm3 = vec_lvsl( 0, xyzPtr + (bOffset4 * DRAWVERT_OFFSET ) );
|
|
v5 = vec_ld( 0, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
|
|
v6 = vec_ld( 16, xyzPtr + ( bOffset4 * DRAWVERT_OFFSET ) );
|
|
vecVertB4 = vec_perm( v5, v6, vecPerm3 );
|
|
|
|
vecPerm4 = vec_lvsl( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
|
|
v7 = vec_ld( 0, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
|
|
v8 = vec_ld( 16, xyzPtr + ( cOffset4 * DRAWVERT_OFFSET ) );
|
|
vecVertC4 = vec_perm( v7, v8, vecPerm4 );
|
|
|
|
// put remainder into v5
|
|
v1 = vec_perm( v1, v1, vecPerm0 );
|
|
v6 = vec_perm( v6, v6, vecPerm3 );
|
|
v8 = vec_perm( v8, v8, vecPerm4 );
|
|
|
|
v1 = vec_mergeh( v1, v8 );
|
|
v5 = vec_mergeh( v6, zeroVector );
|
|
v5 = vec_mergeh( v1, v5 );
|
|
v5 = vec_perm( v5, v5, (vector unsigned char)(4,5,6,7,0,1,2,3,8,9,10,11,0,1,2,3) );
|
|
|
|
// remainder vectors look like b->st[1], a->st[1], c->st[1], a->st[1]
|
|
|
|
//vecD1 now holds d0, d1, d2, d3
|
|
vecD1 = vec_sub( vecVertB, vecVertA );
|
|
vecD4 = vec_sub( vecVertB2, vecVertA2 );
|
|
vecD7 = vec_sub( vecVertB3, vecVertA3 );
|
|
vecD10 = vec_sub( vecVertB4, vecVertA4 );
|
|
|
|
// vecD2 now holds d5, d6, d7, d8
|
|
vecD2 = vec_sub( vecVertC, vecVertA );
|
|
vecD5 = vec_sub( vecVertC2, vecVertA2 );
|
|
vecD8 = vec_sub( vecVertC3, vecVertA3 );
|
|
vecD11 = vec_sub( vecVertC4, vecVertA4 );
|
|
|
|
// vecD3 now holds d4, crap, d9, crap
|
|
vecD3 = vec_sub( v2, vec_sld( v2, v2, 4 ) );
|
|
vecD6 = vec_sub( v3, vec_sld( v3, v3, 4 ) );
|
|
vecD9 = vec_sub( v4, vec_sld( v4, v4, 4 ) );
|
|
vecD12 = vec_sub( v5, vec_sld( v5, v5, 4 ) );
|
|
|
|
// get permute vectors for loading from dt
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i].normalizationScale[0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+1].normalizationScale[0] ), (vector unsigned char)(1) );
|
|
vecPerm3 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+2].normalizationScale[0] ), (vector unsigned char)(1) );
|
|
vecPerm4 = vec_add( vec_lvsl( -1, (int*) &dominantTris[i+3].normalizationScale[0] ), (vector unsigned char)(1) );
|
|
|
|
// load S values from dominantTris
|
|
v0 = vec_ld( 0, &dominantTris[i].normalizationScale[0] );
|
|
v1 = vec_ld( 11, &dominantTris[i].normalizationScale[0] );
|
|
v2 = vec_ld( 0, &dominantTris[i+1].normalizationScale[0] );
|
|
v3 = vec_ld( 11, &dominantTris[i+1].normalizationScale[0] );
|
|
v4 = vec_ld( 0, &dominantTris[i+2].normalizationScale[0] );
|
|
v5 = vec_ld( 11, &dominantTris[i+2].normalizationScale[0] );
|
|
v6 = vec_ld( 0, &dominantTris[i+3].normalizationScale[0] );
|
|
v7 = vec_ld( 11, &dominantTris[i+3].normalizationScale[0] );
|
|
|
|
v0 = vec_perm( v0, v1, vecPerm1 );
|
|
v2 = vec_perm( v2, v3, vecPerm2 );
|
|
v4 = vec_perm( v4, v5, vecPerm3 );
|
|
v6 = vec_perm( v6, v7, vecPerm4 );
|
|
|
|
vecS0 = vec_splat( v0, 0 );
|
|
vecS1 = vec_splat( v0, 1 );
|
|
vecS2 = vec_splat( v0, 2 );
|
|
|
|
vecS0_2 = vec_splat( v2, 0);
|
|
vecS1_2 = vec_splat( v2, 1 );
|
|
vecS2_2 = vec_splat( v2, 2 );
|
|
|
|
vecS0_3 = vec_splat( v4, 0 );
|
|
vecS1_3 = vec_splat( v4, 1 );
|
|
vecS2_3 = vec_splat( v4, 2 );
|
|
|
|
vecS0_4 = vec_splat( v6, 0 );
|
|
vecS1_4 = vec_splat( v6, 1 );
|
|
vecS2_4 = vec_splat( v6, 2 );
|
|
|
|
// do calculation
|
|
vecWork1 = vec_perm( vecD2, vecD2, vecPermN1 );
|
|
vecWork2 = vec_perm( vecD1, vecD1, vecPermN0 );
|
|
vecWork3 = vec_perm( vecD5, vecD5, vecPermN1 );
|
|
vecWork4 = vec_perm( vecD4, vecD4, vecPermN0 );
|
|
vecWork5 = vec_perm( vecD8, vecD8, vecPermN1 );
|
|
vecWork6 = vec_perm( vecD7, vecD7, vecPermN0 );
|
|
vecWork7 = vec_perm( vecD11, vecD11, vecPermN1 );
|
|
vecWork8 = vec_perm( vecD10, vecD10, vecPermN0 );
|
|
|
|
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
|
|
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
|
|
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
|
|
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
|
|
|
|
vecWork1 = vec_perm( vecD2, vecD2, vecPermN0 );
|
|
vecWork2 = vec_perm( vecD1, vecD1, vecPermN1 );
|
|
vecWork3 = vec_perm( vecD5, vecD5, vecPermN0 );
|
|
vecWork4 = vec_perm( vecD4, vecD4, vecPermN1 );
|
|
vecWork5 = vec_perm( vecD8, vecD8, vecPermN0 );
|
|
vecWork6 = vec_perm( vecD7, vecD7, vecPermN1 );
|
|
vecWork7 = vec_perm( vecD11, vecD11, vecPermN0 );
|
|
vecWork8 = vec_perm( vecD10, vecD10, vecPermN1 );
|
|
|
|
vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
|
|
vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
|
|
vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
|
|
vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
|
|
|
|
|
|
// calculate N values
|
|
vecN = vec_madd( vecS2, vecSecondHalf, zeroVector );
|
|
vecN2 = vec_madd( vecS2_2, vecSecondHalf2, zeroVector );
|
|
vecN3 = vec_madd( vecS2_3, vecSecondHalf3, zeroVector );
|
|
vecN4 = vec_madd( vecS2_4, vecSecondHalf4, zeroVector );
|
|
|
|
// calculate both halves of the calculation for t
|
|
vecWork1 = vecD1;
|
|
vecWork2 = vec_perm( vecD3, vecD3, vecPermT1 );
|
|
vecWork3 = vecD4;
|
|
vecWork4 = vec_perm( vecD6, vecD6, vecPermT1 );
|
|
vecWork5 = vecD7;
|
|
vecWork6 = vec_perm( vecD9, vecD9, vecPermT1 );
|
|
vecWork7 = vecD10;
|
|
vecWork8 = vec_perm( vecD12, vecD12, vecPermT1 );
|
|
|
|
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
|
|
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
|
|
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
|
|
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
|
|
|
|
vecWork1 = vecD2;
|
|
vecWork2 = vec_perm( vecD3, vecD3, vecPermT0 );
|
|
vecWork3 = vecD5;
|
|
vecWork4 = vec_perm( vecD6, vecD6, vecPermT0 );
|
|
vecWork5 = vecD8;
|
|
vecWork6 = vec_perm( vecD9, vecD9, vecPermT0 );
|
|
vecWork7 = vecD11;
|
|
vecWork8 = vec_perm( vecD12, vecD12, vecPermT0 );
|
|
|
|
vecSecondHalf = vec_nmsub( vecWork1, vecWork2, vecFirstHalf );
|
|
vecSecondHalf2 = vec_nmsub( vecWork3, vecWork4, vecFirstHalf2 );
|
|
vecSecondHalf3 = vec_nmsub( vecWork5, vecWork6, vecFirstHalf3 );
|
|
vecSecondHalf4 = vec_nmsub( vecWork7, vecWork8, vecFirstHalf4 );
|
|
|
|
// calculate T values
|
|
vecT1 = vec_madd( vecS0, vecSecondHalf, zeroVector );
|
|
vecT1_2 = vec_madd( vecS0_2, vecSecondHalf2, zeroVector );
|
|
vecT1_3 = vec_madd( vecS0_3, vecSecondHalf3, zeroVector );
|
|
vecT1_4 = vec_madd( vecS0_4, vecSecondHalf4, zeroVector );
|
|
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
vecWork1 = vecD1;
|
|
vecWork2 = vec_perm( vecD2, vecD2, vecPermT2 );
|
|
vecWork3 = vecD4;
|
|
vecWork4 = vec_perm( vecD5, vecD5, vecPermT2 );
|
|
vecWork5 = vecD7;
|
|
vecWork6 = vec_perm( vecD8, vecD8, vecPermT2 );
|
|
vecWork7 = vecD10;
|
|
vecWork8 = vec_perm( vecD11, vecD11, vecPermT2 );
|
|
|
|
vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
|
|
vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
|
|
vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
|
|
vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
|
|
|
|
vecWork1 = vec_perm( vecD1, vecD1, vecPermT2 );
|
|
vecWork2 = vecD2;
|
|
vecWork3 = vec_perm( vecD4, vecD4, vecPermT2 );
|
|
vecWork4 = vecD5;
|
|
vecWork5 = vec_perm( vecD7, vecD7, vecPermT2 );
|
|
vecWork6 = vecD8;
|
|
vecWork7 = vec_perm( vecD10, vecD10, vecPermT2 );
|
|
vecWork8 = vecD11;
|
|
|
|
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
|
|
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
|
|
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
|
|
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
|
|
|
|
#else
|
|
vecWork1 = vec_perm( vecN, vecN, vecPermN1 );
|
|
vecWork2 = vec_perm( vecT1, vecT1, vecPermN0 );
|
|
vecWork3 = vec_perm( vecN2, vecN2, vecPermN1 );
|
|
vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN0 );
|
|
vecWork5 = vec_perm( vecN3, vecN3, vecPermN1 );
|
|
vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN0 );
|
|
vecWork7 = vec_perm( vecN4, vecN4, vecPermN1 );
|
|
vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN0 );
|
|
|
|
vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
|
|
vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
|
|
vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
|
|
vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
|
|
|
|
vecWork1 = vec_perm( vecN, vecN, vecPermN0 );
|
|
vecWork2 = vec_perm( vecT1, vecT1, vecPermN1 );
|
|
vecWork3 = vec_perm( vecN2, vecN2, vecPermN0 );
|
|
vecWork4 = vec_perm( vecT1_2, vecT1_2, vecPermN1 );
|
|
vecWork5 = vec_perm( vecN3, vecN3, vecPermN0 );
|
|
vecWork6 = vec_perm( vecT1_3, vecT1_3, vecPermN1 );
|
|
vecWork7 = vec_perm( vecN4, vecN4, vecPermN0 );
|
|
vecWork8 = vec_perm( vecT1_4, vecT1_4, vecPermN1 );
|
|
|
|
vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
|
|
vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
|
|
vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
|
|
vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );
|
|
#endif
|
|
// finish the calculation
|
|
vecSecondHalf = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
|
|
vecSecondHalf2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
|
|
vecSecondHalf3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
|
|
vecSecondHalf4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );
|
|
|
|
vecT2 = vec_madd( vecS1, vecSecondHalf, zeroVector );
|
|
vecT2_2 = vec_madd( vecS1_2, vecSecondHalf2, zeroVector );
|
|
vecT2_3 = vec_madd( vecS1_3, vecSecondHalf3, zeroVector );
|
|
vecT2_4 = vec_madd( vecS1_4, vecSecondHalf4, zeroVector );
|
|
|
|
// Store results
|
|
|
|
// read values that we need to preserve
|
|
vecLd1 = vec_ld( 0, normalPtr + ( i * DRAWVERT_OFFSET ) );
|
|
vecLd2 = vec_ld( 32, normalPtr + ( i * DRAWVERT_OFFSET ) );
|
|
|
|
//generate vectors to store
|
|
vecStore1 = vec_perm( vecLd1, vecN, vecPermLeadAndThree );
|
|
vecStore2 = vec_perm( vecT1, vecT2, vecPermFirstThreeLast );
|
|
vecStore3 = vec_perm( vecT2, vecLd2, vecPermStore2 );
|
|
|
|
// store out results
|
|
ALIGNED_STORE3( normalPtr + ( i * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
|
|
|
|
// read values that we need to preserve
|
|
vecLd3 = vec_ld( 32, normalPtr + ( (i+1) * DRAWVERT_OFFSET ));
|
|
|
|
// generate vectors to store
|
|
vecStore1 = vec_perm( vecN2, vecT1_2, vecPermFirstThreeLast );
|
|
vecStore2 = vec_perm( vecT1_2, vecT2_2, vecPermStoreSecond );
|
|
vecStore3 = vec_perm( vecT2_2, vecLd3, (vector unsigned char)(8,9,10,11,20,21,22,23,24,25,26,27,28,29,30,31) );
|
|
|
|
// instead of doing permute, shift it where it needs to be and use vec_ste
|
|
// store out vectors
|
|
ALIGNED_STORE3( normalPtr + ((i+1) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
|
|
|
|
// read values that we need to preserve
|
|
vecLd1 = vec_ld( 0, normalPtr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
|
|
// generate vectors to store
|
|
vecStore1 = vec_perm( vecLd1, vecN3, vecPermFirstThreeLast );
|
|
vecStore2 = vec_perm( vecN3, vecT1_3, vecPermStore3 );
|
|
vecStore3 = vec_perm( vecT1_3, vecT2_3, vecPermStore4 );
|
|
|
|
// store out vectors
|
|
ALIGNED_STORE3( normalPtr + ((i+2) * DRAWVERT_OFFSET), vecStore1, vecStore2, vecStore3 );
|
|
|
|
// read values that we need to preserve
|
|
vecLd2 = vec_ld( 0, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vecLd3 = vec_ld( 32, normalPtr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
|
|
// generate vectors to store
|
|
vecStore1 = vec_perm( vecLd2, vecN4, vecPermHalves );
|
|
vecStore2 = vec_perm( vecN4, vecT1_4, vecPermStore4 );
|
|
vecStore3 = vec_perm( vecT2_4, vecLd3, vecPermFirstThreeLast );
|
|
|
|
// store out vectors
|
|
ALIGNED_STORE3( normalPtr + ((i+3) * DRAWVERT_OFFSET ), vecStore1, vecStore2, vecStore3 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
idDrawVert *a, *b, *c;
|
|
float d0, d1, d2, d3, d4;
|
|
float d5, d6, d7, d8, d9;
|
|
float s0, s1, s2;
|
|
float n0, n1, n2;
|
|
float t0, t1, t2;
|
|
float t3, t4, t5;
|
|
|
|
const dominantTri_s &dt = dominantTris[i];
|
|
|
|
a = verts + i;
|
|
b = verts + dt.v2;
|
|
c = verts + dt.v3;
|
|
|
|
d0 = b->xyz[0] - a->xyz[0];
|
|
d1 = b->xyz[1] - a->xyz[1];
|
|
d2 = b->xyz[2] - a->xyz[2];
|
|
d3 = b->st[0] - a->st[0];
|
|
|
|
d4 = b->st[1] - a->st[1];
|
|
|
|
d5 = c->xyz[0] - a->xyz[0];
|
|
d6 = c->xyz[1] - a->xyz[1];
|
|
d7 = c->xyz[2] - a->xyz[2];
|
|
d8 = c->st[0] - a->st[0];
|
|
|
|
d9 = c->st[1] - a->st[1];
|
|
|
|
s0 = dt.normalizationScale[0];
|
|
s1 = dt.normalizationScale[1];
|
|
s2 = dt.normalizationScale[2];
|
|
|
|
n0 = s2 * ( d6 * d2 - d7 * d1 );
|
|
n1 = s2 * ( d7 * d0 - d5 * d2 );
|
|
n2 = s2 * ( d5 * d1 - d6 * d0 );
|
|
|
|
t0 = s0 * ( d0 * d9 - d4 * d5 );
|
|
t1 = s0 * ( d1 * d9 - d4 * d6 );
|
|
t2 = s0 * ( d2 * d9 - d4 * d7 );
|
|
|
|
#ifndef DERIVE_UNSMOOTHED_BITANGENT
|
|
t3 = s1 * ( d3 * d5 - d0 * d8 );
|
|
t4 = s1 * ( d3 * d6 - d1 * d8 );
|
|
t5 = s1 * ( d3 * d7 - d2 * d8 );
|
|
#else
|
|
t3 = s1 * ( n2 * t1 - n1 * t2 );
|
|
t4 = s1 * ( n0 * t2 - n2 * t0 );
|
|
t5 = s1 * ( n1 * t0 - n0 * t1 );
|
|
#endif
|
|
|
|
a->normal[0] = n0;
|
|
a->normal[1] = n1;
|
|
a->normal[2] = n2;
|
|
|
|
a->tangents[0][0] = t0;
|
|
a->tangents[0][1] = t1;
|
|
a->tangents[0][2] = t2;
|
|
|
|
a->tangents[1][0] = t3;
|
|
a->tangents[1][1] = t4;
|
|
a->tangents[1][2] = t5;
|
|
}
|
|
}
|
|
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DeriveUnsmoothedTangents
|
|
|
|
Derives the normal and orthogonal tangent vectors for the triangle vertices.
|
|
For each vertex the normal and tangent vectors are derived from a single dominant triangle.
|
|
============
|
|
*/
|
|
#define DERIVE_UNSMOOTHED_BITANGENT
|
|
|
|
void VPCALL idSIMD_AltiVec::DeriveUnsmoothedTangents( idDrawVert *verts, const dominantTri_s *dominantTris, const int numVerts ) {
	// Plain scalar implementation: vertex number n takes its normal and both
	// tangent vectors from the one triangle described by dominantTris[n].
	for ( int vertNum = 0; vertNum < numVerts; vertNum++ ) {
		const dominantTri_s &tri = dominantTris[vertNum];

		// the vertex being written and the two other corners of its triangle
		idDrawVert *self = &verts[vertNum];
		idDrawVert *second = &verts[tri.v2];
		idDrawVert *third = &verts[tri.v3];

		// edge towards the second corner: position delta and st[1] delta
		const float e1x = second->xyz[0] - self->xyz[0];
		const float e1y = second->xyz[1] - self->xyz[1];
		const float e1z = second->xyz[2] - self->xyz[2];
		const float e1t = second->st[1] - self->st[1];

		// edge towards the third corner: position delta and st[1] delta
		const float e2x = third->xyz[0] - self->xyz[0];
		const float e2y = third->xyz[1] - self->xyz[1];
		const float e2z = third->xyz[2] - self->xyz[2];
		const float e2t = third->st[1] - self->st[1];

		// per-vertex scale factors supplied with the dominant triangle
		const float tangentScale = tri.normalizationScale[0];
		const float bitangentScale = tri.normalizationScale[1];
		const float normalScale = tri.normalizationScale[2];

		// normal = normalScale * ( edge2 x edge1 )
		const float nx = normalScale * ( e2y * e1z - e2z * e1y );
		const float ny = normalScale * ( e2z * e1x - e2x * e1z );
		const float nz = normalScale * ( e2x * e1y - e2y * e1x );

		// tangents[0] = tangentScale * ( edge1 * e2t - e1t * edge2 )
		const float tx = tangentScale * ( e1x * e2t - e1t * e2x );
		const float ty = tangentScale * ( e1y * e2t - e1t * e2y );
		const float tz = tangentScale * ( e1z * e2t - e1t * e2z );

#ifndef DERIVE_UNSMOOTHED_BITANGENT
		// tangents[1] straight from the edges; only this variant needs the st[0] deltas
		const float e1s = second->st[0] - self->st[0];
		const float e2s = third->st[0] - self->st[0];
		const float bx = bitangentScale * ( e1s * e2x - e1x * e2s );
		const float by = bitangentScale * ( e1s * e2y - e1y * e2s );
		const float bz = bitangentScale * ( e1s * e2z - e1z * e2s );
#else
		// tangents[1] = bitangentScale * ( tangents[0] x normal )
		const float bx = bitangentScale * ( nz * ty - ny * tz );
		const float by = bitangentScale * ( nx * tz - nz * tx );
		const float bz = bitangentScale * ( ny * tx - nx * ty );
#endif

		// every input has been read by now, so writing the vertex is safe
		self->normal[0] = nx;
		self->normal[1] = ny;
		self->normal[2] = nz;

		self->tangents[0][0] = tx;
		self->tangents[0][1] = ty;
		self->tangents[0][2] = tz;

		self->tangents[1][0] = bx;
		self->tangents[1][1] = by;
		self->tangents[1][2] = bz;
	}
}
|
|
#endif /* DERIVE_UNSMOOTH_DRAWVERT_ALIGNED */
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::NormalizeTangents
|
|
|
|
Normalizes each vertex normal and projects and normalizes the
|
|
tangent vectors onto the plane orthogonal to the vertex normal.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::NormalizeTangents( idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
float *addr = verts[0].normal.ToFloatPtr();
|
|
float *tAddr = verts[0].tangents[0].ToFloatPtr();
|
|
|
|
// v0 through v3 maintain originally loaded values so we don't take
|
|
// as much hit for unaligned stores
|
|
vector float v0, v1, v2, v3;
|
|
// v5 through v8 are the "working" values of the vectors
|
|
vector float v5, v6, v7, v8;
|
|
// working values
|
|
vector float vec1T0, vec1T1, vec2T0, vec2T1, vec3T0, vec3T1, vec4T0, vec4T1;
|
|
vector float vecSum, vecTSum1, vecTSum2, tempSum, tempSum2, tempSum3;
|
|
vector float vecF, vecF2;
|
|
vector float vecTemp, vecTemp2, vecTemp3, vecTemp4;
|
|
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
|
|
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
vector unsigned char vecPermSplatFirstWithZero = (vector unsigned char)(0,1,2,3,0,1,2,3,0,1,2,3,16,17,18,19);
|
|
vector unsigned char vecPerm0, vecPerm1, vecPerm2, vecPerm3;
|
|
vector unsigned char storePerm0, storePerm1, storePerm2, storePerm3;
|
|
|
|
vector float vecTan11, vecTan12, vecTan13, vecTan21, vecTan22, vecTan23;
|
|
vector float vecTan31, vecTan32, vecTan33, vecTan41, vecTan42, vecTan43;
|
|
|
|
vector unsigned char vec1T0Perm, vec1T1Perm, vec2T0Perm, vec2T1Perm, vec3T0Perm, vec3T1Perm, vec4T0Perm, vec4T1Perm;
|
|
vector unsigned char storeT11, storeT12, storeT21, storeT22, storeT31, storeT32;
|
|
vector unsigned char storeT41, storeT42;
|
|
|
|
int i = 0;
|
|
|
|
if ( i+3 < numVerts ) {
|
|
// for loading normal from idDrawVert
|
|
vecPerm0 = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
|
|
vecPerm1 = vec_add( vec_lvsl( -1, addr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, addr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
|
|
vecPerm3 = vec_add( vec_lvsl( -1, addr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
|
|
|
|
// for loading tangents from idDrawVert
|
|
vec1T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
|
|
vec1T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 0 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
|
|
vec2T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
|
|
vec2T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 1 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
|
|
vec3T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
|
|
vec3T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 2 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
|
|
vec4T0Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) ), (vector unsigned char)(1) );
|
|
vec4T1Perm = vec_add( vec_lvsl( -1, tAddr + ( 3 * DRAWVERT_OFFSET ) + 3 ), (vector unsigned char)(1) );
|
|
|
|
// generate permute vectors to store normals
|
|
storePerm0 = vec_lvsr( 0, addr );
|
|
storePerm1 = vec_lvsr( 0, addr + ( 1 * DRAWVERT_OFFSET ) );
|
|
storePerm2 = vec_lvsr( 0, addr + ( 2 * DRAWVERT_OFFSET ) );
|
|
storePerm3 = vec_lvsr( 0, addr + ( 3 * DRAWVERT_OFFSET ) );
|
|
|
|
// generate permute vectors to store tangents
|
|
storeT11 = vec_lvsr( 0, tAddr + ( 0 * DRAWVERT_OFFSET ) );
|
|
storeT12 = vec_lvsr( 12, tAddr + ( 0 * DRAWVERT_OFFSET ) );
|
|
|
|
storeT21 = vec_lvsr( 0, tAddr + ( 1 * DRAWVERT_OFFSET ) );
|
|
storeT22 = vec_lvsr( 12, tAddr + ( 1 * DRAWVERT_OFFSET ) );
|
|
|
|
storeT31 = vec_lvsr( 0, tAddr + ( 2 * DRAWVERT_OFFSET ) );
|
|
storeT32 = vec_lvsr( 12, tAddr + ( 2 * DRAWVERT_OFFSET ) );
|
|
|
|
storeT41 = vec_lvsr( 0, tAddr + ( 3 * DRAWVERT_OFFSET ) );
|
|
storeT42 = vec_lvsr( 12, tAddr + ( 3 * DRAWVERT_OFFSET ) );
|
|
}
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
|
|
// load normals
|
|
vector float vecNormal11 = vec_ld( 0, addr + ( i * DRAWVERT_OFFSET ) );
|
|
vector float vecNormal12 = vec_ld( 15, addr + ( i * DRAWVERT_OFFSET ) );
|
|
v0 = vec_perm( vecNormal11, vecNormal12, vecPerm0 );
|
|
|
|
vector float vecNormal21 = vec_ld( 0, addr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
vector float vecNormal22 = vec_ld( 15, addr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
v1 = vec_perm( vecNormal21, vecNormal22, vecPerm1 );
|
|
|
|
vector float vecNormal31 = vec_ld( 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
vector float vecNormal32 = vec_ld( 15, addr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
v2 = vec_perm( vecNormal31, vecNormal32, vecPerm2 );
|
|
|
|
vector float vecNormal41 = vec_ld( 0, addr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vector float vecNormal42 = vec_ld( 15, addr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
v3 = vec_perm( vecNormal41, vecNormal42, vecPerm3 );
|
|
|
|
// zero out the last element of each useless vector
|
|
v0 = vec_perm( v0, zeroVector, vecPermLast );
|
|
v1 = vec_perm( v1, zeroVector, vecPermLast );
|
|
v2 = vec_perm( v2, zeroVector, vecPermLast );
|
|
v3 = vec_perm( v3, zeroVector, vecPermLast );
|
|
|
|
// got 4 vectors in v0 through v3, sum them each accross
|
|
// and put into one vector
|
|
vecTemp = vec_madd( v0, v0, zeroVector );
|
|
|
|
vecSum = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
vecSum = vec_add( vecSum, vec_sld( vecSum, vecSum, 4 ) );
|
|
// element 0 of vecSum now has sum of v0
|
|
|
|
vecTemp2 = vec_madd( v1, v1, zeroVector );
|
|
tempSum = vec_add( vecTemp2, vec_sld( vecTemp2, vecTemp2, 8 ) );
|
|
tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
|
|
// put this into vecSum
|
|
vecSum = vec_mergeh( vecSum, tempSum );
|
|
|
|
vecTemp3 = vec_madd( v2, v2, zeroVector );
|
|
tempSum = vec_add( vecTemp3, vec_sld( vecTemp3, vecTemp3, 8 ) );
|
|
tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
|
|
// put this into vecSum
|
|
vecSum = vec_perm( vecSum, tempSum, vecPermHalves );
|
|
|
|
vecTemp4 = vec_madd( v3, v3, zeroVector );
|
|
tempSum = vec_add( vecTemp4, vec_sld( vecTemp4, vecTemp4, 8 ) );
|
|
tempSum = vec_add( tempSum, vec_sld( tempSum, tempSum, 4 ) );
|
|
// put this into vecSum
|
|
vecSum = vec_perm( vecSum, tempSum, vecPermLast );
|
|
|
|
// take reciprocal square roots of these
|
|
vecF = ReciprocalSquareRoot( vecSum );
|
|
|
|
// multiply each vector by f
|
|
v5 = vec_madd( v0, vec_splat( vecF, 0 ), zeroVector );
|
|
v6 = vec_madd( v1, vec_splat( vecF, 1 ), zeroVector );
|
|
v7 = vec_madd( v2, vec_splat( vecF, 2 ), zeroVector );
|
|
v8 = vec_madd( v3, vec_splat( vecF, 3 ), zeroVector );
|
|
|
|
// load tangents as unaligned
|
|
vecTan11 = vec_ld( 0, tAddr + ( i * DRAWVERT_OFFSET ) );
|
|
vecTan12 = vec_ld( 11, tAddr + ( i * DRAWVERT_OFFSET ) );
|
|
vecTan13 = vec_ld( 23, tAddr + ( i * DRAWVERT_OFFSET ) );
|
|
|
|
vecTan21 = vec_ld( 0, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
|
|
vecTan22 = vec_ld( 11, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
|
|
vecTan23 = vec_ld( 23, tAddr + ( (i+1) * DRAWVERT_OFFSET ) );
|
|
|
|
vecTan31 = vec_ld( 0, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
vecTan32 = vec_ld( 11, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
vecTan33 = vec_ld( 23, tAddr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
|
|
vecTan41 = vec_ld( 0, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
|
|
vecTan42 = vec_ld( 11, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
|
|
vecTan43 = vec_ld( 23, tAddr + ( (i+3) * DRAWVERT_OFFSET ) );
|
|
|
|
vec1T0 = vec_perm( vecTan11, vecTan12, vec1T0Perm );
|
|
vec1T1 = vec_perm( vecTan12, vecTan13, vec1T1Perm );
|
|
vec2T0 = vec_perm( vecTan21, vecTan22, vec2T0Perm );
|
|
vec2T1 = vec_perm( vecTan22, vecTan23, vec2T1Perm );
|
|
vec3T0 = vec_perm( vecTan31, vecTan32, vec3T0Perm );
|
|
vec3T1 = vec_perm( vecTan32, vecTan33, vec3T1Perm );
|
|
vec4T0 = vec_perm( vecTan41, vecTan42, vec4T0Perm );
|
|
vec4T1 = vec_perm( vecTan42, vecTan43, vec4T1Perm );
|
|
|
|
//zero out last element of tangents
|
|
vec1T0 = vec_perm( vec1T0, zeroVector, vecPermLast );
|
|
vec1T1 = vec_perm( vec1T1, zeroVector, vecPermLast );
|
|
vec2T0 = vec_perm( vec2T0, zeroVector, vecPermLast );
|
|
vec2T1 = vec_perm( vec2T1, zeroVector, vecPermLast );
|
|
vec3T0 = vec_perm( vec3T0, zeroVector, vecPermLast );
|
|
vec3T1 = vec_perm( vec3T1, zeroVector, vecPermLast );
|
|
vec4T0 = vec_perm( vec4T0, zeroVector, vecPermLast );
|
|
vec4T1 = vec_perm( vec4T1, zeroVector, vecPermLast );
|
|
|
|
// all tangents[0]
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec1T0, v5, tempSum );
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
// put tempSum splatted accross vecTSum1
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
|
|
|
|
//vec1T0 now contains what needs to be rsqrt'd and multiplied by f
|
|
vec1T0 = vec_sub( vec1T0, vecTSum1 );
|
|
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec2T0, v6, tempSum );
|
|
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
|
|
vec2T0 = vec_sub( vec2T0, vecTSum1 );
|
|
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec3T0, v7, tempSum );
|
|
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
|
|
vec3T0 = vec_sub( vec3T0, vecTSum1 );
|
|
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec4T0, v8, tempSum );
|
|
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
|
|
vec4T0 = vec_sub( vec4T0, vecTSum1 );
|
|
|
|
// all tangents[1]
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec1T1, v5, tempSum );
|
|
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v5, zeroVector );
|
|
|
|
//vec1T0 now contains what needs to be rsqrt'd and multiplied by f
|
|
vec1T1 = vec_sub( vec1T1, vecTSum1 );
|
|
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec2T1, v6, tempSum );
|
|
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v6, zeroVector );
|
|
vec2T1 = vec_sub( vec2T1, vecTSum1 );
|
|
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec3T1, v7, tempSum );
|
|
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v7, zeroVector );
|
|
vec3T1 = vec_sub( vec3T1, vecTSum1 );
|
|
|
|
tempSum = zeroVector;
|
|
tempSum = vec_madd( vec4T1, v8, tempSum );
|
|
|
|
//sum accross tempSum
|
|
vecTSum1 = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
vecTSum1 = vec_perm( vecTSum1, zeroVector, vecPermSplatFirstWithZero );
|
|
vecTSum1 = vec_madd( vecTSum1, v8, zeroVector );
|
|
vec4T1 = vec_sub( vec4T1, vecTSum1 );
|
|
|
|
|
|
// sum accross vectors and put into one vector
|
|
vecTemp = vec_madd( vec1T0, vec1T0, zeroVector );
|
|
vecTSum1 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
vecTSum1 = vec_add( vecTSum1, vec_sld( vecTSum1, vecTSum1, 4 ) );
|
|
|
|
// element 0 of vecSum now has sum of v0
|
|
vecTemp = vec_madd( vec2T0, vec2T0, zeroVector );
|
|
tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
|
|
// put this into vecSum
|
|
vecTemp = vec_madd( vec3T0, vec3T0, zeroVector );
|
|
vecTSum1 = vec_mergeh( vecTSum1, tempSum2 );
|
|
tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
|
|
// put this into vecSum
|
|
vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermHalves );
|
|
vecTemp = vec_madd( vec4T0, vec4T0, zeroVector );
|
|
tempSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
tempSum2 = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 4 ) );
|
|
// put this into vecSum
|
|
vecTSum1 = vec_perm( vecTSum1, tempSum2, vecPermLast );
|
|
|
|
vecTemp = vec_madd( vec1T1, vec1T1, zeroVector );
|
|
vecTSum2 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
vecTSum2 = vec_add( vecTSum2, vec_sld( vecTSum2, vecTSum2, 4 ) );
|
|
// element 0 of vecSum now has sum of v0
|
|
vecTemp = vec_madd( vec2T1, vec2T1, zeroVector );
|
|
tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
|
|
// put this into vecSum
|
|
vecTSum2 = vec_mergeh( vecTSum2, tempSum3 );
|
|
vecTemp = vec_madd( vec3T1, vec3T1, zeroVector );
|
|
tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
|
|
// put this into vecSum
|
|
vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermHalves );
|
|
vecTemp = vec_madd( vec4T1, vec4T1, zeroVector );
|
|
tempSum3 = vec_add( vecTemp, vec_sld( vecTemp, vecTemp, 8 ) );
|
|
tempSum3 = vec_add( tempSum3, vec_sld( tempSum3, tempSum3, 4 ) );
|
|
// put this into vecSum
|
|
vecTSum2 = vec_perm( vecTSum2, tempSum3, vecPermLast );
|
|
|
|
// tangents[0]
|
|
vecF = ReciprocalSquareRoot( vecTSum1 );
|
|
// tangents[1]
|
|
vecF2 = ReciprocalSquareRoot( vecTSum2 );
|
|
|
|
// multiply each tangent vector by f
|
|
|
|
vec1T0 = vec_madd( vec1T0, vec_splat( vecF, 0 ), zeroVector );
|
|
vec2T0 = vec_madd( vec2T0, vec_splat( vecF, 1 ), zeroVector );
|
|
vec3T0 = vec_madd( vec3T0, vec_splat( vecF, 2 ), zeroVector );
|
|
vec4T0 = vec_madd( vec4T0, vec_splat( vecF, 3 ), zeroVector );
|
|
|
|
vec1T1 = vec_madd( vec1T1, vec_splat( vecF2, 0 ), zeroVector );
|
|
vec2T1 = vec_madd( vec2T1, vec_splat( vecF2, 1 ), zeroVector );
|
|
vec3T1 = vec_madd( vec3T1, vec_splat( vecF2, 2 ), zeroVector );
|
|
vec4T1 = vec_madd( vec4T1, vec_splat( vecF2, 3 ), zeroVector );
|
|
|
|
// rotate input data
|
|
v5 = vec_perm( v5, v5, storePerm0 );
|
|
v6 = vec_perm( v6, v6, storePerm1 );
|
|
v7 = vec_perm( v7, v7, storePerm2 );
|
|
v8 = vec_perm( v8, v8, storePerm3 );
|
|
|
|
vec_ste( v5, 0, addr + ( (i+0) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v5, 4, addr + ( (i+0) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v5, 8, addr + ( (i+0) * DRAWVERT_OFFSET ) );
|
|
|
|
vec_ste( v6, 0, addr + ( (i+1) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v6, 4, addr + ( (i+1) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v6, 8, addr + ( (i+1) * DRAWVERT_OFFSET ) );
|
|
|
|
vec_ste( v7, 0, addr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v7, 4, addr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v7, 8, addr + ( (i+2) * DRAWVERT_OFFSET ) );
|
|
|
|
vec_ste( v8, 0, addr + ( (i+3) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v8, 4, addr + ( (i+3) * DRAWVERT_OFFSET ) );
|
|
vec_ste( v8, 8, addr + ( (i+3) * DRAWVERT_OFFSET ) );
|
|
|
|
// store tangents[0] and tangents[1]
|
|
vec1T0 = vec_perm( vec1T0, vec1T0, storeT11 );
|
|
vec1T1 = vec_perm( vec1T1, vec1T1, storeT12 );
|
|
|
|
vec_ste( vec1T0, 0, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec1T0, 4, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec1T0, 8, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec1T1, 12, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec1T1, 16, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec1T1, 20, tAddr + ((i+0) * DRAWVERT_OFFSET ) );
|
|
|
|
// store second tangents[0] and tangents[1]
|
|
vec2T0 = vec_perm( vec2T0, vec2T0, storeT21 );
|
|
vec2T1 = vec_perm( vec2T1, vec2T1, storeT22 );
|
|
|
|
vec_ste( vec2T0, 0, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec2T0, 4, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec2T0, 8, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec2T1, 12, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec2T1, 16, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec2T1, 20, tAddr + ((i+1) * DRAWVERT_OFFSET ) );
|
|
|
|
// store third tangents[0] and tangents[1]
|
|
vec3T0 = vec_perm( vec3T0, vec3T0, storeT31 );
|
|
vec3T1 = vec_perm( vec3T1, vec3T1, storeT32 );
|
|
|
|
vec_ste( vec3T0, 0, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec3T0, 4, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec3T0, 8, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec3T1, 12, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec3T1, 16, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec3T1, 20, tAddr + ((i+2) * DRAWVERT_OFFSET ) );
|
|
|
|
// store fourth tangents[0] and tangents[1]
|
|
vec4T0 = vec_perm( vec4T0, vec4T0, storeT41 );
|
|
vec4T1 = vec_perm( vec4T1, vec4T1, storeT42 );
|
|
|
|
vec_ste( vec4T0, 0, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec4T0, 4, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec4T0, 8, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec4T1, 12, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec4T1, 16, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
vec_ste( vec4T1, 20, tAddr + ((i+3) * DRAWVERT_OFFSET ) );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
idVec3 &v = verts[i].normal;
|
|
float f;
|
|
|
|
//f = idMath::RSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
|
|
f = FastScalarInvSqrt( v.x * v.x + v.y * v.y + v.z * v.z );
|
|
v.x *= f; v.y *= f; v.z *= f;
|
|
|
|
for ( int j = 0; j < 2; j++ ) {
|
|
idVec3 &t = verts[i].tangents[j];
|
|
|
|
t -= ( t * v ) * v;
|
|
// f = idMath::RSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
|
|
f = FastScalarInvSqrt( t.x * t.x + t.y * t.y + t.z * t.z );
|
|
t.x *= f; t.y *= f; t.z *= f;
|
|
}
|
|
}
|
|
}
|
|
#endif /* ENABLE_DERIVE */
|
|
|
|
#ifdef ENABLE_CREATE
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateTextureSpaceLightVectors
|
|
|
|
Calculates light vectors in texture space for the given triangle vertices.
|
|
For each vertex the direction towards the light origin is projected onto texture space.
|
|
The light vectors are only calculated for the vertices referenced by the indexes.
|
|
============
|
|
*/
|
|
|
|
void VPCALL idSIMD_AltiVec::CreateTextureSpaceLightVectors( idVec3 *lightVectors, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
int i;
|
|
for ( i = 0; i+7 < numIndexes; i+= 8 ) {
|
|
used[indexes[i]] = true;
|
|
used[indexes[i+1]] = true;
|
|
used[indexes[i+2]] = true;
|
|
used[indexes[i+3]] = true;
|
|
used[indexes[i+4]] = true;
|
|
used[indexes[i+5]] = true;
|
|
used[indexes[i+6]] = true;
|
|
used[indexes[i+7]] = true;
|
|
}
|
|
|
|
for ( ; i < numIndexes; i++ ) {
|
|
used[indexes[i]] = true;
|
|
}
|
|
|
|
for ( i = 0; i+1 < numVerts; i+=2 ) {
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
const idDrawVert *v2 = &verts[i+1];
|
|
|
|
float x, y, z;
|
|
float x2, y2, z2;
|
|
idVec3 lightDir, lightDir2;
|
|
|
|
lightDir[0] = lightOrigin[0] - v->xyz[0];
|
|
lightDir[1] = lightOrigin[1] - v->xyz[1];
|
|
lightDir[2] = lightOrigin[2] - v->xyz[2];
|
|
|
|
lightDir2[0] = lightOrigin[0] - v2->xyz[0];
|
|
lightDir2[1] = lightOrigin[1] - v2->xyz[1];
|
|
lightDir2[2] = lightOrigin[2] - v2->xyz[2];
|
|
|
|
x = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
|
|
y = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
|
|
z = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
|
|
|
|
x2 = lightDir2[0] * v2->tangents[0][0] + lightDir2[1] * v2->tangents[0][1] + lightDir2[2] * v2->tangents[0][2];
|
|
y2 = lightDir2[0] * v2->tangents[1][0] + lightDir2[1] * v2->tangents[1][1] + lightDir2[2] * v2->tangents[1][2];
|
|
z2 = lightDir2[0] * v2->normal[0] + lightDir2[1] * v2->normal[1] + lightDir2[2] * v2->normal[2];
|
|
|
|
if ( used[i] ) {
|
|
lightVectors[i][0] = x;
|
|
lightVectors[i][1] = y;
|
|
lightVectors[i][2] = z;
|
|
}
|
|
|
|
if ( used[i+1] ) {
|
|
lightVectors[i+1][0] = x2;
|
|
lightVectors[i+1][1] = y2;
|
|
lightVectors[i+1][2] = z2;
|
|
}
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
if ( !used[i] ) {
|
|
continue;
|
|
}
|
|
|
|
const idDrawVert *v = &verts[i];
|
|
idVec3 lightDir;
|
|
|
|
lightDir[0] = lightOrigin[0] - v->xyz[0];
|
|
lightDir[1] = lightOrigin[1] - v->xyz[1];
|
|
lightDir[2] = lightOrigin[2] - v->xyz[2];
|
|
|
|
lightVectors[i][0] = lightDir[0] * v->tangents[0][0] + lightDir[1] * v->tangents[0][1] + lightDir[2] * v->tangents[0][2];
|
|
lightVectors[i][1] = lightDir[0] * v->tangents[1][0] + lightDir[1] * v->tangents[1][1] + lightDir[2] * v->tangents[1][2];
|
|
lightVectors[i][2] = lightDir[0] * v->normal[0] + lightDir[1] * v->normal[1] + lightDir[2] * v->normal[2];
|
|
}
|
|
}
|
|
|
|
#if 1
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateSpecularTextureCoords
|
|
|
|
Calculates specular texture coordinates for the given triangle vertices.
|
|
For each vertex the normalized direction towards the light origin is added to the
|
|
normalized direction towards the view origin and the result is projected onto texture space.
|
|
The texture coordinates are only calculated for the vertices referenced by the indexes.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CreateSpecularTextureCoords( idVec4 *texCoords, const idVec3 &lightOrigin, const idVec3 &viewOrigin, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
|
|
|
|
bool *used = (bool *)_alloca16( numVerts * sizeof( used[0] ) );
|
|
memset( used, 0, numVerts * sizeof( used[0] ) );
|
|
|
|
int i;
|
|
for ( i = 0; i+7 < numIndexes; i+= 8 ) {
|
|
used[indexes[i]] = true;
|
|
used[indexes[i+1]] = true;
|
|
used[indexes[i+2]] = true;
|
|
used[indexes[i+3]] = true;
|
|
used[indexes[i+4]] = true;
|
|
used[indexes[i+5]] = true;
|
|
used[indexes[i+6]] = true;
|
|
used[indexes[i+7]] = true;
|
|
}
|
|
|
|
for ( ; i < numIndexes; i++ ) {
|
|
used[indexes[i]] = true;
|
|
}
|
|
|
|
// load lightOrigin and viewOrigin into vectors
|
|
const float *lightOriginPtr = lightOrigin.ToFloatPtr();
|
|
const float *viewOriginPtr = viewOrigin.ToFloatPtr();
|
|
vector unsigned char permVec = vec_lvsl( 0, lightOriginPtr );
|
|
vector unsigned char permVec2 = vec_lvsl( 0, viewOriginPtr );
|
|
vector float v0 = vec_ld( 0, lightOriginPtr );
|
|
vector float v1 = vec_ld( 15, lightOriginPtr );
|
|
vector float v2 = vec_ld( 0, viewOriginPtr );
|
|
vector float v3 = vec_ld( 15, viewOriginPtr );
|
|
vector float vecLightOrigin = vec_perm( v0, v1, permVec );
|
|
vector float vecViewOrigin = vec_perm( v2, v3, permVec2 );
|
|
const vector float zeroVector = (vector float)(0);
|
|
int index;
|
|
|
|
for ( index = 0; index+1 < numVerts; index+=2 ) {
|
|
const float *vertPtr = verts[index].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[index+1].xyz.ToFloatPtr();
|
|
|
|
permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
|
|
permVec2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
vector float v2 = vec_ld( 31, vertPtr );
|
|
vector float v3 = vec_ld( 47, vertPtr );
|
|
vector float v4 = vec_ld( 63, vertPtr );
|
|
|
|
vector float v5 = vec_ld( 0, vertPtr2 );
|
|
vector float v6 = vec_ld( 15, vertPtr2 );
|
|
vector float v7 = vec_ld( 31, vertPtr2 );
|
|
vector float v8 = vec_ld( 47, vertPtr2 );
|
|
vector float v9 = vec_ld( 63, vertPtr2 );
|
|
|
|
// figure out what values go where
|
|
vector float vecXYZ = vec_perm( v0, v1, permVec );
|
|
vector float vecNormal = vec_perm( v1, v2, permVec );
|
|
vecNormal = vec_sld( vecNormal, vecNormal, 4 );
|
|
const vector float vecTangent0 = vec_perm( v2, v3, permVec );
|
|
permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
|
|
const vector float vecTangent1 = vec_perm( v3, v4, permVec );
|
|
|
|
vector float vecXYZ2 = vec_perm( v5, v6, permVec2 );
|
|
vector float vecNormal2 = vec_perm( v6, v7, permVec2 );
|
|
vecNormal2 = vec_sld( vecNormal2, vecNormal2, 4 );
|
|
const vector float vecTangent02 = vec_perm( v7, v8, permVec2 );
|
|
permVec2 = vec_add( permVec2, (vector unsigned char)(-4) );
|
|
const vector float vecTangent12 = vec_perm( v8, v9, permVec2 );
|
|
|
|
// calculate lightDir
|
|
vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
|
|
vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
|
|
|
|
vector float vecLightDir2 = vec_sub( vecLightOrigin, vecXYZ2 );
|
|
vector float vecViewDir2 = vec_sub( vecViewOrigin, vecXYZ2 );
|
|
|
|
// calculate distance
|
|
vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
|
|
vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
|
|
|
|
vector float vecTempLight2 = vec_madd( vecLightDir2, vecLightDir2, zeroVector );
|
|
vector float vecTempView2 = vec_madd( vecViewDir2, vecViewDir2, zeroVector );
|
|
|
|
// sum accross first 3 elements of vector
|
|
vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
|
|
vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
|
|
vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
|
|
|
|
vector float tempSum4 = vec_add( vecTempLight2, vec_sld( vecTempLight2, vecTempLight2, 4 ) );
|
|
vecTempLight2 = vec_add( tempSum4, vec_sld( tempSum4, tempSum4, 8 ) );
|
|
vector float tempSum5 = vec_add( vecTempView2, vec_sld( vecTempView2, vecTempView2, 4 ) );
|
|
vecTempView2 = vec_add( tempSum5, vec_sld( tempSum5, tempSum5, 8 ) );
|
|
|
|
// splat sum accross the whole vector
|
|
vecTempLight = vec_splat( vecTempLight, 0 );
|
|
vecTempView = vec_splat( vecTempView, 0 );
|
|
|
|
vecTempLight2 = vec_splat( vecTempLight2, 0 );
|
|
vecTempView2 = vec_splat( vecTempView2, 0 );
|
|
|
|
vecTempLight = ReciprocalSquareRoot( vecTempLight );
|
|
vecTempView = ReciprocalSquareRoot( vecTempView );
|
|
|
|
vecTempLight2 = ReciprocalSquareRoot( vecTempLight2 );
|
|
vecTempView2 = ReciprocalSquareRoot( vecTempView2 );
|
|
|
|
// modify light and view vectors based on ilength
|
|
vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
|
|
vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
|
|
|
|
vecViewDir2 = vec_madd( vecViewDir2, vecTempView2, zeroVector );
|
|
vecLightDir2 = vec_madd( vecLightDir2, vecTempLight2, vecViewDir2 );
|
|
|
|
// calculate what to store in each texture coord
|
|
vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
|
|
vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
|
|
vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
|
|
|
|
vector float vecTC3 = vec_madd( vecLightDir2, vecTangent02, zeroVector );
|
|
vector float vecTC4 = vec_madd( vecLightDir2, vecTangent12, zeroVector );
|
|
vector float vecTC5 = vec_madd( vecLightDir2, vecNormal2, zeroVector );
|
|
|
|
// sum accross first 3 elements of vector
|
|
vector float tempSum3;
|
|
tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
|
|
vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
|
|
tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
|
|
vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
|
|
tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
|
|
vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
|
|
|
|
tempSum4 = vec_add( vecTC3, vec_sld( vecTC3, vecTC3, 4 ) );
|
|
vecTC3 = vec_add( tempSum4, vec_sld( vecTC3, vecTC3, 8 ) );
|
|
tempSum5 = vec_add( vecTC4, vec_sld( vecTC4, vecTC4, 4 ) );
|
|
vecTC4 = vec_add( tempSum5, vec_sld( vecTC4, vecTC4, 8 ) );
|
|
vector float tempSum6 = vec_add( vecTC5, vec_sld( vecTC5, vecTC5, 4 ) );
|
|
vecTC5 = vec_add( tempSum6, vec_sld( vecTC5, vecTC5, 8 ) );
|
|
|
|
vecTC0 = vec_splat( vecTC0, 0 );
|
|
vecTC1 = vec_splat( vecTC1, 0 );
|
|
vecTC2 = vec_splat( vecTC2, 0 );
|
|
|
|
vecTC3 = vec_splat( vecTC3, 0 );
|
|
vecTC4 = vec_splat( vecTC4, 0 );
|
|
vecTC5 = vec_splat( vecTC5, 0 );
|
|
|
|
if ( used[index] ) {
|
|
// store out results
|
|
vec_ste( vecTC0, 0, &texCoords[index][0] );
|
|
vec_ste( vecTC1, 0, &texCoords[index][1] );
|
|
vec_ste( vecTC2, 0, &texCoords[index][2] );
|
|
vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
|
|
}
|
|
|
|
if ( used[index+1] ) {
|
|
vec_ste( vecTC3, 0, &texCoords[index+1][0] );
|
|
vec_ste( vecTC4, 0, &texCoords[index+1][1] );
|
|
vec_ste( vecTC5, 0, &texCoords[index+1][2] );
|
|
vec_ste( (vector float)(1.0), 0, &texCoords[index+1][3] );
|
|
}
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; index < numVerts; index++ ) {
|
|
if ( !used[index] ) {
|
|
continue;
|
|
}
|
|
|
|
const float *vertPtr = verts[index].xyz.ToFloatPtr();
|
|
|
|
permVec = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
vector float v2 = vec_ld( 31, vertPtr );
|
|
vector float v3 = vec_ld( 47, vertPtr );
|
|
vector float v4 = vec_ld( 63, vertPtr );
|
|
|
|
// figure out what values go where
|
|
vector float vecXYZ = vec_perm( v0, v1, permVec );
|
|
vector float vecNormal = vec_perm( v1, v2, permVec );
|
|
vecNormal = vec_sld( vecNormal, vecNormal, 4 );
|
|
const vector float vecTangent0 = vec_perm( v2, v3, permVec );
|
|
permVec = vec_add( permVec, (vector unsigned char)(-4) ); //shift permute right 3 elements
|
|
const vector float vecTangent1 = vec_perm( v3, v4, permVec );
|
|
|
|
// calculate lightDir
|
|
vector float vecLightDir = vec_sub( vecLightOrigin, vecXYZ );
|
|
vector float vecViewDir = vec_sub( vecViewOrigin, vecXYZ );
|
|
|
|
// calculate distance
|
|
vector float vecTempLight = vec_madd( vecLightDir, vecLightDir, zeroVector );
|
|
vector float vecTempView = vec_madd( vecViewDir, vecViewDir, zeroVector );
|
|
|
|
// sum accross first 3 elements of vector
|
|
vector float tempSum = vec_add( vecTempLight, vec_sld( vecTempLight, vecTempLight, 4 ) );
|
|
vecTempLight = vec_add( tempSum, vec_sld( tempSum, tempSum, 8 ) );
|
|
vector float tempSum2 = vec_add( vecTempView, vec_sld( vecTempView, vecTempView, 4 ) );
|
|
vecTempView = vec_add( tempSum2, vec_sld( tempSum2, tempSum2, 8 ) );
|
|
|
|
// splat sum accross the whole vector
|
|
vecTempLight = vec_splat( vecTempLight, 0 );
|
|
vecTempView = vec_splat( vecTempView, 0 );
|
|
|
|
vecTempLight = ReciprocalSquareRoot( vecTempLight );
|
|
vecTempView = ReciprocalSquareRoot( vecTempView );
|
|
|
|
// modify light and view vectors based on ilength
|
|
vecViewDir = vec_madd( vecViewDir, vecTempView, zeroVector );
|
|
vecLightDir = vec_madd( vecLightDir, vecTempLight, vecViewDir );
|
|
|
|
// calculate what to store in each texture coord
|
|
vector float vecTC0 = vec_madd( vecLightDir, vecTangent0, zeroVector );
|
|
vector float vecTC1 = vec_madd( vecLightDir, vecTangent1, zeroVector );
|
|
vector float vecTC2 = vec_madd( vecLightDir, vecNormal, zeroVector );
|
|
|
|
// sum accross first 3 elements of vector
|
|
vector float tempSum3;
|
|
tempSum = vec_add( vecTC0, vec_sld( vecTC0, vecTC0, 4 ) );
|
|
vecTC0 = vec_add( tempSum, vec_sld( vecTC0, vecTC0, 8 ) );
|
|
tempSum2 = vec_add( vecTC1, vec_sld( vecTC1, vecTC1, 4 ) );
|
|
vecTC1 = vec_add( tempSum2, vec_sld( vecTC1, vecTC1, 8 ) );
|
|
tempSum3 = vec_add( vecTC2, vec_sld( vecTC2, vecTC2, 4 ) );
|
|
vecTC2 = vec_add( tempSum3, vec_sld( vecTC2, vecTC2, 8 ) );
|
|
|
|
vecTC0 = vec_splat( vecTC0, 0 );
|
|
vecTC1 = vec_splat( vecTC1, 0 );
|
|
vecTC2 = vec_splat( vecTC2, 0 );
|
|
|
|
// store out results
|
|
vec_ste( vecTC0, 0, &texCoords[index][0] );
|
|
vec_ste( vecTC1, 0, &texCoords[index][1] );
|
|
vec_ste( vecTC2, 0, &texCoords[index][2] );
|
|
vec_ste( (vector float)(1.0), 0, &texCoords[index][3] );
|
|
|
|
}
|
|
}
|
|
#endif /* 0 for disable spec coord */
|
|
|
|
#if 1
|
|
|
|
#ifdef VERTEXCACHE_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
|
|
int outVerts = 0;
|
|
int i = 0;
|
|
|
|
assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float oneVector = (vector float)(1);
|
|
register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
|
|
const float *lPtr = lightOrigin.ToFloatPtr();
|
|
const float *vPtr;
|
|
const float *vPtr2;
|
|
const float *vPtr3;
|
|
const float *vPtr4;
|
|
|
|
// put values into a vector
|
|
vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
|
|
v0 = vec_ld( 0, lPtr );
|
|
v1 = vec_ld( 15, lPtr );
|
|
v0 = vec_perm( v0, v1, vecPerm );
|
|
v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
|
|
|
|
//v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
|
|
for ( ; i+3 < numVerts; i+= 4 ) {
|
|
if ( ! vertRemap[i] ) {
|
|
vPtr = verts[i].xyz.ToFloatPtr();
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
|
|
v2 = vec_ld( 0, vPtr );
|
|
v3 = vec_ld( 15, vPtr );
|
|
v7 = vec_perm( v2, v3, vecPerm2 );
|
|
#else
|
|
v7 = vec_ld( 0, vPtr );
|
|
#endif
|
|
v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
|
|
v3 = vec_perm( v7, oneVector, vecPermZeroLast );
|
|
v1 = vec_sub( v2, v0 );
|
|
|
|
vec_st( v3, 0, &vertexCache[outVerts][0] );
|
|
vec_st( v1, 0, &vertexCache[outVerts+1][0] );
|
|
|
|
vertRemap[i] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
|
|
if ( ! vertRemap[i+1] ) {
|
|
vPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
|
|
v4 = vec_ld( 0, vPtr2 );
|
|
v5 = vec_ld( 15, vPtr2 );
|
|
v6 = vec_perm( v4, v5, vecPerm3 );
|
|
#else
|
|
v6 = vec_ld( 0, vPtr2 );
|
|
#endif
|
|
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
|
|
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
|
|
v6 = vec_sub( v4, v0 );
|
|
|
|
vec_st( v5, 0, &vertexCache[outVerts][0] );
|
|
vec_st( v6, 0, &vertexCache[outVerts+1][0] );
|
|
|
|
vertRemap[i+1] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
|
|
if ( ! vertRemap[i+2] ) {
|
|
vPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
|
|
v1 = vec_ld( 0, vPtr3 );
|
|
v2 = vec_ld( 15, vPtr3 );
|
|
v3 = vec_perm( v1, v2, vecPerm4 );
|
|
#else
|
|
v3 = vec_ld( 0, vPtr3 );
|
|
#endif
|
|
v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
|
|
v2 = vec_perm( v3, oneVector, vecPermZeroLast );
|
|
v3 = vec_sub( v1, v0 );
|
|
|
|
vec_st( v2, 0, &vertexCache[outVerts][0] );
|
|
vec_st( v3, 0, &vertexCache[outVerts+1][0] );
|
|
|
|
vertRemap[i+2] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
|
|
if ( ! vertRemap[i+3] ) {
|
|
vPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
|
|
v4 = vec_ld( 0, vPtr4 );
|
|
v5 = vec_ld( 16, vPtr4 );
|
|
v6 = vec_perm( v4, v5, vecPerm5 );
|
|
#else
|
|
v6 = vec_ld( 0, vPtr4 );
|
|
#endif
|
|
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
|
|
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
|
|
v6 = vec_sub( v4, v0 );
|
|
|
|
vec_st( v5, 0, &vertexCache[outVerts][0] );
|
|
vec_st( v6, 0, &vertexCache[outVerts+1][0] );
|
|
|
|
vertRemap[i+3] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
}
|
|
|
|
// cleanup
|
|
for (; i < numVerts; i++ ) {
|
|
if ( vertRemap[i] ) {
|
|
continue;
|
|
}
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[outVerts+0][0] = v[0];
|
|
vertexCache[outVerts+0][1] = v[1];
|
|
vertexCache[outVerts+0][2] = v[2];
|
|
vertexCache[outVerts+0][3] = 1.0f;
|
|
|
|
// R_SetupProjection() builds the projection matrix with a slight crunch
|
|
// for depth, which keeps this w=0 division from rasterizing right at the
|
|
// wrap around point and causing depth fighting with the rear caps
|
|
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
|
|
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
|
|
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
|
|
vertexCache[outVerts+1][3] = 0.0f;
|
|
vertRemap[i] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
return outVerts;
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, int *vertRemap, const idVec3 &lightOrigin, const idDrawVert *verts, const int numVerts ) {
|
|
int outVerts = 0;
|
|
int i = 0;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector unsigned char vecPerm, vecPerm2, vecPerm3, vecPerm4, vecPerm5;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float oneVector = (vector float)(1);
|
|
register vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
|
|
const float *lPtr = lightOrigin.ToFloatPtr();
|
|
const float *vPtr;
|
|
const float *vPtr2;
|
|
const float *vPtr3;
|
|
const float *vPtr4;
|
|
|
|
// put values into a vector
|
|
vecPerm = vec_add( vec_lvsl( -1, lPtr ), (vector unsigned char)(1) );
|
|
v0 = vec_ld( 0, lPtr );
|
|
v1 = vec_ld( 15, lPtr );
|
|
v0 = vec_perm( v0, v1, vecPerm );
|
|
v0 = vec_perm( v0, zeroVector, vecPermZeroLast );
|
|
|
|
//v0 now contains lightOrigin[0], lightOrigin[1], lightOrigin[2], 0
|
|
for ( ; i+3 < numVerts; i+= 4 ) {
|
|
if ( ! vertRemap[i] ) {
|
|
vPtr = verts[i].xyz.ToFloatPtr();
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm2 = vec_add( vec_lvsl( -1, vPtr ), (vector unsigned char)(1) );
|
|
v2 = vec_ld( 0, vPtr );
|
|
v3 = vec_ld( 15, vPtr );
|
|
v7 = vec_perm( v2, v3, vecPerm2 );
|
|
#else
|
|
v7 = vec_ld( 0, vPtr );
|
|
#endif
|
|
v2 = vec_perm( v7, zeroVector, vecPermZeroLast );
|
|
v3 = vec_perm( v7, oneVector, vecPermZeroLast );
|
|
v1 = vec_sub( v2, v0 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE2( &vertexCache[outVerts][0], v3, v1 );
|
|
|
|
vertRemap[i] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
|
|
if ( ! vertRemap[i+1] ) {
|
|
vPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm3 = vec_add( vec_lvsl( -1, vPtr2 ), (vector unsigned char)(1) );
|
|
v4 = vec_ld( 0, vPtr2 );
|
|
v5 = vec_ld( 15, vPtr2 );
|
|
v6 = vec_perm( v4, v5, vecPerm3 );
|
|
#else
|
|
v6 = vec_ld( 0, vPtr2 );
|
|
#endif
|
|
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
|
|
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
|
|
v6 = vec_sub( v4, v0 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
|
|
|
|
vertRemap[i+1] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
|
|
if ( ! vertRemap[i+2] ) {
|
|
vPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm4 = vec_add( vec_lvsl( -1, vPtr3 ), (vector unsigned char)(1) );
|
|
v1 = vec_ld( 0, vPtr3 );
|
|
v2 = vec_ld( 15, vPtr3 );
|
|
v3 = vec_perm( v1, v2, vecPerm4 );
|
|
#else
|
|
v3 = vec_ld( 0, vPtr3 );
|
|
#endif
|
|
v1 = vec_perm( v3, zeroVector, vecPermZeroLast );
|
|
v2 = vec_perm( v3, oneVector, vecPermZeroLast );
|
|
v3 = vec_sub( v1, v0 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE2( &vertexCache[outVerts][0], v2, v3 );
|
|
|
|
vertRemap[i+2] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
if ( ! vertRemap[i+3] ) {
|
|
vPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
#ifndef DRAWVERT_PADDED
|
|
vecPerm5 = vec_add( vec_lvsl( -1, vPtr4 ), (vector unsigned char)(1) );
|
|
v4 = vec_ld( 0, vPtr4 );
|
|
v5 = vec_ld( 16, vPtr4 );
|
|
v6 = vec_perm( v4, v5, vecPerm5 );
|
|
#else
|
|
v6 = vec_ld( 0, vPtr4 );
|
|
#endif
|
|
|
|
v4 = vec_perm( v6, zeroVector, vecPermZeroLast );
|
|
v5 = vec_perm( v6, oneVector, vecPermZeroLast );
|
|
v6 = vec_sub( v4, v0 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE2( &vertexCache[outVerts][0], v5, v6 );
|
|
|
|
|
|
vertRemap[i+3] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
}
|
|
|
|
// cleanup
|
|
for (; i < numVerts; i++ ) {
|
|
if ( vertRemap[i] ) {
|
|
continue;
|
|
}
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[outVerts+0][0] = v[0];
|
|
vertexCache[outVerts+0][1] = v[1];
|
|
vertexCache[outVerts+0][2] = v[2];
|
|
vertexCache[outVerts+0][3] = 1.0f;
|
|
|
|
// R_SetupProjection() builds the projection matrix with a slight crunch
|
|
// for depth, which keeps this w=0 division from rasterizing right at the
|
|
// wrap around point and causing depth fighting with the rear caps
|
|
vertexCache[outVerts+1][0] = v[0] - lightOrigin[0];
|
|
vertexCache[outVerts+1][1] = v[1] - lightOrigin[1];
|
|
vertexCache[outVerts+1][2] = v[2] - lightOrigin[2];
|
|
vertexCache[outVerts+1][3] = 0.0f;
|
|
vertRemap[i] = outVerts;
|
|
outVerts += 2;
|
|
}
|
|
return outVerts;
|
|
}
|
|
#endif /* VERTEXCACHE_ALIGNED */
|
|
|
|
#endif /* 0 to disable shadow cache */
|
|
|
|
#if 1
|
|
|
|
#ifdef VERTEXCACHE_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateVertexProgramShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// vertexCache aligned
|
|
assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
// idVec4 size
|
|
assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float oneVector = (vector float)(1);
|
|
register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
int i = 0;
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < numVerts ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
#endif
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v1 = vec_perm( v2, v3, vertPerm2 );
|
|
v2 = vec_perm( v4, v5, vertPerm3 );
|
|
v3 = vec_perm( v6, v7, vertPerm4 );
|
|
#else
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 0, vertPtr2 );
|
|
v2 = vec_ld( 0, vertPtr3 );
|
|
v3 = vec_ld( 0, vertPtr4 );
|
|
#endif
|
|
|
|
v0 = vec_perm( v0, oneVector, vecPermThreeOne );
|
|
v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
|
|
|
|
v1 = vec_perm( v1, oneVector, vecPermThreeOne );
|
|
v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
|
|
|
|
v2 = vec_perm( v2, oneVector, vecPermThreeOne );
|
|
v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
|
|
|
|
v3 = vec_perm( v3, oneVector, vecPermThreeOne );
|
|
v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &vertexCache[i*2][0], v0, v4, v1, v5 );
|
|
ALIGNED_STORE4( &vertexCache[(i+2)*2][0], v2, v6, v3, v7 );
|
|
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[i*2+0][0] = v[0];
|
|
vertexCache[i*2+1][0] = v[0];
|
|
vertexCache[i*2+0][1] = v[1];
|
|
vertexCache[i*2+1][1] = v[1];
|
|
vertexCache[i*2+0][2] = v[2];
|
|
vertexCache[i*2+1][2] = v[2];
|
|
vertexCache[i*2+0][3] = 1.0f;
|
|
vertexCache[i*2+1][3] = 0.0f;
|
|
}
|
|
return numVerts * 2;
|
|
}
|
|
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateVertexProgramShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_AltiVec::CreateVertexProgramShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
// idVec4 size
|
|
assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float oneVector = (vector float)(1);
|
|
register vector unsigned char vecPermThreeOne = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
int i = 0;
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < numVerts ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
#endif
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v1 = vec_perm( v2, v3, vertPerm2 );
|
|
v2 = vec_perm( v4, v5, vertPerm3 );
|
|
v3 = vec_perm( v6, v7, vertPerm4 );
|
|
#else
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 0, vertPtr2 );
|
|
v2 = vec_ld( 0, vertPtr3 );
|
|
v3 = vec_ld( 0, vertPtr4 );
|
|
#endif
|
|
|
|
v0 = vec_perm( v0, oneVector, vecPermThreeOne );
|
|
v4 = vec_perm( v0, zeroVector, vecPermThreeOne );
|
|
|
|
v1 = vec_perm( v1, oneVector, vecPermThreeOne );
|
|
v5 = vec_perm( v1, zeroVector, vecPermThreeOne );
|
|
|
|
v2 = vec_perm( v2, oneVector, vecPermThreeOne );
|
|
v6 = vec_perm( v2, zeroVector, vecPermThreeOne );
|
|
|
|
v3 = vec_perm( v3, oneVector, vecPermThreeOne );
|
|
v7 = vec_perm( v3, zeroVector, vecPermThreeOne );
|
|
|
|
// store results as unaligned
|
|
vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &vertexCache[i*2][0] ), (vector unsigned char)(1) );
|
|
vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
|
|
vector float vc1 = vec_ld( 0, &vertexCache[i*2][0] );
|
|
vector float vc2 = vec_ld( 127, &vertexCache[i*2][0] );
|
|
|
|
// right rotate input data
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
|
|
vec_st( vec_sel( vc1, v0, mask ), 0 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v0, v4, mask ), 15 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v4, v1, mask ), 31 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v1, v5, mask ), 47 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v5, v2, mask ), 63 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v2, v6, mask ), 79 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v6, v3, mask ), 95 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v3, v7, mask ), 111 , &vertexCache[i*2][0] );
|
|
vec_st( vec_sel( v7, vc2, mask ), 127 , &vertexCache[i*2][0] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
const float *v = verts[i].xyz.ToFloatPtr();
|
|
vertexCache[i*2+0][0] = v[0];
|
|
vertexCache[i*2+1][0] = v[0];
|
|
vertexCache[i*2+0][1] = v[1];
|
|
vertexCache[i*2+1][1] = v[1];
|
|
vertexCache[i*2+0][2] = v[2];
|
|
vertexCache[i*2+1][2] = v[2];
|
|
vertexCache[i*2+0][3] = 1.0f;
|
|
vertexCache[i*2+1][3] = 0.0f;
|
|
}
|
|
return numVerts * 2;
|
|
}
|
|
|
|
#endif /* VERTEXCACHE_ALIGNED */
|
|
|
|
#endif /* 0 to kill VP shader cache */
|
|
|
|
#endif /* ENABLE_CREATE */
|
|
|
|
#ifdef ENABLE_SOUND_ROUTINES
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSamplePCMTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
Assumes that dest starts at aligned address
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
|
|
|
|
// dest is aligned
|
|
assert( IS_16BYTE_ALIGNED( dest[0] ) );
|
|
|
|
vector signed short vs0, vs1;
|
|
register vector signed int vi0, vi1;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
|
|
// permute vectors
|
|
register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
|
|
register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
|
|
|
|
register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
// If this can be assumed true, we can eliminate another conditional that checks to see if we can
|
|
// load up a vector before the loop
|
|
assert( numSamples >= 12 );
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// 8 at a time
|
|
int i = 0;
|
|
|
|
vector signed short vsOld = vec_ld( 0, &src[i] );
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
|
|
|
|
for ( ; i+7 < numSamples; i+= 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// permute into vectors in the order to store
|
|
|
|
v2 = vec_splat( v0, 0 );
|
|
v3 = vec_splat( v0, 1 );
|
|
v4 = vec_splat( v0, 2 );
|
|
v5 = vec_splat( v0, 3 );
|
|
v6 = vec_splat( v1, 0 );
|
|
v7 = vec_splat( v1, 1 );
|
|
v8 = vec_splat( v1, 2 );
|
|
v9 = vec_splat( v1, 3 );
|
|
|
|
// store results
|
|
ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
|
|
}
|
|
// cleanup
|
|
for (; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i = 0;
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( ; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = v2;
|
|
v4 = vec_perm( v0, v0, vecSecondHalf );
|
|
v5 = v4;
|
|
v6 = vec_perm( v1, v1, vecFirstHalf );
|
|
v7 = v6;
|
|
v8 = vec_perm (v1, v1, vecSecondHalf );
|
|
v9 = v8;
|
|
|
|
// store results
|
|
ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
|
|
}
|
|
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
|
|
dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 0, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecBottom );
|
|
v3 = vec_perm( v0, v0, vecTop );
|
|
v4 = vec_perm( v1, v1, vecBottom );
|
|
v5 = vec_perm (v1, v1, vecTop );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = vec_perm( v0, v0, vecSecondHalf );
|
|
v4 = vec_perm( v1, v1, vecFirstHalf );
|
|
v5 = vec_perm (v1, v1, vecSecondHalf );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
|
|
dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
//unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
|
|
//convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
|
|
//store results
|
|
ALIGNED_STORE2( &dest[i], v0, v1 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i] = (float) src[i];
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSamplePCMTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
|
|
|
|
vector signed short vs0, vs1;
|
|
register vector signed int vi0, vi1;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
|
|
// permute vectors
|
|
register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
|
|
register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
|
|
|
|
register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
// calculate perm vector and masks for stores
|
|
vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
|
|
// original values of dest
|
|
vector float vecDest = vec_ld( 0, &dest[0] );
|
|
vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// 8 at a time
|
|
int i = 0;
|
|
|
|
vector signed short vsOld = vec_ld( 0, &src[i] );
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
|
|
|
|
for ( ; i+7 < numSamples; i+= 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// permute into vectors in the order to store
|
|
|
|
v2 = vec_splat( v0, 0 );
|
|
v3 = vec_splat( v0, 1 );
|
|
v4 = vec_splat( v0, 2 );
|
|
v5 = vec_splat( v0, 3 );
|
|
v6 = vec_splat( v1, 0 );
|
|
v7 = vec_splat( v1, 1 );
|
|
v8 = vec_splat( v1, 2 );
|
|
v9 = vec_splat( v1, 3 );
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
v8 = vec_perm( v8, v8, storePerm );
|
|
v9 = vec_perm( v9, v9, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
|
|
vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
|
|
vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
|
|
vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
|
|
vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
|
|
vecDest = vec_sel( v9, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*4] );
|
|
}
|
|
// cleanup
|
|
for (; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i = 0;
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( ; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = v2;
|
|
v4 = vec_perm( v0, v0, vecSecondHalf );
|
|
v5 = v4;
|
|
v6 = vec_perm( v1, v1, vecFirstHalf );
|
|
v7 = v6;
|
|
v8 = vec_perm (v1, v1, vecSecondHalf );
|
|
v9 = v8;
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
v8 = vec_perm( v8, v8, storePerm );
|
|
v9 = vec_perm( v9, v9, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
|
|
vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
|
|
vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
|
|
vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
|
|
vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
|
|
vecDest = vec_sel( v9, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*4] );
|
|
}
|
|
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
|
|
dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 0, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecBottom );
|
|
v3 = vec_perm( v0, v0, vecTop );
|
|
v4 = vec_perm( v1, v1, vecBottom );
|
|
v5 = vec_perm (v1, v1, vecTop );
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
|
|
vecDest = vec_sel( v5, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*2] );
|
|
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = vec_perm( v0, v0, vecSecondHalf );
|
|
v4 = vec_perm( v1, v1, vecFirstHalf );
|
|
v5 = vec_perm (v1, v1, vecSecondHalf );
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
|
|
vecDest = vec_sel( v5, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*2] );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
|
|
dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
//vs0 = vec_ld( 0, &src[i] );
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 31, &dest[i] );
|
|
|
|
//unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
|
|
//convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
|
|
vecDest = vec_sel( v1, vecDestEnd, mask );
|
|
vec_st( vecDest, 31, &dest[i] );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i] = (float) src[i];
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSampleOGGTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
Assumes that dest starts at aligned address
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
|
|
// dest is aligned
|
|
assert( IS_16BYTE_ALIGNED( dest[0] ) );
|
|
|
|
register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
|
|
register vector float constVec, zeroVector;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
|
|
vector unsigned char vecPerm1;
|
|
vector unsigned char vecPerm2;
|
|
|
|
vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
|
|
vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
|
|
vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
|
|
vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
|
|
|
|
constVec = (vector float)(32768.0f);
|
|
zeroVector = (vector float)(0.0);
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
v0 = vec_madd( oggVec1, constVec, zeroVector );
|
|
v1 = vec_madd( oggVec2, constVec, zeroVector );
|
|
v2 = vec_madd( oggVec3, constVec, zeroVector );
|
|
v3 = vec_madd( oggVec4, constVec, zeroVector );
|
|
v4 = vec_madd( oggVec5, constVec, zeroVector );
|
|
v5 = vec_madd( oggVec6, constVec, zeroVector );
|
|
v6 = vec_madd( oggVec7, constVec, zeroVector );
|
|
v7 = vec_madd( oggVec8, constVec, zeroVector );
|
|
|
|
//store results
|
|
ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
|
|
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
|
|
}
|
|
|
|
} else {
|
|
|
|
// calculate perm vec for ogg
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
|
|
// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
|
|
// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
|
|
oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
|
|
oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
|
|
oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
|
|
oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
|
|
oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
|
|
oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
|
|
oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
|
|
|
|
//merge generates the interleaved pattern that we want and it
|
|
//doesn't require a permute vector, so use that instead
|
|
v0 = vec_mergeh( oggVec1, oggVec5 );
|
|
v1 = vec_mergel( oggVec1, oggVec5 );
|
|
v2 = vec_mergeh( oggVec2, oggVec6 );
|
|
v3 = vec_mergel( oggVec2, oggVec6 );
|
|
|
|
v4 = vec_mergeh( oggVec3, oggVec7 );
|
|
v5 = vec_mergel( oggVec3, oggVec7 );
|
|
v6 = vec_mergeh( oggVec4, oggVec8 );
|
|
v10 = vec_mergel( oggVec4, oggVec8 );
|
|
|
|
//store results
|
|
ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
|
|
dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load values from ogg
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// permute into results vectors to store
|
|
v5 = vec_perm( v0, v0, vecOneTwo );
|
|
v6 = vec_perm( v0, v0, vecThreeFour);
|
|
v7 = vec_perm( v1, v1, vecOneTwo );
|
|
v8 = vec_perm( v1, v1, vecThreeFour );
|
|
|
|
//store results
|
|
ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
// load ogg[0][i] to ogg[0][i+4]
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors to store
|
|
v2 = vec_perm( v0, v1, vecFirst );
|
|
v3 = vec_perm( v0, v1, vecSecond );
|
|
v4 = vec_perm( v0, v1, vecThird );
|
|
v5 = vec_perm( v0, v1, vecFourth );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
|
|
dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load values from ogg
|
|
v8 = v9;
|
|
v7 = vec_ld( 15, &ogg[0][i] );
|
|
v6 = v7;
|
|
v9 = vec_ld( 31, &ogg[0][i] );
|
|
|
|
v0 = vec_perm( v8, v7, vecPerm1 );
|
|
v1 = vec_perm( v6, v9, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
ALIGNED_STORE2( &dest[i], v0, v1 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*1+0] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors
|
|
v2 = vec_mergeh( v0, v1 );
|
|
v3 = vec_mergel( v0, v1 );
|
|
|
|
// store results
|
|
ALIGNED_STORE2( &dest[i*2], v2, v3 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*2+0] = ogg[0][i] * 32768.0f;
|
|
dest[i*2+1] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSampleOGGTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
|
|
|
|
register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
|
|
register vector float constVec, zeroVector;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
|
|
vector unsigned char vecPerm1;
|
|
vector unsigned char vecPerm2;
|
|
|
|
vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
|
|
vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
|
|
vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
|
|
vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
|
|
|
|
vector unsigned char storePerm;
|
|
|
|
constVec = (vector float)(32768.0f);
|
|
zeroVector = (vector float)(0.0);
|
|
|
|
// calculate perm vector and masks for stores
|
|
storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
|
|
// original values of dest
|
|
vector float vecDest = vec_ld( 0, &dest[0] );
|
|
vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
v0 = vec_madd( oggVec1, constVec, zeroVector );
|
|
v1 = vec_madd( oggVec2, constVec, zeroVector );
|
|
v2 = vec_madd( oggVec3, constVec, zeroVector );
|
|
v3 = vec_madd( oggVec4, constVec, zeroVector );
|
|
v4 = vec_madd( oggVec5, constVec, zeroVector );
|
|
v5 = vec_madd( oggVec6, constVec, zeroVector );
|
|
v6 = vec_madd( oggVec7, constVec, zeroVector );
|
|
v7 = vec_madd( oggVec8, constVec, zeroVector );
|
|
|
|
// rotate input data
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
|
|
vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
|
|
vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
|
|
vecDest = vec_sel( v7, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*4] );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
|
|
}
|
|
|
|
} else {
|
|
|
|
// calculate perm vec for ogg
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
|
|
// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
|
|
// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
|
|
oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
|
|
oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
|
|
oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
|
|
oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
|
|
oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
|
|
oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
|
|
oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
|
|
|
|
//merge generates the interleaved pattern that we want and it
|
|
//doesn't require a permute vector, so use that instead
|
|
v0 = vec_mergeh( oggVec1, oggVec5 );
|
|
v1 = vec_mergel( oggVec1, oggVec5 );
|
|
v2 = vec_mergeh( oggVec2, oggVec6 );
|
|
v3 = vec_mergel( oggVec2, oggVec6 );
|
|
|
|
v4 = vec_mergeh( oggVec3, oggVec7 );
|
|
v5 = vec_mergel( oggVec3, oggVec7 );
|
|
v6 = vec_mergeh( oggVec4, oggVec8 );
|
|
v10 = vec_mergel( oggVec4, oggVec8 );
|
|
|
|
// rotate input data
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v10 = vec_perm( v10, v10, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
|
|
vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
|
|
vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
|
|
vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
|
|
vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
|
|
vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
|
|
vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
|
|
vecDest = vec_sel( v10, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*8] );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
|
|
dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
|
|
// load values from ogg
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// permute into results vectors to store
|
|
v5 = vec_perm( v0, v0, vecOneTwo );
|
|
v6 = vec_perm( v0, v0, vecThreeFour);
|
|
v7 = vec_perm( v1, v1, vecOneTwo );
|
|
v8 = vec_perm( v1, v1, vecThreeFour );
|
|
|
|
// rotate input data
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
v8 = vec_perm( v8, v8, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
|
|
vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
|
|
vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
|
|
vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
|
|
vecDest = vec_sel( v8, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
// load ogg[0][i] to ogg[0][i+3]
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors to store
|
|
v2 = vec_perm( v0, v1, vecFirst );
|
|
v3 = vec_perm( v0, v1, vecSecond );
|
|
v4 = vec_perm( v0, v1, vecThird );
|
|
v5 = vec_perm( v0, v1, vecFourth );
|
|
|
|
// rotate input data
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
|
|
vecDest = vec_sel( v5, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*4] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
|
|
dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load values from ogg
|
|
v8 = v9;
|
|
v7 = vec_ld( 15, &ogg[0][i] );
|
|
v6 = v7;
|
|
v9 = vec_ld( 31, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 31, &dest[i] );
|
|
|
|
v0 = vec_perm( v8, v7, vecPerm1 );
|
|
v1 = vec_perm( v6, v9, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// rotate data
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
|
|
vecDest = vec_sel( v1, vecDestEnd, mask );
|
|
vec_st( vecDest, 31, &dest[i] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*1+0] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors
|
|
v2 = vec_mergeh( v0, v1 );
|
|
v3 = vec_mergel( v0, v1 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE2( &dest[i*2], v2, v3 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*2+0] = ogg[0][i] * 32768.0f;
|
|
dest[i*2+1] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerMono
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
int i;
|
|
float inc[2];
|
|
float spkr[4];
|
|
|
|
register vector float vecInc;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
register vector float vecSamplesLd1, vecSamplesLd2;
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
|
|
register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
|
|
register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
|
|
register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
|
|
register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
|
|
|
|
//constants
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] *= 2;
|
|
inc[1] *= 2;
|
|
|
|
//load data into registers
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
|
|
//load samples and mix buffers
|
|
vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
|
|
vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLast = vec_ld( 31, &samples[i] );
|
|
|
|
vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
|
|
vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
|
|
|
|
vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
|
|
vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
|
|
vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
|
|
vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
|
|
//add for next iteration
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerMono
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
|
|
int i;
|
|
float inc[2];
|
|
float spkr[4];
|
|
|
|
register vector float vecInc;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
register vector float vecSamplesLd1, vecSamplesLd2;
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
|
|
register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
|
|
register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
|
|
register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
|
|
register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
|
|
|
|
//constants
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] *= 2;
|
|
inc[1] *= 2;
|
|
|
|
//load data into registers
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
|
|
//load samples and mix buffers
|
|
vecSamplesLd1 = vecSamplesLast;
|
|
vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLast = vec_ld( 31, &samples[i] );
|
|
|
|
vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
|
|
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
|
|
vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
|
|
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
|
|
|
|
vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
|
|
vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
|
|
vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
|
|
vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
|
|
//add for next iteration
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
}
|
|
}
|
|
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerStereo
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
int i, k;
|
|
float inc[2];
|
|
float spkr[4];
|
|
|
|
// loading buffers
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
// loading buffers
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecInc;
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
for ( k = 0; k < 2; k++ ) {
|
|
inc[k] *= 2;
|
|
}
|
|
|
|
// load data in vectors
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
// load mix buffers and samples
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
|
|
vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
|
|
|
|
vecSamples1 = vecSamplesLast;
|
|
vecSamples2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamples3 = vec_ld( 31, &samples[i*2] );
|
|
vecSamples4 = vec_ld( 47, &samples[i*2] );
|
|
vecSamplesLast = vec_ld( 63, &samples[i*2] );
|
|
|
|
vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
|
|
vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
|
|
vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
|
|
vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
|
|
//store results
|
|
ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerStereo
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
|
|
int i, k;
|
|
float inc[2];
|
|
float spkr[4];
|
|
// loading buffers
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
// loading buffers
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecInc;
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
for ( k = 0; k < 2; k++ ) {
|
|
inc[k] *= 2;
|
|
}
|
|
|
|
// load data in vectors
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
// load mix buffers and samples
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
|
|
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
|
|
vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
|
|
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
|
|
|
|
vecSamples1 = vecSamplesLast;
|
|
vecSamples2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamples3 = vec_ld( 31, &samples[i*2] );
|
|
vecSamples4 = vec_ld( 47, &samples[i*2] );
|
|
vecSamplesLast = vec_ld( 63, &samples[i*2] );
|
|
|
|
vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
|
|
vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
|
|
vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
|
|
vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
|
|
// store results
|
|
UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
}
|
|
}
|
|
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerMono
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
float incL[24];
|
|
float sL[24];
|
|
int i, k;
|
|
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
|
|
// permute vectors for sample
|
|
vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
for ( k = 0; k < 6; k++ ) {
|
|
sL[k] = lastV[k];
|
|
}
|
|
for ( k = 6; k < 12; k++ ) {
|
|
sL[k] = lastV[k-6] + incL[k];
|
|
}
|
|
for ( k = 12; k < 18; k++ ) {
|
|
sL[k] = lastV[k-12] + incL[k] + incL[k];
|
|
}
|
|
for ( k = 18; k < 24; k++ ) {
|
|
sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
|
|
}
|
|
|
|
// multiply by 2 since doing 12 at a time
|
|
for ( k = 0; k < 24; k++ ) {
|
|
incL[k] *= 4;
|
|
}
|
|
|
|
//load the data
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
vecIncl5 = vec_ld( 63, &incL[0] );
|
|
vecIncl6 = vec_ld( 79, &incL[0] );
|
|
vecIncl7 = vec_ld( 95, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
|
|
vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
|
|
vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
vecSL5 = vec_ld( 63, &sL[0] );
|
|
vecSL6 = vec_ld( 79, &sL[0] );
|
|
vecSL7 = vec_ld( 95, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
|
|
vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
|
|
vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
|
|
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
|
|
//need a cleanup loop
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
|
|
//load mix buffer into vectors, assume aligned
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
|
|
vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
|
|
vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
|
|
vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want
|
|
vecSamples1 = vec_splat( vecSamplesLd, 0 );
|
|
vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
|
|
vecSamples3 = vec_splat( vecSamplesLd, 1 );
|
|
vecSamples4 = vec_splat( vecSamplesLd, 2 );
|
|
vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
|
|
vecSamples6 = vec_splat( vecSamplesLd, 3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
|
|
vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
|
|
vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
|
|
|
|
//store out results
|
|
ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
vecSL4 = vec_add( vecSL4, vecIncl4 );
|
|
vecSL5 = vec_add( vecSL5, vecIncl5 );
|
|
vecSL6 = vec_add( vecSL6, vecIncl6 );
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerMono
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
float incL[24];
|
|
float sL[24];
|
|
int i, k;
|
|
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
|
|
// permute vectors for sample
|
|
register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
for ( k = 0; k < 6; k++ ) {
|
|
sL[k] = lastV[k];
|
|
}
|
|
for ( k = 6; k < 12; k++ ) {
|
|
sL[k] = lastV[k-6] + incL[k];
|
|
}
|
|
for ( k = 12; k < 18; k++ ) {
|
|
sL[k] = lastV[k-12] + incL[k] + incL[k];
|
|
}
|
|
for ( k = 18; k < 24; k++ ) {
|
|
sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
|
|
}
|
|
|
|
// multiply by 4 since four samples (24 mix buffer floats) are processed per iteration
|
|
for ( k = 0; k < 24; k++ ) {
|
|
incL[k] *= 4;
|
|
}
|
|
|
|
// load the data
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
vecIncl5 = vec_ld( 63, &incL[0] );
|
|
vecIncl6 = vec_ld( 79, &incL[0] );
|
|
vecIncl7 = vec_ld( 95, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
|
|
vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
|
|
vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
vecSL5 = vec_ld( 63, &sL[0] );
|
|
vecSL6 = vec_ld( 79, &sL[0] );
|
|
vecSL7 = vec_ld( 95, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
|
|
vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
|
|
vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
|
|
//need a cleanup loop
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
|
|
//load mix buffer into vectors
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
|
|
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
|
|
vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
|
|
vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
|
|
vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
|
|
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
|
|
vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
|
|
vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want
|
|
vecSamples1 = vec_splat( vecSamplesLd, 0 );
|
|
vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
|
|
vecSamples3 = vec_splat( vecSamplesLd, 1 );
|
|
vecSamples4 = vec_splat( vecSamplesLd, 2 );
|
|
vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
|
|
vecSamples6 = vec_splat( vecSamplesLd, 3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
|
|
vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
|
|
vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
vecSL4 = vec_add( vecSL4, vecIncl4 );
|
|
vecSL5 = vec_add( vecSL5, vecIncl5 );
|
|
vecSL6 = vec_add( vecSL6, vecIncl6 );
|
|
}
|
|
}
|
|
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerStereo
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
float incL[12];
|
|
float sL[12];
|
|
int i;
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
|
|
// permute vectors for sample
|
|
vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
|
|
vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
sL[0] = lastV[0];
|
|
sL[1] = lastV[1];
|
|
sL[2] = lastV[2];
|
|
sL[3] = lastV[3];
|
|
sL[4] = lastV[4];
|
|
sL[5] = lastV[5];
|
|
sL[6] = lastV[0] + incL[0];
|
|
sL[7] = lastV[1] + incL[1];
|
|
sL[8] = lastV[2] + incL[2];
|
|
sL[9] = lastV[3] + incL[3];
|
|
sL[10] = lastV[4] + incL[4];
|
|
sL[11] = lastV[5] + incL[5];
|
|
|
|
// multiply by 2 since doing 12 at a time
|
|
incL[0] *= 2;
|
|
incL[1] *= 2;
|
|
incL[2] *= 2;
|
|
incL[3] *= 2;
|
|
incL[4] *= 2;
|
|
incL[5] *= 2;
|
|
incL[6] *= 2;
|
|
incL[7] *= 2;
|
|
incL[8] *= 2;
|
|
incL[9] *= 2;
|
|
incL[10] *= 2;
|
|
incL[11] *= 2;
|
|
|
|
//we aligned this data, so load it up
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
|
|
|
|
//load mix buffer into vectors, assume aligned
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want. For the 2nd vector,
|
|
//the order happens to be the same as the order we loaded them
|
|
//in, so there's no need to permute that one
|
|
vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
|
|
vecSamples2 = vecSamplesLd;
|
|
vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
|
|
//store out results
|
|
ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerStereo
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
float incL[12];
|
|
float sL[12];
|
|
|
|
int i;
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
|
|
// permute vectors for sample
|
|
vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
|
|
vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
sL[0] = lastV[0];
|
|
sL[1] = lastV[1];
|
|
sL[2] = lastV[2];
|
|
sL[3] = lastV[3];
|
|
sL[4] = lastV[4];
|
|
sL[5] = lastV[5];
|
|
sL[6] = lastV[0] + incL[0];
|
|
sL[7] = lastV[1] + incL[1];
|
|
sL[8] = lastV[2] + incL[2];
|
|
sL[9] = lastV[3] + incL[3];
|
|
sL[10] = lastV[4] + incL[4];
|
|
sL[11] = lastV[5] + incL[5];
|
|
|
|
// multiply by 2 since doing 12 at a time
|
|
incL[0] *= 2;
|
|
incL[1] *= 2;
|
|
incL[2] *= 2;
|
|
incL[3] *= 2;
|
|
incL[4] *= 2;
|
|
incL[5] *= 2;
|
|
incL[6] *= 2;
|
|
incL[7] *= 2;
|
|
incL[8] *= 2;
|
|
incL[9] *= 2;
|
|
incL[10] *= 2;
|
|
incL[11] *= 2;
|
|
|
|
// load the data
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
|
|
|
|
//load mix buffer into vectors
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
|
|
vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want. For the 2nd vector,
|
|
//the order happens to be the same as the order we loaded them
|
|
//in, so there's no need to permute that one
|
|
vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
|
|
vecSamples2 = vecSamplesLd;
|
|
vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixedSoundToSamples
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
|
|
//this is basically a clamp for sound mixing
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector signed int vi0, vi1, vi2, vi3;
|
|
register vector signed short vs0, vs1;
|
|
register vector float minVec, maxVec, constVec;
|
|
int i = 0;
|
|
|
|
//unaligned at start, since samples is not 16-byte aligned
|
|
for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
|
|
samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
|
|
}
|
|
|
|
constVec = (vector float)(65536.0f);
|
|
|
|
//splat min/max into a vector
|
|
minVec = (vector float)(-32768.0f);
|
|
maxVec = (vector float)(32767.0f);
|
|
|
|
vector float vecOld = vec_ld( 0, &mixBuffer[i] );
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < numSamples; i += 16 ) {
|
|
//load source
|
|
v0 = vecOld;
|
|
v1 = vec_ld( 15, &mixBuffer[i] );
|
|
v2 = vec_ld( 31, &mixBuffer[i] );
|
|
v3 = vec_ld( 31, &mixBuffer[i] );
|
|
vecOld = vec_ld( 47, &mixBuffer[i] );
|
|
|
|
v0 = vec_perm( v0, v1, permVec );
|
|
v1 = vec_perm( v1, v2, permVec );
|
|
v2 = vec_perm( v2, v3, permVec );
|
|
v3 = vec_perm( v3, vecOld, permVec );
|
|
|
|
//apply minimum
|
|
v4 = vec_max( v0, minVec );
|
|
v5 = vec_max( v1, minVec );
|
|
v6 = vec_max( v2, minVec );
|
|
v7 = vec_max( v3, minVec );
|
|
|
|
//apply maximum
|
|
v4 = vec_min( v4, maxVec );
|
|
v5 = vec_min( v5, maxVec );
|
|
v6 = vec_min( v6, maxVec );
|
|
v7 = vec_min( v7, maxVec );
|
|
|
|
// convert floats to ints
|
|
vi0 = vec_cts( v4, 0 );
|
|
vi1 = vec_cts( v5, 0 );
|
|
vi2 = vec_cts( v6, 0 );
|
|
vi3 = vec_cts( v7, 0 );
|
|
|
|
// pack ints into shorts
|
|
vs0 = vec_pack( vi0, vi1 );
|
|
vs1 = vec_pack( vi2, vi3 );
|
|
ALIGNED_STORE2( &samples[i], vs0, vs1 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < numSamples ; i++ ) {
|
|
samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
|
|
}
|
|
}
|
|
#endif /* ENABLE_SOUND_ROUTINES */
|
|
|
|
#endif /* MACOS_X */
|