9416 lines
312 KiB
C++
9416 lines
312 KiB
C++
// Copyright (C) 2004 Id Software, Inc.
|
|
//
|
|
|
|
|
|
#include "../precompiled.h"
|
|
#pragma hdrstop
|
|
|
|
#include "Simd_Generic.h"
|
|
#include "Simd_AltiVec.h"
|
|
|
|
//===============================================================
|
|
//
|
|
// AltiVec implementation of idSIMDProcessor
|
|
//
|
|
// Doom3 SIMD Library version 0.5
|
|
// Patrick Flanagan (pflanagan@apple.com)
|
|
// Sanjay Patel (spatel@apple.com)
|
|
// Architecture & Performance Group, Apple Computer
|
|
//
|
|
//===============================================================
|
|
|
|
#if defined(MACOS_X) && defined(__ppc__)
|
|
|
|
#include <math.h>
|
|
#include <float.h>
|
|
|
|
#ifdef PPC_INTRINSICS
|
|
#include <ppc_intrinsics.h>
|
|
#endif
|
|
|
|
#if defined(bool) && __GNUC__ < 4
|
|
#undef bool
|
|
#endif
|
|
|
|
// Data struct sizes, expressed as a number of floats

#ifdef DRAWVERT_PADDED
// padded layout: 64 bytes, 16 floats
#define DRAWVERT_OFFSET 16
#else
// unpadded layout: 60 bytes, 15 floats at 4 bytes each
#define DRAWVERT_OFFSET 15
#endif
// 16 bytes each, 4 floats
#define PLANE_OFFSET 4
// 16 bytes each, 4 floats
#define IDVEC4_OFFSET 4
|
|
|
|
// Alignment tests
|
|
#define IS_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F ) == 0 )
|
|
#define NOT_16BYTE_ALIGNED( x ) ( ( (unsigned long)&x & 0x0F) != 0 )
|
|
|
|
// Aligned storing floats
// Stores two vectors to consecutive 16 byte slots starting at ADDR.
// Wrapped in do/while(0) so the macro is a single statement and stays
// correct under an unbraced if/else/for.
#define ALIGNED_STORE2( ADDR, V0, V1 ) \
	do { \
		vec_st( (V0), 0, (ADDR) ); \
		vec_st( (V1), 16, (ADDR) ); \
	} while ( 0 )
|
|
|
|
// Stores three vectors to consecutive 16 byte slots starting at ADDR.
// Wrapped in do/while(0) so the macro is a single statement.
#define ALIGNED_STORE3( ADDR, V0, V1, V2 ) \
	do { \
		vec_st( (V0), 0, (ADDR) ); \
		vec_st( (V1), 16, (ADDR) ); \
		vec_st( (V2), 32, (ADDR) ); \
	} while ( 0 )
|
|
|
|
// Stores four vectors to consecutive 16 byte slots starting at ADDR.
// Wrapped in do/while(0) so the macro is a single statement.
#define ALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) \
	do { \
		vec_st( (V0), 0, (ADDR) ); \
		vec_st( (V1), 16, (ADDR) ); \
		vec_st( (V2), 32, (ADDR) ); \
		vec_st( (V3), 48, (ADDR) ); \
	} while ( 0 )
|
|
|
|
// Stores six vectors to consecutive 16 byte slots starting at ADDR.
// Wrapped in do/while(0) so the macro is a single statement.
#define ALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) \
	do { \
		vec_st( (V0), 0, (ADDR) ); \
		vec_st( (V1), 16, (ADDR) ); \
		vec_st( (V2), 32, (ADDR) ); \
		vec_st( (V3), 48, (ADDR) ); \
		vec_st( (V4), 64, (ADDR) ); \
		vec_st( (V5), 80, (ADDR) ); \
	} while ( 0 )
|
|
|
|
// Stores eight vectors to consecutive 16 byte slots starting at ADDR.
// Wrapped in do/while(0) so the macro is a single statement.
#define ALIGNED_STORE8( ADDR, V0, V1, V2, V3, V4, V5, V6, V7 ) \
	do { \
		vec_st( (V0), 0, (ADDR) ); \
		vec_st( (V1), 16, (ADDR) ); \
		vec_st( (V2), 32, (ADDR) ); \
		vec_st( (V3), 48, (ADDR) ); \
		vec_st( (V4), 64, (ADDR) ); \
		vec_st( (V5), 80, (ADDR) ); \
		vec_st( (V6), 96, (ADDR) ); \
		vec_st( (V7), 112, (ADDR) ); \
	} while ( 0 )
|
|
|
|
// Unaligned storing floats. These assume that we can trash the input
// UNALIGNED_STORE1 rotates V0 in place by the alignment of ADDR and then
// writes the four floats one element at a time.
#define UNALIGNED_STORE1( ADDR, V0 ) { \
	/* rotate so every float lines up with the slot it is stored to */ \
	vector unsigned char ULStoreRotate = vec_lvsr( 0, ADDR ); \
	V0 = vec_perm( V0, V0, ULStoreRotate ); \
	/* use store element, once per float */ \
	vec_ste( V0, 0, ADDR ); \
	vec_ste( V0, 4, ADDR ); \
	vec_ste( V0, 8, ADDR ); \
	vec_ste( V0, 12, ADDR ); \
}
|
|
|
|
// Stores two vectors to a possibly unaligned ADDR. The quadwords at both ends
// are loaded first and merged back in so bytes outside the 32 byte destination
// keep their current values. V0 and V1 are rotated in place (inputs are trashed).
#define UNALIGNED_STORE2( ADDR, V0, V1 ) { \
	/* current memory contents at the first and last quadword touched */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 31, ADDR ); \
	/* rotation amount for ADDR, and the matching byte select mask */ \
	vector unsigned char ULStoreRotate = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSelect = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRotate ); \
	/* right rotate the data to be stored */ \
	V0 = vec_perm( V0, V0, ULStoreRotate ); \
	V1 = vec_perm( V1, V1, ULStoreRotate ); \
	/* stitch neighbouring pieces together into aligned quadwords */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSelect ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSelect ); \
	vector float ULStoreOut3 = vec_sel( V1, ULStoreTail, ULStoreSelect ); \
	/* write the quadwords back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); }
|
|
|
|
// Stores three vectors to a possibly unaligned ADDR. The quadwords at both ends
// are loaded first and merged back in so bytes outside the 48 byte destination
// keep their current values. V0..V2 are rotated in place (inputs are trashed).
#define UNALIGNED_STORE3( ADDR, V0, V1, V2 ) { \
	/* current memory contents at the first and last quadword touched */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 47, ADDR ); \
	/* rotation amount for ADDR, and the matching byte select mask */ \
	vector unsigned char ULStoreRotate = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSelect = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRotate ); \
	/* right rotate the data to be stored */ \
	V0 = vec_perm( V0, V0, ULStoreRotate ); \
	V1 = vec_perm( V1, V1, ULStoreRotate ); \
	V2 = vec_perm( V2, V2, ULStoreRotate ); \
	/* stitch neighbouring pieces together into aligned quadwords */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSelect ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSelect ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSelect ); \
	vector float ULStoreOut4 = vec_sel( V2, ULStoreTail, ULStoreSelect ); \
	/* write the quadwords back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); }
|
|
|
|
// Stores four vectors to a possibly unaligned ADDR. The quadwords at both ends
// are loaded first and merged back in so bytes outside the 64 byte destination
// keep their current values. V0..V3 are rotated in place (inputs are trashed).
#define UNALIGNED_STORE4( ADDR, V0, V1, V2, V3 ) { \
	/* current memory contents at the first and last quadword touched */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 63, ADDR ); \
	/* rotation amount for ADDR, and the matching byte select mask */ \
	vector unsigned char ULStoreRotate = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSelect = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRotate ); \
	/* right rotate the data to be stored */ \
	V0 = vec_perm( V0, V0, ULStoreRotate ); \
	V1 = vec_perm( V1, V1, ULStoreRotate ); \
	V2 = vec_perm( V2, V2, ULStoreRotate ); \
	V3 = vec_perm( V3, V3, ULStoreRotate ); \
	/* stitch neighbouring pieces together into aligned quadwords */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSelect ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSelect ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSelect ); \
	vector float ULStoreOut4 = vec_sel( V2, V3, ULStoreSelect ); \
	vector float ULStoreOut5 = vec_sel( V3, ULStoreTail, ULStoreSelect ); \
	/* write the quadwords back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); \
	vec_st( ULStoreOut5, 63, ADDR ); }
|
|
|
|
// Stores six vectors to a possibly unaligned ADDR. The quadwords at both ends
// are loaded first and merged back in so bytes outside the 96 byte destination
// keep their current values. V0..V5 are rotated in place (inputs are trashed).
#define UNALIGNED_STORE6( ADDR, V0, V1, V2, V3, V4, V5 ) { \
	/* current memory contents at the first and last quadword touched */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 95, ADDR ); \
	/* rotation amount for ADDR, and the matching byte select mask */ \
	vector unsigned char ULStoreRotate = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSelect = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRotate ); \
	/* right rotate the data to be stored */ \
	V0 = vec_perm( V0, V0, ULStoreRotate ); \
	V1 = vec_perm( V1, V1, ULStoreRotate ); \
	V2 = vec_perm( V2, V2, ULStoreRotate ); \
	V3 = vec_perm( V3, V3, ULStoreRotate ); \
	V4 = vec_perm( V4, V4, ULStoreRotate ); \
	V5 = vec_perm( V5, V5, ULStoreRotate ); \
	/* stitch neighbouring pieces together into aligned quadwords */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSelect ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSelect ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSelect ); \
	vector float ULStoreOut4 = vec_sel( V2, V3, ULStoreSelect ); \
	vector float ULStoreOut5 = vec_sel( V3, V4, ULStoreSelect ); \
	vector float ULStoreOut6 = vec_sel( V4, V5, ULStoreSelect ); \
	vector float ULStoreOut7 = vec_sel( V5, ULStoreTail, ULStoreSelect ); \
	/* write the quadwords back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); \
	vec_st( ULStoreOut5, 63, ADDR ); \
	vec_st( ULStoreOut6, 79, ADDR ); \
	vec_st( ULStoreOut7, 95, ADDR ); }
|
|
|
|
// Stores nine vectors to a possibly unaligned ADDR. The quadwords at both ends
// are loaded first and merged back in so bytes outside the 144 byte destination
// keep their current values. V0..V8 are rotated in place (inputs are trashed).
#define UNALIGNED_STORE9( ADDR, V0, V1, V2, V3, V4, V5, V6, V7, V8 ) { \
	/* current memory contents at the first and last quadword touched */ \
	vector float ULStoreHead = vec_ld( 0, ADDR ); \
	vector float ULStoreTail = vec_ld( 143, ADDR ); \
	/* rotation amount for ADDR, and the matching byte select mask */ \
	vector unsigned char ULStoreRotate = vec_sub( vec_lvsr( 15, ADDR ), (vector unsigned char)(1) ); \
	vector unsigned int ULStoreSelect = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), ULStoreRotate ); \
	/* right rotate the data to be stored */ \
	V0 = vec_perm( V0, V0, ULStoreRotate ); \
	V1 = vec_perm( V1, V1, ULStoreRotate ); \
	V2 = vec_perm( V2, V2, ULStoreRotate ); \
	V3 = vec_perm( V3, V3, ULStoreRotate ); \
	V4 = vec_perm( V4, V4, ULStoreRotate ); \
	V5 = vec_perm( V5, V5, ULStoreRotate ); \
	V6 = vec_perm( V6, V6, ULStoreRotate ); \
	V7 = vec_perm( V7, V7, ULStoreRotate ); \
	V8 = vec_perm( V8, V8, ULStoreRotate ); \
	/* stitch neighbouring pieces together into aligned quadwords */ \
	vector float ULStoreOut1 = vec_sel( ULStoreHead, V0, ULStoreSelect ); \
	vector float ULStoreOut2 = vec_sel( V0, V1, ULStoreSelect ); \
	vector float ULStoreOut3 = vec_sel( V1, V2, ULStoreSelect ); \
	vector float ULStoreOut4 = vec_sel( V2, V3, ULStoreSelect ); \
	vector float ULStoreOut5 = vec_sel( V3, V4, ULStoreSelect ); \
	vector float ULStoreOut6 = vec_sel( V4, V5, ULStoreSelect ); \
	vector float ULStoreOut7 = vec_sel( V5, V6, ULStoreSelect ); \
	vector float ULStoreOut8 = vec_sel( V6, V7, ULStoreSelect ); \
	vector float ULStoreOut9 = vec_sel( V7, V8, ULStoreSelect ); \
	vector float ULStoreOut10 = vec_sel( V8, ULStoreTail, ULStoreSelect ); \
	/* write the quadwords back */ \
	vec_st( ULStoreOut1, 0, ADDR ); \
	vec_st( ULStoreOut2, 15, ADDR ); \
	vec_st( ULStoreOut3, 31, ADDR ); \
	vec_st( ULStoreOut4, 47, ADDR ); \
	vec_st( ULStoreOut5, 63, ADDR ); \
	vec_st( ULStoreOut6, 79, ADDR ); \
	vec_st( ULStoreOut7, 95, ADDR ); \
	vec_st( ULStoreOut8, 111, ADDR ); \
	vec_st( ULStoreOut9, 127, ADDR ); \
	vec_st( ULStoreOut10, 143, ADDR ); }
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::GetName
|
|
============
|
|
*/
|
|
const char *idSIMD_AltiVec::GetName( void ) const {
|
|
return "AltiVec";
|
|
}
|
|
|
|
/*
|
|
Helper Functions
|
|
*/
|
|
#if 0
// Debug-only overloads, currently compiled out. Each one prints msg followed
// by the contents of a vector through the vector conversion of printf.
// Useful while debugging, but should never be called in real code.

// vector float
inline void debugPrintVector( vector float v, char *msg ) {
	printf("%s -- %vf\n", msg, v );
}

// vector unsigned int
inline void debugPrintVector( vector unsigned int v, char *msg ) {
	printf("%s -- %vd\n", msg, v );
}

// vector bool int
inline void debugPrintVector( vector bool int v, char *msg ) {
	printf("%s -- %vi\n", msg, v );
}

// vector unsigned char
inline void debugPrintVector( vector unsigned char v, char *msg ) {
	printf("%s -- %vuc\n", msg, v );
}

// vector unsigned short
inline void debugPrintVector( vector unsigned short v, char *msg ) {
	printf("%s -- %vs\n", msg, v );
}
#endif
|
|
/*
|
|
===============
|
|
Reciprocal
|
|
|
|
For each element in vector:
|
|
n = 1 / n
|
|
===============
|
|
*/
|
|
|
|
// Use Newton-Raphson to calculate reciprocal of a vector
|
|
inline vector float Reciprocal( vector float v ) {
|
|
//Get the reciprocal estimate
|
|
vector float estimate = vec_re( v );
|
|
//One round of Newton-Raphson refinement
|
|
return vec_madd( vec_nmsub( estimate, v, (vector float) (1.0) ), estimate, estimate );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
ReciprocalSquareRoot
|
|
|
|
For each element in vector:
|
|
n = 1 / sqrt(n)
|
|
===============
|
|
*/
|
|
// Reciprocal square root estimate of a vector
|
|
inline vector float ReciprocalSquareRoot( vector float v ) {
|
|
//Get the square root reciprocal estimate
|
|
vector float zero = (vector float)(0);
|
|
vector float oneHalf = (vector float)(0.5);
|
|
vector float one = (vector float)(1.0);
|
|
vector float estimate = vec_rsqrte( vec_max( v, (vector float)(FLT_MIN) ) );
|
|
|
|
//One round of Newton-Raphson refinement
|
|
vector float estimateSquared = vec_madd( estimate, estimate, zero );
|
|
vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
|
|
return vec_madd( vec_nmsub( v, estimateSquared, one ), halfEstimate, estimate );
|
|
}
|
|
|
|
|
|
/*
|
|
===============
|
|
Divide
|
|
|
|
For each element in vectors:
|
|
n = a / b
|
|
===============
|
|
*/
|
|
// Use reciprocal estimate and multiply to divide a vector
|
|
inline vector float Divide( vector float a, vector float b ) {
|
|
return vec_madd( a, Reciprocal( b ), (vector float)(0) );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
loadSplatUnalignedScalar
|
|
|
|
For each element in vector:
|
|
n = s
|
|
===============
|
|
*/
|
|
inline vector float loadSplatUnalignedScalar( const float *s ) {
|
|
vector unsigned char splatMap = vec_lvsl( 0, s );
|
|
vector float v = vec_ld( 0, s );
|
|
splatMap = (vector unsigned char) vec_splat( (vector float) splatMap, 0 );
|
|
return vec_perm( v, v, splatMap );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
VectorATan16
|
|
|
|
For each element in vector:
|
|
n = idMath::ATan16( x, y )
|
|
===============
|
|
*/
|
|
// calculates arc tangent of a vector with 16 bits of precision, based on atan16 in idMath
|
|
inline vector float VectorATan16( vector float x, vector float y ) {
|
|
|
|
vector float xDivY = Divide( x, y );
|
|
vector float yDivX = Divide( y, x );
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
vector bool int vecCmp = vec_cmpgt( vec_abs( y ), vec_abs( x ) );
|
|
vector float vecA = vec_sel( yDivX, xDivY, vecCmp );
|
|
vector bool int vecCmp2 = vec_cmplt( vecA, zeroVector );
|
|
vector float vecS = vec_madd( vecA, vecA, (vector float)(0) );
|
|
|
|
// do calculation for S
|
|
vector float vecWork1 = vec_madd( (vector float)(0.0028662257f), vecS, (vector float)(-0.0161657367f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.0429096138f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.0752896400f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1065626393f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.1420889944f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(0.1999355085f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(-0.3333314528f) );
|
|
vecWork1 = vec_madd( vecWork1, vecS, (vector float)(1) );
|
|
|
|
// get the regular S value
|
|
vecS = vec_madd( vecWork1, vecA, (vector float)(0) );
|
|
|
|
// calculate what to return if y > x
|
|
vector float negSPlusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(0.5f * 3.14159265358979323846f) );
|
|
vector float negSMinusHalfPI = vec_madd( vecS, (vector float)(-1), (vector float)(-0.5f * 3.14159265358979323846f) );
|
|
vector float modRet = vec_sel( negSPlusHalfPI, negSMinusHalfPI, vecCmp2 );
|
|
|
|
return vec_sel( modRet, vecS, vecCmp );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
VectorSin16
|
|
|
|
For each element in vector:
|
|
n = idMath::Sin16( v )
|
|
===============
|
|
*/
|
|
inline vector float VectorSin16( vector float v ) {
|
|
vector float zero = (vector float)(0);
|
|
|
|
#if 0
|
|
// load up half PI and use it to calculate the rest of the values. This is
|
|
// sometimes cheaper than loading them from memory
|
|
|
|
vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
|
|
vector float PI = vec_add( halfPI, halfPI );
|
|
vector float oneandhalfPI = vec_add( PI, halfPI );
|
|
vector float twoPI = vec_add( oneandhalfPI, halfPI );
|
|
#else
|
|
vector float halfPI = (vector float) ( 0.5f * 3.14159265358979323846f );
|
|
vector float PI = (vector float)(3.14159265358979323846f);
|
|
vector float oneandhalfPI = (vector float)(3.14159265358979323846f + ( 0.5f * 3.14159265358979323846f ) );
|
|
vector float twoPI = (vector float)( 2.0f * 3.14159265358979323846f);
|
|
#endif
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4;
|
|
|
|
vector float vecMod;
|
|
vector float vecResult;
|
|
|
|
// fix the range if needbe
|
|
vecMod = vec_floor( Divide( v, twoPI ) );
|
|
vecResult = vec_nmsub( vecMod, twoPI, v );
|
|
|
|
vector float vecPIminusA = vec_sub( PI, vecResult );
|
|
vector float vecAminus2PI = vec_sub( vecResult, twoPI );
|
|
|
|
vecCmp1 = vec_cmplt( vecResult, PI );
|
|
vecCmp2 = vec_cmpgt( vecResult, halfPI );
|
|
|
|
// these are the ones where a > PI + HALF_PI so set a = a - TWO_PI
|
|
vecCmp3 = vec_cmpgt( vecResult, oneandhalfPI );
|
|
|
|
// we also want to set a = PI - a everywhere that !(a < PI) and !(a > PI + HALF_PI)
|
|
vecCmp4 = vec_and( vec_xor( vecCmp3, (vector bool int)(1) ), vec_xor( vecCmp1, (vector bool int)(1) ) ); // everywhere that both of those are false
|
|
|
|
// these are ones where a < PI and a > HALF_PI so we set a = PI - a
|
|
vecCmp1 = vec_and( vecCmp1, vecCmp2 );
|
|
vecCmp1 = vec_or( vecCmp1, vecCmp4 );
|
|
|
|
// put the correct values into place
|
|
vecResult = vec_sel( vecResult, vecPIminusA, vecCmp1 );
|
|
vecResult = vec_sel( vecResult, vecAminus2PI, vecCmp3 );
|
|
|
|
// calculate answer
|
|
vector float vecASquared = vec_madd( vecResult, vecResult, zero );
|
|
vector float vecEst = vec_madd( (vector float)(-2.39e-08f), vecASquared, (vector float)(2.7526e-06f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.98409e-04f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(8.3333315e-03f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(-1.666666664e-01f) );
|
|
vecEst = vec_madd( vecEst, vecASquared, (vector float)(1.0f) );
|
|
return vec_madd( vecResult, vecEst, zero );
|
|
}
|
|
|
|
/*
|
|
===============
|
|
vecSplatWithRunTime
|
|
|
|
For each element in vector:
|
|
n = v(i)
|
|
===============
|
|
*/
|
|
// splats an element across a vector using a runtime variable
|
|
inline vector float vecSplatWithRunTime( vector float v, int i ) {
|
|
vector unsigned char rotate = vec_lvsl( i * sizeof( float ), (int*) 0L );
|
|
v = vec_perm( v, v, rotate );
|
|
return vec_splat( v, 0 );
|
|
}
|
|
|
|
|
|
/*
|
|
===============
|
|
FastScalarInvSqrt
|
|
|
|
n = 1 / sqrt( f )
|
|
===============
|
|
*/
|
|
inline float FastScalarInvSqrt( float f ) {
|
|
#ifdef PPC_INTRINSICS
|
|
float estimate;
|
|
const float kSmallestFloat = FLT_MIN;
|
|
|
|
//Calculate a 5 bit starting estimate for the reciprocal sqrt
|
|
estimate = __frsqrte ( f + kSmallestFloat );
|
|
|
|
//if you require less precision, you may reduce the number of loop iterations.
|
|
// This will do 2 rounds of NR
|
|
estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
|
|
estimate = estimate + 0.5f * estimate * ( 1.0f - f * estimate * estimate );
|
|
return estimate;
|
|
#else
|
|
return idMath::InvSqrt( f );
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
===============
|
|
FastScalarInvSqrt_x3
|
|
|
|
arg1 = 1 / sqrt( arg1 )
|
|
arg2 = 1 / sqrt( arg2 )
|
|
arg3 = 1 / sqrt( arg3 )
|
|
===============
|
|
*/
|
|
inline void FastScalarInvSqrt_x3( float *arg1, float *arg2, float *arg3 ) {
|
|
#ifdef PPC_INTRINSICS
|
|
register float estimate1, estimate2, estimate3;
|
|
const float kSmallestFloat = FLT_MIN;
|
|
|
|
//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
|
|
estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
|
|
estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
|
|
estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
|
|
|
|
// two rounds newton-raphson
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
|
|
*arg1 = estimate1;
|
|
*arg2 = estimate2;
|
|
*arg3 = estimate3;
|
|
#else
|
|
*arg1 = idMath::InvSqrt( *arg1 );
|
|
*arg2 = idMath::InvSqrt( *arg2 );
|
|
*arg3 = idMath::InvSqrt( *arg3 );
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
===============
|
|
FastScalarInvSqrt_x6
|
|
|
|
arg1 = 1 / sqrt( arg1 )
|
|
arg2 = 1 / sqrt( arg2 )
|
|
arg3 = 1 / sqrt( arg3 )
|
|
arg4 = 1 / sqrt( arg4 )
|
|
arg5 = 1 / sqrt( arg5 )
|
|
arg6 = 1 / sqrt( arg6 )
|
|
|
|
On a G5, you've got 2 pipeline stages to fill. (2 FPU's with 6 stages each)
|
|
===============
|
|
*/
|
|
inline void FastScalarInvSqrt_x6( float *arg1, float *arg2, float *arg3, float *arg4, float *arg5, float *arg6 ) {
|
|
#ifdef PPC_INTRINSICS
|
|
register float estimate1, estimate2, estimate3, estimate4, estimate5, estimate6;
|
|
const float kSmallestFloat = FLT_MIN;
|
|
|
|
//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
|
|
estimate1 = __frsqrte ( *arg1 + kSmallestFloat );
|
|
estimate2 = __frsqrte ( *arg2 + kSmallestFloat );
|
|
estimate3 = __frsqrte ( *arg3 + kSmallestFloat );
|
|
estimate4 = __frsqrte ( *arg4 + kSmallestFloat );
|
|
estimate5 = __frsqrte ( *arg5 + kSmallestFloat );
|
|
estimate6 = __frsqrte ( *arg6 + kSmallestFloat );
|
|
|
|
// two rounds newton-raphson
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
|
|
estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
|
|
estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
|
|
|
|
estimate1 = estimate1 + 0.5f * estimate1 * ( 1.0f - *arg1 * estimate1 * estimate1 );
|
|
estimate2 = estimate2 + 0.5f * estimate2 * ( 1.0f - *arg2 * estimate2 * estimate2 );
|
|
estimate3 = estimate3 + 0.5f * estimate3 * ( 1.0f - *arg3 * estimate3 * estimate3 );
|
|
estimate4 = estimate4 + 0.5f * estimate4 * ( 1.0f - *arg4 * estimate4 * estimate4 );
|
|
estimate5 = estimate5 + 0.5f * estimate5 * ( 1.0f - *arg5 * estimate5 * estimate5 );
|
|
estimate6 = estimate6 + 0.5f * estimate6 * ( 1.0f - *arg6 * estimate6 * estimate6 );
|
|
|
|
*arg1 = estimate1;
|
|
*arg2 = estimate2;
|
|
*arg3 = estimate3;
|
|
*arg4 = estimate4;
|
|
*arg5 = estimate5;
|
|
*arg6 = estimate6;
|
|
#else
|
|
*arg1 = idMath::InvSqrt( *arg1 );
|
|
*arg2 = idMath::InvSqrt( *arg2 );
|
|
*arg3 = idMath::InvSqrt( *arg3 );
|
|
*arg4 = idMath::InvSqrt( *arg4 );
|
|
*arg5 = idMath::InvSqrt( *arg5 );
|
|
*arg6 = idMath::InvSqrt( *arg6 );
|
|
#endif
|
|
}
|
|
|
|
|
|
// End Helper Functions
|
|
|
|
#ifdef ENABLE_SIMPLE_MATH
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Add
|
|
|
|
dst[i] = constant + src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Add( float *dst, const float constant, const float *src, const int count ) {
|
|
vector float v0, v1, v2, v3;
|
|
vector float v0_low, v0_hi, v1_hi;
|
|
vector unsigned char permVec;
|
|
vector float constVec;
|
|
int i;
|
|
|
|
// handle unaligned cases at beginning
|
|
for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant + src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), (vector unsigned char)(1) );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v0_hi, v1_hi, permVec );
|
|
|
|
v2 = vec_add( v0, constVec );
|
|
v3 = vec_add( v1, constVec );
|
|
|
|
// store results
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = constant + src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Add
|
|
|
|
dst[i] = src0[i] + src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Add( float *dst, const float *src0, const float *src1, const int count ) {
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
|
|
int i;
|
|
|
|
//unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src0[i] + src1[i];
|
|
}
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
v4 = vec_add( v0, v1 );
|
|
v5 = vec_add( v2, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] + src1[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Sub
|
|
|
|
dst[i] = constant - src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float constant, const float *src, const int count ) {
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
register vector unsigned char permVec;
|
|
register vector float constVec;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant - src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute vector and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
v2 = vec_sub( constVec, v0 );
|
|
v3 = vec_sub( constVec, v1 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = constant - src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Sub
|
|
|
|
dst[i] = src0[i] - src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Sub( float *dst, const float *src0, const float *src1, const int count ) {
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src0[i] - src1[i];
|
|
}
|
|
|
|
//calculate permute and do first loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
v4 = vec_sub( v0, v1 );
|
|
v5 = vec_sub( v2, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] - src1[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Mul
|
|
|
|
dst[i] = constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float constant, const float *src, const int count) {
|
|
register vector float v0, v0_low, v0_hi, v1_low, v1_hi, v1, v2, v3;
|
|
register vector float constVec;
|
|
register vector unsigned char permVec;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
int i;
|
|
|
|
// handle unaligned data at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = constant * src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
v2 = vec_madd( constVec, v0, zeroVector );
|
|
v3 = vec_madd( constVec, v1, zeroVector );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = constant * src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Mul
|
|
|
|
dst[i] = src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Mul( float *dst, const float *src0, const float *src1, const int count ) {
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1, permVec2;
|
|
register vector float constVec = (vector float)(0.0);
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = src0[i] * src1[i];
|
|
}
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
//no such thing as regular multiply so we do
|
|
//multiply then add zero
|
|
v4 = vec_madd( v0, v1, constVec );
|
|
v5 = vec_madd( v2, v3, constVec );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] * src1[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Div
|
|
|
|
dst[i] = constant / divisor[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Div( float *dst, const float constant, const float *divisor, const int count ) {
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
register vector unsigned char permVec;
|
|
register vector float constVec;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = constant / divisor[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do first loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &divisor[i] ), oneCharVector );
|
|
v1_hi = vec_ld( 0, &divisor[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &divisor[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &divisor[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
v2 = Divide( constVec, v0 );
|
|
v3 = Divide( constVec, v1 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = constant / divisor[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Div
|
|
|
|
dst[i] = src0[i] / src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Div( float *dst, const float *src0, const float *src1, const int count ) {
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] = src0[i] / src1[i];
|
|
}
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
v4 = Divide( v0, v1 );
|
|
v5 = Divide( v2, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src0[i] / src1[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulAdd
|
|
|
|
dst[i] += constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float constant, const float *src, const int count ) {
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
register vector float constVec;
|
|
//src
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] += constant * src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
|
|
// at this point, dst is known to be aligned
|
|
v1 = vec_ld( 0, &dst[i] );
|
|
v3 = vec_ld( 16, &dst[i] );
|
|
|
|
v4 = vec_madd( constVec, v0, v1 );
|
|
v5 = vec_madd( constVec, v2, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] += constant * src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulAdd
|
|
|
|
dst[i] += src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MulAdd( float *dst, const float *src0, const float *src1, const int count ) {
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
|
|
int i;
|
|
|
|
//unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] += src0[i] * src1[i];
|
|
}
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
// load sources
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
//we know dst is aligned because we handled unaligned cases
|
|
//up front
|
|
v4 = vec_ld( 0, &dst[i] );
|
|
v5 = vec_ld( 16, &dst[i] );
|
|
|
|
v6 = vec_madd( v0, v1, v4 );
|
|
v7 = vec_madd( v2, v3, v5 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v6, v7 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] += src0[i] * src1[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulSub
|
|
|
|
dst[i] -= constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float constant, const float *src, const int count ) {
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
register vector float constVec;
|
|
//src
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//handle unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] -= constant * src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
|
|
//we know dst will be aligned here because we already handled the preceeding
|
|
//unaligned cases
|
|
v1 = vec_ld( 0, &dst[i] );
|
|
v3 = vec_ld( 16, &dst[i] );
|
|
|
|
v4 = vec_nmsub( v0, constVec, v1 );
|
|
v5 = vec_nmsub( v2, constVec, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] -= constant * src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulSub
|
|
|
|
dst[i] -= src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MulSub( float *dst, const float *src0, const float *src1, const int count ) {
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
//src0
|
|
register vector float v0_low, v0_hi, v2_low, v2_hi;
|
|
//src1
|
|
register vector float v1_low, v1_hi, v3_low, v3_hi;
|
|
//permute vectors
|
|
register vector unsigned char permVec1, permVec2;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i;
|
|
|
|
//unaligned at start
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] -= src0[i] * src1[i];
|
|
}
|
|
|
|
//calculate permute and do loads
|
|
permVec1 = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneCharVector );
|
|
permVec2 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), oneCharVector );
|
|
v2_hi = vec_ld( 0, &src0[i] );
|
|
v3_hi = vec_ld( 0, &src1[i] );
|
|
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
// load sources
|
|
v0_low = v2_hi;
|
|
v0_hi = vec_ld( 15, &src0[i] );
|
|
v2_low = v0_hi;
|
|
v2_hi = vec_ld( 31, &src0[i] );
|
|
|
|
v1_low = v3_hi;
|
|
v1_hi = vec_ld( 15, &src1[i] );
|
|
v3_low = v1_hi;
|
|
v3_hi = vec_ld( 31, &src1[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec1 );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec2 );
|
|
v2 = vec_perm( v2_low, v2_hi, permVec1 );
|
|
v3 = vec_perm( v3_low, v3_hi, permVec2 );
|
|
|
|
//we know dst is aligned because we handled unaligned cases
|
|
//up front
|
|
v4 = vec_ld( 0, &dst[i] );
|
|
v5 = vec_ld( 16, &dst[i] );
|
|
|
|
v6 = vec_nmsub( v0, v1, v4 );
|
|
v7 = vec_nmsub( v2, v3, v5 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v6, v7 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] -= src0[i] * src1[i];
|
|
}
|
|
}
|
|
|
|
#endif /* ENABLE_SIMPLE_MATH */
|
|
|
|
#ifdef ENABLE_DOT
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idVec3 *src, const int count ) {
|
|
|
|
register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
|
|
register vector float vecX, vecY, vecZ;
|
|
vector float vecX2, vecY2, vecZ2;
|
|
const float *addr = src[0].ToFloatPtr();
|
|
float tempVal[4];
|
|
float constVal[4];
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float vecConstX, vecConstY, vecConstZ;
|
|
|
|
// permute vectors
|
|
register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
|
|
register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
|
|
|
|
register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
|
|
register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
|
|
|
|
register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
|
|
register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
|
|
|
|
int i;
|
|
|
|
// for scalar cleanup, if necessary
|
|
constVal[0] = constant[0];
|
|
constVal[1] = constant[1];
|
|
constVal[2] = constant[2];
|
|
constVal[3] = 0;
|
|
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
|
|
vecLd2 = vec_ld( 11, constant.ToFloatPtr() );
|
|
vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
|
|
|
|
|
|
// populate const vectors
|
|
vecConstX = vec_splat( vecLd1, 0 );
|
|
vecConstY = vec_splat( vecLd1, 1 );
|
|
vecConstZ = vec_splat( vecLd1, 2 );
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, addr );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant * src[i];
|
|
}
|
|
|
|
for ( ; i + 7 < count; i += 8 ) {
|
|
float *vecPtr = (float*)( addr + (i*3) );
|
|
vector float v0, v1, v2, v3, v4, v5;
|
|
|
|
v0 = vecOld; //vec_ld( 0, vecPtr );
|
|
v1 = vec_ld( 15, vecPtr );
|
|
v2 = vec_ld( 31, vecPtr );
|
|
v3 = vec_ld( 47, vecPtr );
|
|
v4 = vec_ld( 63, vecPtr );
|
|
v5 = vec_ld( 79, vecPtr );
|
|
vecOld = vec_ld( 95, vecPtr );
|
|
|
|
vecLd1 = vec_perm( v0, v1, permVec );
|
|
vecLd2 = vec_perm( v1, v2, permVec );
|
|
vecLd3 = vec_perm( v2, v3, permVec );
|
|
|
|
vecLd4 = vec_perm( v3, v4, permVec );
|
|
vecLd5 = vec_perm( v4, v5, permVec );
|
|
vecLd6 = vec_perm( v5, vecOld, permVec );
|
|
|
|
// permute into X Y Z vectors
|
|
vecX = vec_perm( vecLd1, vecLd2, permX1 );
|
|
vecY = vec_perm( vecLd1, vecLd2, permY1 );
|
|
vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
|
|
vecX = vec_perm( vecX, vecLd3, permX2 );
|
|
vecY = vec_perm( vecY, vecLd3, permY2 );
|
|
vecZ = vec_perm( vecZ, vecLd3, permZ2 );
|
|
|
|
vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
|
|
vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
|
|
vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
|
|
vecX2 = vec_perm( vecX2, vecLd6, permX2 );
|
|
vecY2 = vec_perm( vecY2, vecLd6, permY2 );
|
|
vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
|
|
|
|
// do multiply
|
|
vecX = vec_madd( vecX, vecConstX, zeroVector );
|
|
vecY = vec_madd( vecY, vecConstY, vecX );
|
|
vecZ = vec_madd( vecZ, vecConstZ, vecY );
|
|
|
|
vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
|
|
vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
|
|
vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
|
|
|
|
// store out results
|
|
ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < count; i++ ) {
|
|
// look up whats at the address we want, cast it as float pointer, then
|
|
// dereference that pointer
|
|
tempVal[0] = *( addr + (i*3) + 0 );
|
|
tempVal[1] = *( addr + (i*3) + 1 );
|
|
tempVal[2] = *( addr + (i*3) + 2 );
|
|
dst[i] = constVal[0] * tempVal[0] + constVal[1] * tempVal[1] + constVal[2] * tempVal[2];
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i].Normal() + src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idPlane *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant * src[(X)].Normal() + src[(X)][3];
|
|
|
|
assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
float constVal[4];
|
|
float srcVal[3];
|
|
float srcI3;
|
|
float tempVal;
|
|
|
|
vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
|
|
vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
|
|
vector float vecX, vecY, vecZ, vecI3;
|
|
vector float vecX2, vecY2, vecZ2, vecI32;
|
|
vector float vecConstX, vecConstY, vecConstZ;
|
|
|
|
constVal[0] = constant[0];
|
|
constVal[1] = constant[1];
|
|
constVal[2] = constant[2];
|
|
constVal[3] = 1;
|
|
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
|
|
vector float v1 = vec_ld( 11, constant.ToFloatPtr() );
|
|
vector float vecConst = vec_perm( v0, v1, constPerm );
|
|
|
|
vecConstX = vec_splat( vecConst, 0 );
|
|
vecConstY = vec_splat( vecConst, 1 );
|
|
vecConstZ = vec_splat( vecConst, 2 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant * src[i].Normal() + src[i][3];
|
|
}
|
|
|
|
const float *addr = src[i].ToFloatPtr();
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, addr );
|
|
|
|
for ( ; i + 7 < count; i += 8 ) {
|
|
float *planePtr = (float*)( addr + (i*PLANE_OFFSET) );
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
v0 = vecOld; //vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
v2 = vec_ld( 31, planePtr );
|
|
v3 = vec_ld( 47, planePtr );
|
|
v4 = vec_ld( 63, planePtr );
|
|
v5 = vec_ld( 79, planePtr );
|
|
v6 = vec_ld( 95, planePtr );
|
|
v7 = vec_ld( 111, planePtr );
|
|
vecOld = vec_ld( 127, planePtr );
|
|
|
|
vecPlaneLd1 = vec_perm( v0, v1, permVec );
|
|
vecPlaneLd2 = vec_perm( v1, v2, permVec );
|
|
vecPlaneLd3 = vec_perm( v2, v3, permVec );
|
|
vecPlaneLd4 = vec_perm( v3, v4, permVec );
|
|
|
|
vecPlaneLd5 = vec_perm( v4, v5, permVec );
|
|
vecPlaneLd6 = vec_perm( v5, v6, permVec );
|
|
vecPlaneLd7 = vec_perm( v6, v7, permVec );
|
|
vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
|
|
|
|
// permute into X Y Z vectors, since this is square its basically
|
|
// a matrix transpose
|
|
v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
|
|
v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
|
|
v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
|
|
v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
|
|
|
|
vecX = vec_mergeh( v0, v1 );
|
|
vecY = vec_mergel( v0, v1 );
|
|
vecZ = vec_mergeh( v2, v3 );
|
|
vecI3 = vec_mergel( v2, v3 );
|
|
|
|
v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
|
|
v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
|
|
v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
|
|
v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
|
|
|
|
vecX2 = vec_mergeh( v4, v5 );
|
|
vecY2 = vec_mergel( v4, v5 );
|
|
vecZ2 = vec_mergeh( v6, v7 );
|
|
vecI32 = vec_mergel( v6, v7 );
|
|
|
|
// do calculation
|
|
v6 = vec_madd( vecZ, vecConstZ, vecI3 );
|
|
v5 = vec_madd( vecY, vecConstY, v6 );
|
|
v4 = vec_madd( vecX, vecConstX, v5 );
|
|
|
|
v0 = vec_madd( vecZ2, vecConstZ, vecI32 );
|
|
v1 = vec_madd( vecY2, vecConstY, v0 );
|
|
v2 = vec_madd( vecX2, vecConstX, v1 );
|
|
|
|
// store results
|
|
ALIGNED_STORE2( &dst[i], v4, v2 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
// populate srcVal with src X Y Z
|
|
srcVal[0] = *(addr + (i*PLANE_OFFSET) + 0 );
|
|
srcVal[1] = *(addr + (i*PLANE_OFFSET) + 1 );
|
|
srcVal[2] = *(addr + (i*PLANE_OFFSET) + 2 );
|
|
|
|
// put src[i][3] into srcI3
|
|
srcI3 = *(addr + (i*PLANE_OFFSET) + 3 );
|
|
|
|
tempVal = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
|
|
dst[i] = tempVal + srcI3;
|
|
}
|
|
}
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i].xyz;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
|
|
|
|
// idDrawVert size is 60 bytes
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
int i;
|
|
register vector float vecConstX, vecConstY, vecConstZ;
|
|
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
v0 = vec_ld( 0, constant.ToFloatPtr() );
|
|
v1 = vec_ld( 11, constant.ToFloatPtr() );
|
|
v0 = vec_perm( v0, v1, constPerm );
|
|
|
|
// permute into constant vectors
|
|
vecConstX = vec_splat( v0, 0 );
|
|
vecConstY = vec_splat( v0, 1 );
|
|
vecConstZ = vec_splat( v0, 2 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant * src[i].xyz;
|
|
}
|
|
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < count ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
|
|
for ( ; i+3 < count; i += 4 ) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 11, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 11, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 11, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 11, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v2 = vec_perm( v2, v3, vertPerm2 );
|
|
v4 = vec_perm( v4, v5, vertPerm3 );
|
|
v6 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
// transpose into X Y Z vectors
|
|
v1 = vec_mergeh( v0, v4 );
|
|
v3 = vec_mergeh( v2, v6 );
|
|
v5 = vec_mergel( v0, v4 );
|
|
v7 = vec_mergel( v2, v6 );
|
|
|
|
vecSrcX1 = vec_mergeh( v1, v3 );
|
|
vecSrcY1 = vec_mergel( v1, v3 );
|
|
vecSrcZ1 = vec_mergeh( v5, v7 );
|
|
|
|
// now calculate dot product
|
|
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
|
|
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
|
|
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
|
|
|
|
// store results
|
|
vec_st( vecSrcZ1, 0, &dst[i] );
|
|
}
|
|
|
|
for ( ; i < count; i++ ) {
|
|
dst[i] = constant * src[i].xyz;
|
|
}
|
|
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant * src[i].xyz;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 &constant, const idDrawVert *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant * src[(X)].xyz;
|
|
|
|
// idDrawVert size is 64 bytes
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
int i;
|
|
register vector float vecConstX, vecConstY, vecConstZ;
|
|
register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
v0 = vec_ld( 0, constant.ToFloatPtr() );
|
|
v1 = vec_ld( 11, constant.ToFloatPtr() );
|
|
v0 = vec_perm( v0, v1, constPerm );
|
|
|
|
// permute into constant vectors
|
|
vecConstX = vec_splat( v0, 0 );
|
|
vecConstY = vec_splat( v0, 1 );
|
|
vecConstZ = vec_splat( v0, 2 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant * src[i].xyz;
|
|
}
|
|
|
|
for ( ; i+3 < count; i += 4 ) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
// transpose into X Y Z vectors
|
|
v1 = vec_mergeh( v0, v4 );
|
|
v3 = vec_mergeh( v2, v6 );
|
|
v5 = vec_mergel( v0, v4 );
|
|
v7 = vec_mergel( v2, v6 );
|
|
|
|
vecSrcX1 = vec_mergeh( v1, v3 );
|
|
vecSrcY1 = vec_mergel( v1, v3 );
|
|
vecSrcZ1 = vec_mergeh( v5, v7 );
|
|
|
|
// now calculate dot product
|
|
vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
|
|
vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
|
|
vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
|
|
|
|
// store results
|
|
vec_st( vecSrcZ1, 0, &dst[i] );
|
|
}
|
|
|
|
for ( ; i < count; i++ ) {
|
|
dst[i] = constant * src[i].xyz;
|
|
}
|
|
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i] + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idVec3 *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)] + constant[3];
|
|
|
|
register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
|
|
register vector float vecX, vecY, vecZ, vecX2, vecY2, vecZ2;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float vecConstX, vecConstY, vecConstZ;
|
|
register vector float vecConst3;
|
|
|
|
idVec3 constNormal = constant.Normal();
|
|
float const3 = constant[3];
|
|
|
|
// permute vectors
|
|
register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
|
|
register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
|
|
|
|
register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
|
|
register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
|
|
|
|
register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
|
|
register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);
|
|
|
|
int i;
|
|
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
vecLd1 = vec_ld( 0, constant.ToFloatPtr() );
|
|
vecLd2 = vec_ld( 15, constant.ToFloatPtr() );
|
|
vecLd1 = vec_perm( vecLd1, vecLd2, constPerm );
|
|
|
|
// populate const vec
|
|
vecConstX = vec_splat( vecLd1, 0 );
|
|
vecConstY = vec_splat( vecLd1, 1 );
|
|
vecConstZ = vec_splat( vecLd1, 2 );
|
|
|
|
// put constant to add in vector
|
|
vecConst3 = loadSplatUnalignedScalar( &const3 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant.Normal() * src[i] + constant[3];
|
|
}
|
|
|
|
const float *addr = src[i].ToFloatPtr();
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, addr ), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, addr );
|
|
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
float *vecPtr = (float*)( addr + (i*3) );
|
|
vector float v0, v1, v2, v3, v4, v5;
|
|
|
|
v0 = vecOld; //vec_ld( 0, vecPtr );
|
|
v1 = vec_ld( 15, vecPtr );
|
|
v2 = vec_ld( 31, vecPtr );
|
|
v3 = vec_ld( 47, vecPtr );
|
|
v4 = vec_ld( 63, vecPtr );
|
|
v5 = vec_ld( 79, vecPtr );
|
|
vecOld = vec_ld( 95, vecPtr );
|
|
|
|
vecLd1 = vec_perm( v0, v1, permVec );
|
|
vecLd2 = vec_perm( v1, v2, permVec );
|
|
vecLd3 = vec_perm( v2, v3, permVec );
|
|
|
|
vecLd4 = vec_perm( v3, v4, permVec );
|
|
vecLd5 = vec_perm( v4, v5, permVec );
|
|
vecLd6 = vec_perm( v5, vecOld, permVec );
|
|
|
|
// permute into X Y Z vectors
|
|
vecX = vec_perm( vecLd1, vecLd2, permX1 );
|
|
vecY = vec_perm( vecLd1, vecLd2, permY1 );
|
|
vecZ = vec_perm( vecLd1, vecLd2, permZ1 );
|
|
vecX = vec_perm( vecX, vecLd3, permX2 );
|
|
vecY = vec_perm( vecY, vecLd3, permY2 );
|
|
vecZ = vec_perm( vecZ, vecLd3, permZ2 );
|
|
|
|
vecX2 = vec_perm( vecLd4, vecLd5, permX1 );
|
|
vecY2 = vec_perm( vecLd4, vecLd5, permY1 );
|
|
vecZ2 = vec_perm( vecLd4, vecLd5, permZ1 );
|
|
vecX2 = vec_perm( vecX2, vecLd6, permX2 );
|
|
vecY2 = vec_perm( vecY2, vecLd6, permY2 );
|
|
vecZ2 = vec_perm( vecZ2, vecLd6, permZ2 );
|
|
|
|
// calculate dot product
|
|
vecX = vec_madd( vecX, vecConstX, zeroVector );
|
|
vecY = vec_madd( vecY, vecConstY, vecX );
|
|
vecZ = vec_madd( vecZ, vecConstZ, vecY );
|
|
|
|
vecX2 = vec_madd( vecX2, vecConstX, zeroVector );
|
|
vecY2 = vec_madd( vecY2, vecConstY, vecX2 );
|
|
vecZ2 = vec_madd( vecZ2, vecConstZ, vecY2 );
|
|
|
|
// add in constant[3]
|
|
vecZ = vec_add( vecZ, vecConst3 );
|
|
vecZ2 = vec_add( vecZ2, vecConst3 );
|
|
|
|
// store out results
|
|
ALIGNED_STORE2( &dst[i], vecZ, vecZ2 );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < count; i++ ) {
|
|
dst[i] = constNormal * src[i] + const3;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idPlane *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].Normal() + constant[3] * src[(X)][3];
|
|
|
|
// check plane size
|
|
assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
|
|
|
|
float constVal[4];
|
|
float srcVal[4];
|
|
|
|
int i;
|
|
const float *constPtr = constant.ToFloatPtr();
|
|
|
|
register vector float vecX, vecY, vecZ, vecI3;
|
|
register vector float vecX2, vecY2, vecZ2, vecI32;
|
|
|
|
vector float vecPlaneLd1, vecPlaneLd2, vecPlaneLd3, vecPlaneLd4;
|
|
vector float vecPlaneLd5, vecPlaneLd6, vecPlaneLd7, vecPlaneLd8;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
|
|
|
|
constVal[0] = *(constPtr);
|
|
constVal[1] = *(constPtr+1);
|
|
constVal[2] = *(constPtr+2);
|
|
constVal[3] = *(constPtr+3);
|
|
|
|
// populate const vector
|
|
vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
|
|
vector float v0 = vec_ld( 0, constant.ToFloatPtr() );
|
|
vector float v1 = vec_ld( 15, constant.ToFloatPtr() );
|
|
vector float vecConst = vec_perm( v0, v1, constPerm );
|
|
|
|
vecConstX = vec_splat( vecConst, 0 );
|
|
vecConstY = vec_splat( vecConst, 1 );
|
|
vecConstZ = vec_splat( vecConst, 2 );
|
|
vecConstI3 = vec_splat( vecConst, 3 );
|
|
|
|
// handle unaligned case at beginning
|
|
for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
}
|
|
|
|
const float *srcPtr = src[i].ToFloatPtr();
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, srcPtr );
|
|
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
float *planePtr = (float*)( srcPtr + (i*PLANE_OFFSET) );
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
v0 = vecOld; // vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
v2 = vec_ld( 31, planePtr );
|
|
v3 = vec_ld( 47, planePtr );
|
|
v4 = vec_ld( 63, planePtr );
|
|
v5 = vec_ld( 79, planePtr );
|
|
v6 = vec_ld( 95, planePtr );
|
|
v7 = vec_ld( 111, planePtr );
|
|
vecOld = vec_ld( 127, planePtr );
|
|
|
|
vecPlaneLd1 = vec_perm( v0, v1, permVec );
|
|
vecPlaneLd2 = vec_perm( v1, v2, permVec );
|
|
vecPlaneLd3 = vec_perm( v2, v3, permVec );
|
|
vecPlaneLd4 = vec_perm( v3, v4, permVec );
|
|
|
|
vecPlaneLd5 = vec_perm( v4, v5, permVec );
|
|
vecPlaneLd6 = vec_perm( v5, v6, permVec );
|
|
vecPlaneLd7 = vec_perm( v6, v7, permVec );
|
|
vecPlaneLd8 = vec_perm( v7, vecOld, permVec );
|
|
|
|
// permute into X Y Z vectors, since this is square its basically
|
|
// a matrix transpose
|
|
v0 = vec_mergeh( vecPlaneLd1, vecPlaneLd3 );
|
|
v1 = vec_mergeh( vecPlaneLd2, vecPlaneLd4 );
|
|
v2 = vec_mergel( vecPlaneLd1, vecPlaneLd3 );
|
|
v3 = vec_mergel( vecPlaneLd2, vecPlaneLd4 );
|
|
|
|
vecX = vec_mergeh( v0, v1 );
|
|
vecY = vec_mergel( v0, v1 );
|
|
vecZ = vec_mergeh( v2, v3 );
|
|
vecI3 = vec_mergel( v2, v3 );
|
|
|
|
v4 = vec_mergeh( vecPlaneLd5, vecPlaneLd7 );
|
|
v5 = vec_mergeh( vecPlaneLd6, vecPlaneLd8 );
|
|
v6 = vec_mergel( vecPlaneLd5, vecPlaneLd7 );
|
|
v7 = vec_mergel( vecPlaneLd6, vecPlaneLd8 );
|
|
|
|
vecX2 = vec_mergeh( v4, v5 );
|
|
vecY2 = vec_mergel( v4, v5 );
|
|
vecZ2 = vec_mergeh( v6, v7 );
|
|
vecI32 = vec_mergel( v6, v7 );
|
|
|
|
// do calculation
|
|
v4 = vec_madd( vecConstX, vecX, zeroVector );
|
|
v5 = vec_madd( vecConstY, vecY, v4 );
|
|
v6 = vec_madd( vecConstZ, vecZ, v5 );
|
|
v7 = vec_madd( vecConstI3, vecI3, v6 );
|
|
|
|
v0 = vec_madd( vecConstX, vecX2, zeroVector );
|
|
v1 = vec_madd( vecConstY, vecY2, v0 );
|
|
v2 = vec_madd( vecConstZ, vecZ2, v1 );
|
|
v3 = vec_madd( vecConstI3, vecI32, v2 );
|
|
|
|
//store result
|
|
ALIGNED_STORE2( &dst[i], v7, v3 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
//dst[i] = constant.Normal() * src[i].Normal() + constant[3] * src[i][3];
|
|
srcVal[0] = *(srcPtr + (i*PLANE_OFFSET) + 0 );
|
|
srcVal[1] = *(srcPtr + (i*PLANE_OFFSET) + 1 );
|
|
srcVal[2] = *(srcPtr + (i*PLANE_OFFSET) + 2 );
|
|
srcVal[3] = *(srcPtr + (i*PLANE_OFFSET) + 3 );
|
|
dst[i] = srcVal[0] * constVal[0] + srcVal[1] * constVal[1] + srcVal[2] * constVal[2] + constVal[3] * srcVal[3];
|
|
}
|
|
}
|
|
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
	// Computes dst[i] = constant.Normal() * src[i].xyz + constant[3] for i in [0, count).
	// This build is for the unpadded vertex layout: DRAWVERT_OFFSET floats per idDrawVert.
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );

	const float *planePtr = constant.ToFloatPtr();
	const float *vertBase = src[0].xyz.ToFloatPtr();

	vector float zeroVec = (vector float)(0.0);
	vector unsigned char byteOne = (vector unsigned char)(1);
	vector unsigned char permA, permB, permC, permD;
	int i = 0;

	// scalar copies of the four plane coefficients, consumed by the tail loop
	float planeVal[4];
	planeVal[0] = planePtr[0];
	planeVal[1] = planePtr[1];
	planeVal[2] = planePtr[2];
	planeVal[3] = planePtr[3];

	// unaligned load of the plane, then splat every coefficient across its own vector
	vector unsigned char planePerm = vec_lvsl( 0, planePtr );
	vector float planeLo = vec_ld( 0, planePtr );
	vector float planeHi = vec_ld( 15, planePtr );
	vector float planeVec = vec_perm( planeLo, planeHi, planePerm );

	vector float planeX = vec_splat( planeVec, 0 );
	vector float planeY = vec_splat( planeVec, 1 );
	vector float planeZ = vec_splat( planeVec, 2 );
	vector float planeD = vec_splat( planeVec, 3 );

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] = constant.Normal() * src[i].xyz + constant[3];
		i++;
	}

	// The permute masks are built once for the next four vertices and reused on every pass;
	// only built when at least one full group of four remains.
	if ( i+3 < count ) {
		permA = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), byteOne );
		permB = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), byteOne );
		permC = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), byteOne );
		permD = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), byteOne );
	}

	// four vertices per pass
	for ( ; i+3 < count; i += 4 ) {
		const float *pA = src[i].xyz.ToFloatPtr();
		const float *pB = src[i+1].xyz.ToFloatPtr();
		const float *pC = src[i+2].xyz.ToFloatPtr();
		const float *pD = src[i+3].xyz.ToFloatPtr();

		// offsets 0 and 11 cover the first and last byte of the 12 byte xyz triple
		vector float rawA0 = vec_ld( 0, pA );
		vector float rawA1 = vec_ld( 11, pA );
		vector float rawB0 = vec_ld( 0, pB );
		vector float rawB1 = vec_ld( 11, pB );
		vector float rawC0 = vec_ld( 0, pC );
		vector float rawC1 = vec_ld( 11, pC );
		vector float rawD0 = vec_ld( 0, pD );
		vector float rawD1 = vec_ld( 11, pD );

		vector float vertA = vec_perm( rawA0, rawA1, permA );
		vector float vertB = vec_perm( rawB0, rawB1, permB );
		vector float vertC = vec_perm( rawC0, rawC1, permC );
		vector float vertD = vec_perm( rawD0, rawD1, permD );

		// transpose the four xyz rows into one X, one Y and one Z vector
		vector float hiAC = vec_mergeh( vertA, vertC );
		vector float hiBD = vec_mergeh( vertB, vertD );
		vector float loAC = vec_mergel( vertA, vertC );
		vector float loBD = vec_mergel( vertB, vertD );

		vector float allX = vec_mergeh( hiAC, hiBD );
		vector float allY = vec_mergel( hiAC, hiBD );
		vector float allZ = vec_mergeh( loAC, loBD );

		// x*cx, then + y*cy, then + z*cz, finally + c[3]
		vector float acc = vec_madd( allX, planeX, zeroVec );
		acc = vec_madd( allY, planeY, acc );
		acc = vec_madd( allZ, planeZ, acc );
		acc = vec_add( acc, planeD );

		// dst[i] is aligned here, so a plain aligned store is sufficient
		vec_st( acc, 0, &dst[i] );
	}

	// scalar tail for whatever did not fill a group of four
	for ( ; i < count; i++ ) {
		const float *vert = vertBase + ( i * DRAWVERT_OFFSET );
		const float vx = vert[0];
		const float vy = vert[1];
		const float vz = vert[2];

		dst[i] = planeVal[0] * vx + planeVal[1] * vy + planeVal[2] * vz;
		dst[i] += planeVal[3];
	}
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = constant.Normal() * src[i].xyz + constant[3];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idPlane &constant, const idDrawVert *src, const int count ) {
//#define OPER(X) dst[(X)] = constant.Normal() * src[(X)].xyz + constant[3];

	// Computes dst[i] = constant.Normal() * src[i].xyz + constant[3] for i in [0, count).
	// DRAWVERT_PADDED build: idDrawVert is DRAWVERT_OFFSET (16) floats, i.e. 64 bytes.
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof( float ) );

	int i;
	const float *constPtr = constant.ToFloatPtr();
	const float *srcPtr = src[0].xyz.ToFloatPtr();

	register vector float v0, v1, v2, v3, v4, v5, v6, v7;
	register vector float vecConstX, vecConstY, vecConstZ, vecConstI3;
	register vector float vecSrcX1, vecSrcY1, vecSrcZ1;
	register vector float vecDest1;
	register vector float zeroVector = (vector float)(0.0);

	// scalar copies of the plane coefficients for the cleanup loop
	float constVal[4];
	float srcVal[3];

	constVal[0] = *(constPtr+0);
	constVal[1] = *(constPtr+1);
	constVal[2] = *(constPtr+2);
	constVal[3] = *(constPtr+3);

	// populate const vec: unaligned load of the plane, then splat each coefficient
	vector unsigned char constPerm = vec_lvsl( 0, constant.ToFloatPtr() );
	v0 = vec_ld( 0, constant.ToFloatPtr() );
	v1 = vec_ld( 15, constant.ToFloatPtr() );
	v0 = vec_perm( v0, v1, constPerm );

	vecConstX = vec_splat( v0, 0 );
	vecConstY = vec_splat( v0, 1 );
	vecConstZ = vec_splat( v0, 2 );
	vecConstI3 = vec_splat( v0, 3 );

	// handle unaligned case at beginning
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = constant.Normal() * src[i].xyz + constant[3];
	}

	// four vertices per pass
	for ( ; i+3 < count; i+=4 ) {
		const float *vertPtr = src[i].xyz.ToFloatPtr();
		const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
		const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
		const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();

		// Single aligned load per vertex, no permute.
		// NOTE(review): this relies on every src[i].xyz being 16 byte aligned in the padded
		// layout -- confirm against the idDrawVert allocation.
		v0 = vec_ld( 0, vertPtr );
		v2 = vec_ld( 0, vertPtr2 );
		v4 = vec_ld( 0, vertPtr3 );
		v6 = vec_ld( 0, vertPtr4 );

		// transpose into X Y Z vectors
		v1 = vec_mergeh( v0, v4 );
		v3 = vec_mergeh( v2, v6 );
		v5 = vec_mergel( v0, v4 );
		v7 = vec_mergel( v2, v6 );

		vecSrcX1 = vec_mergeh( v1, v3 );
		vecSrcY1 = vec_mergel( v1, v3 );
		vecSrcZ1 = vec_mergeh( v5, v7 );

		// now calculate dot product
		vecSrcX1 = vec_madd( vecSrcX1, vecConstX, zeroVector );
		vecSrcY1 = vec_madd( vecSrcY1, vecConstY, vecSrcX1 );
		vecSrcZ1 = vec_madd( vecSrcZ1, vecConstZ, vecSrcY1 );
		vecDest1 = vec_add( vecSrcZ1, vecConstI3 );

		// store results; dst[i] is 16 byte aligned at this point
		vec_st( vecDest1, 0, &dst[i] );
	}

	// cleanup
	for ( ; i < count; i++ ) {
		srcVal[0] = *(srcPtr + (i*DRAWVERT_OFFSET) + 0 );
		srcVal[1] = *(srcPtr + (i*DRAWVERT_OFFSET) + 1 );
		srcVal[2] = *(srcPtr + (i*DRAWVERT_OFFSET) + 2 );
		// dst[i] = constant.Normal() * src[i].xyz + constant[3];

		dst[i] = constVal[0] * srcVal[0] + constVal[1] * srcVal[1] + constVal[2] * srcVal[2];
		dst[i] += constVal[3];
	}
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dst[i] = src0[i] * src1[i];
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float *dst, const idVec3 *src0, const idVec3 *src1, const int count ) {
//#define OPER(X) dst[(X)] = src0[(X)] * src1[(X)];

	// Computes dst[i] = src0[i] * src1[i] (3-component dot product) for i in [0, count).
	// Layout: scalar prologue until dst is 16 byte aligned, eight elements per vector pass,
	// scalar cleanup for the remainder. The index i is carried through all three phases.

	int i;
	float src0Val[3];
	float src1Val[3];

	register vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
	vector float vecLd7, vecLd8, vecLd9, vecLd10, vecLd11, vecLd12;
	register vector float vecX0, vecY0, vecZ0, vecX1, vecY1, vecZ1;
	register vector float vecX02, vecY02, vecZ02, vecX12, vecY12, vecZ12;
	register vector float zeroVector = (vector float)(0.0);
	// permute vectors: gather the x, y and z components of four packed idVec3s (three quadwords)
	register vector unsigned char permX1 = (vector unsigned char)(0,1,2,3,12,13,14,15,24,25,26,27,28,29,30,31); //last 4 bytes are junk
	register vector unsigned char permX2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23);
	register vector unsigned char permY1 = (vector unsigned char)(4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); //last 4 bytes are junk
	register vector unsigned char permY2 = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27);
	register vector unsigned char permZ1 = (vector unsigned char)(8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); //last 8 bytes are junk
	register vector unsigned char permZ2 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31);

	// handle unaligned case at beginning
	for ( i = 0 ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
		dst[i] = src0[i] * src1[i];
	}

	// Base pointers stay anchored at element 0 so that both the vector loop and the
	// cleanup loop can address the sources with the same absolute index used for dst.
	const float *src0Ptr = src0[0].ToFloatPtr();
	const float *src1Ptr = src1[0].ToFloatPtr();

	// Only prime the stream when at least one full group of eight remains, so nothing is
	// loaded from the sources unless a vector pass will actually consume it.
	if ( i+7 < count ) {
		// Each pass advances 8 * 12 = 96 bytes, a multiple of 16, so the misalignment seen
		// at element i (and therefore the permute mask) is the same on every pass.
		vector unsigned char permVec1 = vec_add( vec_lvsl( -1, src0Ptr + (i*3) ), (vector unsigned char)(1) );
		vector unsigned char permVec2 = vec_add( vec_lvsl( -1, src1Ptr + (i*3) ), (vector unsigned char)(1) );
		vector float vecOld0 = vec_ld( 0, src0Ptr + (i*3) );
		vector float vecOld1 = vec_ld( 0, src1Ptr + (i*3) );

		// continue from the prologue's i: dst[i] is 16 byte aligned from here on
		for ( ; i+7 < count; i += 8 ) {
			float *s0Ptr = (float*)( src0Ptr + (i*3) );
			float *s1Ptr = (float*)( src1Ptr + (i*3) );

			// the last quadword of one pass is reused as the first quadword of the next
			vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
			v0 = vecOld0;
			v1 = vec_ld( 15, s0Ptr );
			v2 = vec_ld( 31, s0Ptr );
			v3 = vec_ld( 47, s0Ptr );
			v4 = vec_ld( 63, s0Ptr );
			v5 = vec_ld( 79, s0Ptr );
			vecOld0 = vec_ld( 95, s0Ptr );

			v6 = vecOld1;
			v7 = vec_ld( 15, s1Ptr );
			v8 = vec_ld( 31, s1Ptr );
			v9 = vec_ld( 47, s1Ptr );
			v10 = vec_ld( 63, s1Ptr );
			v11 = vec_ld( 79, s1Ptr );
			vecOld1 = vec_ld( 95, s1Ptr );

			// realign the six quadwords of each source
			vecLd1 = vec_perm( v0, v1, permVec1 );
			vecLd2 = vec_perm( v1, v2, permVec1 );
			vecLd3 = vec_perm( v2, v3, permVec1 );
			vecLd4 = vec_perm( v3, v4, permVec1 );
			vecLd5 = vec_perm( v4, v5, permVec1 );
			vecLd6 = vec_perm( v5, vecOld0, permVec1 );

			vecLd7 = vec_perm( v6, v7, permVec2 );
			vecLd8 = vec_perm( v7, v8, permVec2 );
			vecLd9 = vec_perm( v8, v9, permVec2 );
			vecLd10 = vec_perm( v9, v10, permVec2 );
			vecLd11 = vec_perm( v10, v11, permVec2 );
			vecLd12 = vec_perm( v11, vecOld1, permVec2 );

			// permute into X Y Z vectors
			vecX0 = vec_perm( vecLd1, vecLd2, permX1 );
			vecY0 = vec_perm( vecLd1, vecLd2, permY1 );
			vecZ0 = vec_perm( vecLd1, vecLd2, permZ1 );
			vecX0 = vec_perm( vecX0, vecLd3, permX2 );
			vecY0 = vec_perm( vecY0, vecLd3, permY2 );
			vecZ0 = vec_perm( vecZ0, vecLd3, permZ2 );

			vecX02 = vec_perm( vecLd4, vecLd5, permX1 );
			vecY02 = vec_perm( vecLd4, vecLd5, permY1 );
			vecZ02 = vec_perm( vecLd4, vecLd5, permZ1 );
			vecX02 = vec_perm( vecX02, vecLd6, permX2 );
			vecY02 = vec_perm( vecY02, vecLd6, permY2 );
			vecZ02 = vec_perm( vecZ02, vecLd6, permZ2 );

			vecX1 = vec_perm( vecLd7, vecLd8, permX1 );
			vecY1 = vec_perm( vecLd7, vecLd8, permY1 );
			vecZ1 = vec_perm( vecLd7, vecLd8, permZ1 );
			vecX1 = vec_perm( vecX1, vecLd9, permX2 );
			vecY1 = vec_perm( vecY1, vecLd9, permY2 );
			vecZ1 = vec_perm( vecZ1, vecLd9, permZ2 );

			vecX12 = vec_perm( vecLd10, vecLd11, permX1 );
			vecY12 = vec_perm( vecLd10, vecLd11, permY1 );
			vecZ12 = vec_perm( vecLd10, vecLd11, permZ1 );
			vecX12 = vec_perm( vecX12, vecLd12, permX2 );
			vecY12 = vec_perm( vecY12, vecLd12, permY2 );
			vecZ12 = vec_perm( vecZ12, vecLd12, permZ2 );

			// do multiply: x*x, then + y*y, then + z*z
			vecX0 = vec_madd( vecX0, vecX1, zeroVector );
			vecY0 = vec_madd( vecY0, vecY1, vecX0 );
			vecZ0 = vec_madd( vecZ0, vecZ1, vecY0 );
			vecX02 = vec_madd( vecX02, vecX12, zeroVector );
			vecY02 = vec_madd( vecY02, vecY12, vecX02 );
			vecZ02 = vec_madd( vecZ02, vecZ12, vecY02 );

			// store out results
			ALIGNED_STORE2( &dst[i], vecZ0, vecZ02 );
		}
	}

	// cleanup
	for ( ; i < count; i++ ) {
		// dst[i] = src0[i] * src1[i];
		src0Val[0] = *( src0Ptr + (i*3) + 0 );
		src0Val[1] = *( src0Ptr + (i*3) + 1 );
		src0Val[2] = *( src0Ptr + (i*3) + 2 );

		src1Val[0] = *( src1Ptr + (i*3) + 0 );
		src1Val[1] = *( src1Ptr + (i*3) + 1 );
		src1Val[2] = *( src1Ptr + (i*3) + 2 );

		dst[i] = src0Val[0] * src1Val[0] + src0Val[1] * src1Val[1] + src0Val[2] * src1Val[2];
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Dot
|
|
|
|
dot = src1[0] * src2[0] + src1[1] * src2[1] + src1[2] * src2[2] + ...
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Dot( float &dot, const float *src1, const float *src2, const int count ) {
	// Accumulates dot = src1[0] * src2[0] + src1[1] * src2[1] + ... over count elements.
	int i = 0;

	dot = 0.0f;

	// the vector path only engages when at least one full group of eight floats exists
	if ( count >= 8 ) {
		vector unsigned char byteOne = (vector unsigned char)(1);
		vector float sumA = (vector float)(0.0);
		vector float sumB = (vector float)(0.0);

		// permute masks for the two (possibly misaligned) streams and their first quadwords
		vector unsigned char perm1 = vec_add( vec_lvsl( -1, (int*) &src1[i] ), byteOne );
		vector unsigned char perm2 = vec_add( vec_lvsl( -1, (int*) &src2[i] ), byteOne );
		vector float carry1 = vec_ld( 0, &src1[i] );
		vector float carry2 = vec_ld( 0, &src2[i] );

		for ( ; i+7 < count; i += 8 ) {
			// the quadword loaded last on the previous pass opens this one
			vector float a0 = carry1;
			vector float a1 = vec_ld( 15, &src1[i] );
			vector float a2 = vec_ld( 31, &src1[i] );

			vector float b0 = carry2;
			vector float b1 = vec_ld( 15, &src2[i] );
			vector float b2 = vec_ld( 31, &src2[i] );

			carry1 = a2;
			carry2 = b2;

			// realign, multiply and add into two independent running sums
			sumA = vec_madd( vec_perm( a0, a1, perm1 ), vec_perm( b0, b1, perm2 ), sumA );
			sumB = vec_madd( vec_perm( a1, a2, perm1 ), vec_perm( b1, b2, perm2 ), sumB );
		}

		// fold the two accumulators, then sum across the four lanes
		vector float total = vec_add( sumA, sumB );
		vector float halves = vec_add( total, vec_sld( total, total, 8 ) );
		vector float whole = vec_add( halves, vec_sld( halves, halves, 4 ) );

		// every lane holds the same value after the splat, so the element store can pick any of them
		vec_ste( vec_splat( whole, 0 ), 0, &dot );
	}

	// Scalar remainder. Per the original authors' profiling note, most calls have small
	// counts and spend their time here, which is already fast enough in scalar form.
	for ( ; i < count; i++ ) {
		dot += src1[i] * src2[i];
	}
}
|
|
#endif /* ENABLE_DOT */
|
|
|
|
#ifdef ENABLE_COMPARES
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGT
|
|
|
|
dst[i] = src0[i] > constant;
|
|
============
|
|
*/
|
|
|
|
void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const float *src0, const float constant, const int count ) {
	// Writes dst[i] = src0[i] > constant as a 0/1 byte for i in [0, count).
	vector unsigned char byteOne = (vector unsigned char)(1);
	int i = 0;

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] = src0[i] > constant;
		i++;
	}

	// comparison value replicated into every lane
	vector float threshold = loadSplatUnalignedScalar( &constant );

	// permute mask for the (possibly misaligned) source plus its first quadword
	vector unsigned char srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[i] ), byteOne );
	vector float carry = vec_ld( 0, &src0[i] );

	// 16 floats in, 16 result bytes out per pass
	for ( ; i+15 < count; i += 16 ) {
		// the quadword loaded last on the previous pass opens this one
		vector float q0 = carry;
		vector float q1 = vec_ld( 15, &src0[i] );
		vector float q2 = vec_ld( 31, &src0[i] );
		vector float q3 = vec_ld( 47, &src0[i] );
		vector float q4 = vec_ld( 63, &src0[i] );
		carry = q4;

		// realign and compare
		vector bool int mask0 = vec_cmpgt( vec_perm( q0, q1, srcPerm ), threshold );
		vector bool int mask1 = vec_cmpgt( vec_perm( q1, q2, srcPerm ), threshold );
		vector bool int mask2 = vec_cmpgt( vec_perm( q2, q3, srcPerm ), threshold );
		vector bool int mask3 = vec_cmpgt( vec_perm( q3, q4, srcPerm ), threshold );

		// narrow 4 x 32 bit masks -> 2 x 16 bit -> 1 x 8 bit
		vector bool short half0 = vec_pack( mask0, mask1 );
		vector bool short half1 = vec_pack( mask2, mask3 );
		vector bool char byteMask = vec_pack( half0, half1 );

		// reduce "all bits set" to the value 1
		vector unsigned char flags = vec_and( byteMask, byteOne );

		vec_st( flags, 0, &dst[i] );
	}

	// scalar tail
	for ( ; i < count; i++ ) {
		dst[i] = src0[i] > constant;
	}
}
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGT
|
|
|
|
dst[i] |= ( src0[i] > constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpGT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// ORs ( src0[i] > constant ) << bitNum into dst[i] for i in [0, count).
	vector unsigned char byteOne = (vector unsigned char)(1);
	int i = 0;

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] |= ( src0[i] > constant ) << bitNum;
		i++;
	}

	// comparison value replicated into every lane
	vector float threshold = loadSplatUnalignedScalar( &constant );

	// bitNum lives at an arbitrary byte address: load the quadword that contains it,
	// rotate it to byte 0 and replicate it into all 16 bytes
	vector unsigned char shiftPerm = vec_lvsl( 0, &bitNum );
	vector unsigned char shiftQuad = vec_ld( 0, &bitNum );
	vector unsigned char shiftVec = vec_splat( vec_perm( shiftQuad, shiftQuad, shiftPerm ), 0 );

	// permute mask for the (possibly misaligned) source plus its first quadword
	vector unsigned char srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[i] ), byteOne );
	vector float carry = vec_ld( 0, &src0[i] );

	// 16 floats in, 16 destination bytes updated per pass
	for ( ; i+15 < count; i += 16 ) {
		// the quadword loaded last on the previous pass opens this one
		vector float q0 = carry;
		vector float q1 = vec_ld( 15, &src0[i] );
		vector float q2 = vec_ld( 31, &src0[i] );
		vector float q3 = vec_ld( 47, &src0[i] );
		vector float q4 = vec_ld( 63, &src0[i] );
		carry = q4;

		// current destination bytes
		vector unsigned char current = vec_ld( 0, &dst[i] );

		// realign and compare
		vector bool int mask0 = vec_cmpgt( vec_perm( q0, q1, srcPerm ), threshold );
		vector bool int mask1 = vec_cmpgt( vec_perm( q1, q2, srcPerm ), threshold );
		vector bool int mask2 = vec_cmpgt( vec_perm( q2, q3, srcPerm ), threshold );
		vector bool int mask3 = vec_cmpgt( vec_perm( q3, q4, srcPerm ), threshold );

		// narrow 4 x 32 bit masks -> 2 x 16 bit -> 1 x 8 bit
		vector bool short half0 = vec_pack( mask0, mask1 );
		vector bool short half1 = vec_pack( mask2, mask3 );
		vector bool char byteMask = vec_pack( half0, half1 );

		// reduce "all bits set" to 1, then move that bit to position bitNum
		vector unsigned char flags = vec_and( byteMask, byteOne );
		flags = vec_sl( flags, shiftVec );

		// merge with what was already in dst
		vec_st( vec_or( current, flags ), 0, &dst[i] );
	}

	// scalar tail
	for ( ; i < count; i++ ) {
		dst[i] |= ( src0[i] > constant ) << bitNum;
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGE
|
|
|
|
dst[i] = src0[i] >= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const float *src0, const float constant, const int count ) {
	// Writes dst[i] = src0[i] >= constant as a 0/1 byte for i in [0, count).
	vector unsigned char byteOne = (vector unsigned char)(1);
	int i = 0;

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] = src0[i] >= constant;
		i++;
	}

	// comparison value replicated into every lane
	vector float threshold = loadSplatUnalignedScalar( &constant );

	// permute mask for the (possibly misaligned) source plus its first quadword
	vector unsigned char srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[i] ), byteOne );
	vector float carry = vec_ld( 0, &src0[i] );

	// 16 floats in, 16 result bytes out per pass
	for ( ; i+15 < count; i += 16 ) {
		// the quadword loaded last on the previous pass opens this one
		vector float q0 = carry;
		vector float q1 = vec_ld( 15, &src0[i] );
		vector float q2 = vec_ld( 31, &src0[i] );
		vector float q3 = vec_ld( 47, &src0[i] );
		vector float q4 = vec_ld( 63, &src0[i] );
		carry = q4;

		// realign and compare
		vector bool int mask0 = vec_cmpge( vec_perm( q0, q1, srcPerm ), threshold );
		vector bool int mask1 = vec_cmpge( vec_perm( q1, q2, srcPerm ), threshold );
		vector bool int mask2 = vec_cmpge( vec_perm( q2, q3, srcPerm ), threshold );
		vector bool int mask3 = vec_cmpge( vec_perm( q3, q4, srcPerm ), threshold );

		// narrow 4 x 32 bit masks -> 2 x 16 bit -> 1 x 8 bit
		vector bool short half0 = vec_pack( mask0, mask1 );
		vector bool short half1 = vec_pack( mask2, mask3 );
		vector bool char byteMask = vec_pack( half0, half1 );

		// reduce "all bits set" to the value 1
		vector unsigned char flags = vec_and( byteMask, byteOne );

		vec_st( flags, 0, &dst[i] );
	}

	// scalar tail
	for ( ; i < count; i++ ) {
		dst[i] = src0[i] >= constant;
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpGE
|
|
|
|
dst[i] |= ( src0[i] >= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpGE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// ORs ( src0[i] >= constant ) << bitNum into dst[i] for i in [0, count).
	vector unsigned char byteOne = (vector unsigned char)(1);
	int i = 0;

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] |= ( src0[i] >= constant ) << bitNum;
		i++;
	}

	// comparison value replicated into every lane
	vector float threshold = loadSplatUnalignedScalar( &constant );

	// bitNum lives at an arbitrary byte address: load the quadword that contains it,
	// rotate it to byte 0 and replicate it into all 16 bytes
	vector unsigned char shiftPerm = vec_lvsl( 0, &bitNum );
	vector unsigned char shiftQuad = vec_ld( 0, &bitNum );
	vector unsigned char shiftVec = vec_splat( vec_perm( shiftQuad, shiftQuad, shiftPerm ), 0 );

	// permute mask for the (possibly misaligned) source plus its first quadword
	vector unsigned char srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[i] ), byteOne );
	vector float carry = vec_ld( 0, &src0[i] );

	// 16 floats in, 16 destination bytes updated per pass
	for ( ; i+15 < count; i += 16 ) {
		// the quadword loaded last on the previous pass opens this one
		vector float q0 = carry;
		vector float q1 = vec_ld( 15, &src0[i] );
		vector float q2 = vec_ld( 31, &src0[i] );
		vector float q3 = vec_ld( 47, &src0[i] );
		vector float q4 = vec_ld( 63, &src0[i] );
		carry = q4;

		// current destination bytes
		vector unsigned char current = vec_ld( 0, &dst[i] );

		// realign and compare
		vector bool int mask0 = vec_cmpge( vec_perm( q0, q1, srcPerm ), threshold );
		vector bool int mask1 = vec_cmpge( vec_perm( q1, q2, srcPerm ), threshold );
		vector bool int mask2 = vec_cmpge( vec_perm( q2, q3, srcPerm ), threshold );
		vector bool int mask3 = vec_cmpge( vec_perm( q3, q4, srcPerm ), threshold );

		// narrow 4 x 32 bit masks -> 2 x 16 bit -> 1 x 8 bit
		vector bool short half0 = vec_pack( mask0, mask1 );
		vector bool short half1 = vec_pack( mask2, mask3 );
		vector bool char byteMask = vec_pack( half0, half1 );

		// reduce "all bits set" to 1, then move that bit to position bitNum
		vector unsigned char flags = vec_and( byteMask, byteOne );
		flags = vec_sl( flags, shiftVec );

		// merge with what was already in dst
		vec_st( vec_or( current, flags ), 0, &dst[i] );
	}

	// scalar tail
	for ( ; i < count; i++ ) {
		dst[i] |= ( src0[i] >= constant ) << bitNum;
	}
}
|
|
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLT
|
|
|
|
dst[i] = src0[i] < constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const float *src0, const float constant, const int count ) {
	// Writes dst[i] = src0[i] < constant as a 0/1 byte for i in [0, count).
	vector unsigned char byteOne = (vector unsigned char)(1);
	int i = 0;

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] = src0[i] < constant;
		i++;
	}

	// comparison value replicated into every lane
	vector float threshold = loadSplatUnalignedScalar( &constant );

	// permute mask for the (possibly misaligned) source plus its first quadword
	vector unsigned char srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[i] ), byteOne );
	vector float carry = vec_ld( 0, &src0[i] );

	// 16 floats in, 16 result bytes out per pass
	for ( ; i+15 < count; i += 16 ) {
		// the quadword loaded last on the previous pass opens this one
		vector float q0 = carry;
		vector float q1 = vec_ld( 15, &src0[i] );
		vector float q2 = vec_ld( 31, &src0[i] );
		vector float q3 = vec_ld( 47, &src0[i] );
		vector float q4 = vec_ld( 63, &src0[i] );
		carry = q4;

		// realign and compare
		vector bool int mask0 = vec_cmplt( vec_perm( q0, q1, srcPerm ), threshold );
		vector bool int mask1 = vec_cmplt( vec_perm( q1, q2, srcPerm ), threshold );
		vector bool int mask2 = vec_cmplt( vec_perm( q2, q3, srcPerm ), threshold );
		vector bool int mask3 = vec_cmplt( vec_perm( q3, q4, srcPerm ), threshold );

		// narrow 4 x 32 bit masks -> 2 x 16 bit -> 1 x 8 bit
		vector bool short half0 = vec_pack( mask0, mask1 );
		vector bool short half1 = vec_pack( mask2, mask3 );
		vector bool char byteMask = vec_pack( half0, half1 );

		// reduce "all bits set" to the value 1
		vector unsigned char flags = vec_and( byteMask, byteOne );

		vec_st( flags, 0, &dst[i] );
	}

	// scalar tail
	for ( ; i < count; i++ ) {
		dst[i] = src0[i] < constant;
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLT
|
|
|
|
dst[i] |= ( src0[i] < constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	// ORs ( src0[i] < constant ) << bitNum into dst[i] for i in [0, count).
	vector unsigned char byteOne = (vector unsigned char)(1);
	int i = 0;

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] |= ( src0[i] < constant ) << bitNum;
		i++;
	}

	// comparison value replicated into every lane
	vector float threshold = loadSplatUnalignedScalar( &constant );

	// bitNum lives at an arbitrary byte address: load the quadword that contains it,
	// rotate it to byte 0 and replicate it into all 16 bytes
	vector unsigned char shiftPerm = vec_lvsl( 0, &bitNum );
	vector unsigned char shiftQuad = vec_ld( 0, &bitNum );
	vector unsigned char shiftVec = vec_splat( vec_perm( shiftQuad, shiftQuad, shiftPerm ), 0 );

	// permute mask for the (possibly misaligned) source plus its first quadword
	vector unsigned char srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[i] ), byteOne );
	vector float carry = vec_ld( 0, &src0[i] );

	// 16 floats in, 16 destination bytes updated per pass
	for ( ; i+15 < count; i += 16 ) {
		// the quadword loaded last on the previous pass opens this one
		vector float q0 = carry;
		vector float q1 = vec_ld( 15, &src0[i] );
		vector float q2 = vec_ld( 31, &src0[i] );
		vector float q3 = vec_ld( 47, &src0[i] );
		vector float q4 = vec_ld( 63, &src0[i] );
		carry = q4;

		// current destination bytes
		vector unsigned char current = vec_ld( 0, &dst[i] );

		// realign and compare
		vector bool int mask0 = vec_cmplt( vec_perm( q0, q1, srcPerm ), threshold );
		vector bool int mask1 = vec_cmplt( vec_perm( q1, q2, srcPerm ), threshold );
		vector bool int mask2 = vec_cmplt( vec_perm( q2, q3, srcPerm ), threshold );
		vector bool int mask3 = vec_cmplt( vec_perm( q3, q4, srcPerm ), threshold );

		// narrow 4 x 32 bit masks -> 2 x 16 bit -> 1 x 8 bit
		vector bool short half0 = vec_pack( mask0, mask1 );
		vector bool short half1 = vec_pack( mask2, mask3 );
		vector bool char byteMask = vec_pack( half0, half1 );

		// reduce "all bits set" to 1, then move that bit to position bitNum
		vector unsigned char flags = vec_and( byteMask, byteOne );
		flags = vec_sl( flags, shiftVec );

		// merge with what was already in dst
		vec_st( vec_or( current, flags ), 0, &dst[i] );
	}

	// scalar tail
	for ( ; i < count; i++ ) {
		dst[i] |= ( src0[i] < constant ) << bitNum;
	}
}
|
|
//#endif
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLE
|
|
|
|
dst[i] = src0[i] <= constant;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const float *src0, const float constant, const int count ) {
	// Writes dst[i] = src0[i] <= constant as a 0/1 byte for i in [0, count).
	vector unsigned char byteOne = (vector unsigned char)(1);
	int i = 0;

	// scalar head: step forward until dst[i] is 16 byte aligned or the input is exhausted
	while ( i < count && NOT_16BYTE_ALIGNED( dst[i] ) ) {
		dst[i] = src0[i] <= constant;
		i++;
	}

	// comparison value replicated into every lane
	vector float threshold = loadSplatUnalignedScalar( &constant );

	// permute mask for the (possibly misaligned) source plus its first quadword
	vector unsigned char srcPerm = vec_add( vec_lvsl( -1, (int*) &src0[i] ), byteOne );
	vector float carry = vec_ld( 0, &src0[i] );

	// 16 floats in, 16 result bytes out per pass
	for ( ; i+15 < count; i += 16 ) {
		// the quadword loaded last on the previous pass opens this one
		vector float q0 = carry;
		vector float q1 = vec_ld( 15, &src0[i] );
		vector float q2 = vec_ld( 31, &src0[i] );
		vector float q3 = vec_ld( 47, &src0[i] );
		vector float q4 = vec_ld( 63, &src0[i] );
		carry = q4;

		// realign and compare
		vector bool int mask0 = vec_cmple( vec_perm( q0, q1, srcPerm ), threshold );
		vector bool int mask1 = vec_cmple( vec_perm( q1, q2, srcPerm ), threshold );
		vector bool int mask2 = vec_cmple( vec_perm( q2, q3, srcPerm ), threshold );
		vector bool int mask3 = vec_cmple( vec_perm( q3, q4, srcPerm ), threshold );

		// narrow 4 x 32 bit masks -> 2 x 16 bit -> 1 x 8 bit
		vector bool short half0 = vec_pack( mask0, mask1 );
		vector bool short half1 = vec_pack( mask2, mask3 );
		vector bool char byteMask = vec_pack( half0, half1 );

		// reduce "all bits set" to the value 1
		vector unsigned char flags = vec_and( byteMask, byteOne );

		vec_st( flags, 0, &dst[i] );
	}

	// scalar tail
	for ( ; i < count; i++ ) {
		dst[i] = src0[i] <= constant;
	}
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CmpLE
|
|
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::CmpLE( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] |= ( src0[(X)] <= constant ) << bitNum;
|
|
register vector bool int vtbi0, vtbi1, vtbi2, vtbi3;
|
|
register vector bool short vtbs0, vtbs1;
|
|
register vector bool char vtbc0;
|
|
register vector unsigned char vtuc0;
|
|
register vector unsigned char permVec, permVec2;
|
|
|
|
// dest vectors
|
|
register vector unsigned char vd;
|
|
// bitNum vectors
|
|
register vector unsigned char bitNumVec;
|
|
// src0 vectors
|
|
register vector float vs0, vs1, vs2, vs3;
|
|
register vector float vs0_low, vs0_hi, vs1_low, vs1_hi, vs2_low, vs2_hi, vs3_low, vs3_hi;
|
|
// constant vector
|
|
register vector float constVec;
|
|
// all one's
|
|
register vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count );i++ ) {
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//bitNum is unaligned.
|
|
permVec2 = vec_lvsl( 0, &bitNum );
|
|
vtuc0 = vec_ld( 0, &bitNum );
|
|
bitNumVec = vec_perm( vtuc0, vtuc0, permVec2 );
|
|
bitNumVec = vec_splat( bitNumVec, 0 );
|
|
|
|
//calculate permute and do loads
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src0[i] ), oneVector );
|
|
vs3_hi = vec_ld( 0, &src0[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < count; i += 16 ) {
|
|
//load sources (floats)
|
|
vs0_low = vs3_hi;
|
|
vs0_hi = vec_ld( 15, &src0[i] );
|
|
vs1_low = vs0_hi;
|
|
vs1_hi = vec_ld( 31, &src0[i] );
|
|
vs2_low = vs1_hi;
|
|
vs2_hi = vec_ld( 47, &src0[i] );
|
|
vs3_low = vs2_hi;
|
|
vs3_hi = vec_ld( 63, &src0[i] );
|
|
|
|
//permute into the vectors we want
|
|
vs0 = vec_perm( vs0_low, vs0_hi, permVec );
|
|
vs1 = vec_perm( vs1_low, vs1_hi, permVec );
|
|
vs2 = vec_perm( vs2_low, vs2_hi, permVec );
|
|
vs3 = vec_perm( vs3_low, vs3_hi, permVec );
|
|
|
|
//load dest (bytes) as unsigned char
|
|
vd = vec_ld( 0, &dst[i] );
|
|
|
|
// do comparison and get bool int result
|
|
vtbi0 = vec_cmple( vs0, constVec );
|
|
vtbi1 = vec_cmple( vs1, constVec );
|
|
vtbi2 = vec_cmple( vs2, constVec );
|
|
vtbi3 = vec_cmple( vs3, constVec );
|
|
|
|
// pack results into shorts
|
|
vtbs0 = vec_pack(vtbi0, vtbi1);
|
|
vtbs1 = vec_pack(vtbi2, vtbi3);
|
|
|
|
// pack results into byte
|
|
vtbc0 = vec_pack(vtbs0, vtbs1);
|
|
|
|
//and with 1L to get true=1 instead of true=255
|
|
vtuc0 = vec_and(vtbc0, oneVector);
|
|
vtuc0 = vec_sl(vtuc0, bitNumVec );
|
|
|
|
//or with original
|
|
vd = vec_or( vd, vtuc0 );
|
|
|
|
vec_st( vd, 0, &dst[i] );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] |= ( src0[i] <= constant ) << bitNum;
|
|
}
|
|
}
|
|
#endif /* ENABLE_COMPARES */
|
|
|
|
#ifdef ENABLE_MINMAX
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( float &min, float &max, const float *src, const int count ) {
|
|
min = idMath::INFINITY; max = -idMath::INFINITY;
|
|
//#define OPER(X) if ( src[(X)] < min ) {min = src[(X)];} if ( src[(X)] > max ) {max = src[(X)];}
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float maxVec, minVec, tempMin, tempMax;
|
|
register vector unsigned char permVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
vector unsigned char oneCharVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
//calculate permute and do first load to
|
|
//get a starting point for min and max
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[0] ), oneCharVector );
|
|
v1_hi = vec_ld( 0, &src[0] );
|
|
|
|
maxVec = loadSplatUnalignedScalar( &max );
|
|
minVec = loadSplatUnalignedScalar( &min );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load sources
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
// minimum
|
|
v2 = vec_min( v0, v1 );
|
|
minVec = vec_min( minVec, v2 );
|
|
// maximum
|
|
v3 = vec_max( v0, v1 );
|
|
maxVec = vec_max( maxVec, v3 );
|
|
}
|
|
|
|
//minVec and maxVec hold the min/max elements from the array, but now
|
|
//we need to figure out which particular element it is
|
|
|
|
tempMin = minVec;
|
|
tempMax = maxVec;
|
|
|
|
// rotate vector around and compare to itself to find the real min/max
|
|
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 8 ) );
|
|
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 8 ) );
|
|
tempMin = vec_min( tempMin, vec_sld( tempMin, tempMin, 4 ) );
|
|
tempMax = vec_max( tempMax, vec_sld( tempMax, tempMax, 4 ) );
|
|
minVec = vec_splat( tempMin, 0 );
|
|
maxVec = vec_splat( tempMax, 0 );
|
|
vec_ste( minVec, 0, &min );
|
|
vec_ste( maxVec, 0, &max );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < count; i++ ) {
|
|
if ( src[i] < min ) {
|
|
min = src[i];
|
|
}
|
|
if ( src[i] > max ) {
|
|
max = src[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec2 &min, idVec2 &max, const idVec2 *src, const int count ) {
|
|
min[0] = min[1] = idMath::INFINITY; max[0] = max[1] = -idMath::INFINITY;
|
|
//#define OPER(X) const idVec2 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; }
|
|
|
|
idVec2 v;
|
|
int i = 0;
|
|
int j;
|
|
|
|
const float *srcPtr = src[0].ToFloatPtr();
|
|
register vector float vecLd1, vecLd2, vecLd3, vecLd4;
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
|
|
if ( count > 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr ), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, srcPtr );
|
|
|
|
for ( i = 0, j = 0; i+7 < count; i += 8, j += 4) {
|
|
// load data
|
|
float *vecPtr = (float*)( srcPtr + (j*4) );
|
|
vector float v0, v1, v2, v3;
|
|
|
|
v0 = vecOld;
|
|
v1 = vec_ld( 15, vecPtr );
|
|
v2 = vec_ld( 31, vecPtr );
|
|
v3 = vec_ld( 47, vecPtr );
|
|
vecOld = vec_ld( 63, vecPtr );
|
|
|
|
vecLd1 = vec_perm( v0, v1, permVec );
|
|
vecLd2 = vec_perm( v1, v2, permVec );
|
|
vecLd3 = vec_perm( v2, v3, permVec );
|
|
vecLd4 = vec_perm( v3, vecOld, permVec );
|
|
|
|
// each of these vectors contains 2 elements
|
|
// looks like | X Y X Y | X Y X Y
|
|
v0 = vec_min( vecLd1, vecLd2 );
|
|
v1 = vec_min( vecLd3, vecLd4 );
|
|
v0 = vec_min( v0, v1 );
|
|
|
|
v2 = vec_max( vecLd1, vecLd2 );
|
|
v3 = vec_max( vecLd3, vecLd4 );
|
|
v2 = vec_max( v2, v3 );
|
|
|
|
// since its always X Y X Y we don't have to re-merge each time. we can wait
|
|
// until the end
|
|
vecMin = vec_min( v0, vecMin );
|
|
vecMax = vec_max( v2, vecMax );
|
|
}
|
|
|
|
vecMin = vec_min( vecMin, vec_sld( vecMin, vecMin, 8 ) );
|
|
vecMax = vec_max( vecMax, vec_sld( vecMax, vecMax, 8 ) );
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMax, 0 );
|
|
v3 = vec_splat( vecMax, 1 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &max[0] );
|
|
vec_ste( v3, 0, &max[1] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[i];
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idVec3 *src, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
//#define OPER(X) const idVec3 &v = src[(X)]; if ( v[0] < min[0] ) { min[0] = v[0]; } if ( v[0] > max[0] ) { max[0] = v[0]; } if ( v[1] < min[1] ) { min[1] = v[1]; } if ( v[1] > max[1] ) { max[1] = v[1]; } if ( v[2] < min[2] ) { min[2] = v[2]; } if ( v[2] > max[2] ) { max[2] = v[2]; }
|
|
|
|
int i = 0;
|
|
const float *srcPtr = src[0].ToFloatPtr();
|
|
idVec3 v;
|
|
|
|
register vector float vecLd1, vecLd2, vecLd3;
|
|
register vector float vecMin, vecMax;
|
|
register vector float vecSrc1, vecSrc2, vecSrc3, vecSrc4;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, srcPtr), (vector unsigned char)(1) );
|
|
vector float vecOld = vec_ld( 0, srcPtr );
|
|
|
|
// 4 elements at a time
|
|
for ( ; i+3 < count; i += 4 ) {
|
|
float *vecPtr = (float*)( srcPtr + (i*3) );
|
|
vector float v0, v1, v2;
|
|
|
|
v0 = vecOld;
|
|
v1 = vec_ld( 15, vecPtr );
|
|
v2 = vec_ld( 31, vecPtr );
|
|
vecOld = vec_ld( 47, vecPtr );
|
|
|
|
vecLd1 = vec_perm( v0, v1, permVec );
|
|
vecLd2 = vec_perm( v1, v2, permVec );
|
|
vecLd3 = vec_perm( v2, vecOld, permVec );
|
|
|
|
// put each idVec3 into its own vector as X Y Z (crap)
|
|
vecSrc1 = vecLd1;
|
|
vecSrc2 = vec_sld( vecLd1, vecLd2, 12 );
|
|
vecSrc3 = vec_sld( vecLd2, vecLd3, 8 );
|
|
vecSrc4 = vec_sld( vecLd3, vecLd3, 4 );
|
|
|
|
// do min and max
|
|
vecMin1 = vec_min( vecSrc1, vecSrc2 );
|
|
vecMin2 = vec_min( vecSrc3, vecSrc4 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( vecSrc1, vecSrc2 );
|
|
vecMax2 = vec_max( vecSrc3, vecSrc4 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax1, vecMax );
|
|
}
|
|
|
|
// store results
|
|
vector float v0, v1, v2, v3, v4, v5;
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i ++ ) {
|
|
v = src[i];
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
|
|
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
idVec3 v;
|
|
int i = 0;
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char vertPerm1 = vec_add( vec_lvsl( -1, (float*) src[i].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vector unsigned char vertPerm2 = vec_add( vec_lvsl( -1, (float*) src[i+1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vector unsigned char vertPerm3 = vec_add( vec_lvsl( -1, (float*) src[i+2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vector unsigned char vertPerm4 = vec_add( vec_lvsl( -1, (float*) src[i+3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 11, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 11, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 11, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 11, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v2 = vec_perm( v2, v3, vertPerm2 );
|
|
v4 = vec_perm( v4, v5, vertPerm3 );
|
|
v6 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[i].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int count ) {
|
|
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
idVec3 v;
|
|
int i = 0;
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[i].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
|
|
idVec3 v;
|
|
int i = 0;
|
|
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char vertPerm1;
|
|
vector unsigned char vertPerm2;
|
|
vector unsigned char vertPerm3;
|
|
vector unsigned char vertPerm4;
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
|
|
|
|
vertPerm1 = vec_add( vec_lvsl( -1, vertPtr ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, vertPtr2 ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, vertPtr3 ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, vertPtr4 ), (vector unsigned char)(1) );
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
v0 = vec_perm( v0, v1, vertPerm1 );
|
|
v2 = vec_perm( v2, v3, vertPerm2 );
|
|
v4 = vec_perm( v4, v5, vertPerm3 );
|
|
v6 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[indexes[i]].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MinMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MinMax( idVec3 &min, idVec3 &max, const idDrawVert *src, const int *indexes, const int count ) {
|
|
min[0] = min[1] = min[2] = idMath::INFINITY; max[0] = max[1] = max[2] = -idMath::INFINITY;
|
|
|
|
idVec3 v;
|
|
int i = 0;
|
|
|
|
register vector float vecMin, vecMax;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector float vecMin1, vecMin2, vecMax1, vecMax2;
|
|
|
|
if ( count >= 4 ) {
|
|
|
|
vecMin = (vector float)(FLT_MAX);
|
|
vecMax = (vector float)(FLT_MIN);
|
|
|
|
vector unsigned char vertPerm1;
|
|
vector unsigned char vertPerm2;
|
|
vector unsigned char vertPerm3;
|
|
vector unsigned char vertPerm4;
|
|
|
|
for ( ; i+3 < count; i += 4) {
|
|
const float *vertPtr = src[indexes[i]].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = src[indexes[i+1]].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = src[indexes[i+2]].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = src[indexes[i+3]].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
vecMin1 = vec_min( v0, v2 );
|
|
vecMin2 = vec_min( v4, v6 );
|
|
vecMin1 = vec_min( vecMin1, vecMin2 );
|
|
vecMin = vec_min( vecMin, vecMin1 );
|
|
|
|
vecMax1 = vec_max( v0, v2 );
|
|
vecMax2 = vec_max( v4, v6 );
|
|
vecMax1 = vec_max( vecMax1, vecMax2 );
|
|
vecMax = vec_max( vecMax, vecMax1 );
|
|
}
|
|
|
|
// now we have min/max vectors in X Y Z form, store out
|
|
v0 = vec_splat( vecMin, 0 );
|
|
v1 = vec_splat( vecMin, 1 );
|
|
v2 = vec_splat( vecMin, 2 );
|
|
v3 = vec_splat( vecMax, 0 );
|
|
v4 = vec_splat( vecMax, 1 );
|
|
v5 = vec_splat( vecMax, 2 );
|
|
|
|
vec_ste( v0, 0, &min[0] );
|
|
vec_ste( v1, 0, &min[1] );
|
|
vec_ste( v2, 0, &min[2] );
|
|
vec_ste( v3, 0, &max[0] );
|
|
vec_ste( v4, 0, &max[1] );
|
|
vec_ste( v5, 0, &max[2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < count; i++ ) {
|
|
v = src[indexes[i]].xyz;
|
|
|
|
if ( v[0] < min[0] ) {
|
|
min[0] = v[0];
|
|
}
|
|
if ( v[0] > max[0] ) {
|
|
max[0] = v[0];
|
|
}
|
|
|
|
if ( v[1] < min[1] ) {
|
|
min[1] = v[1];
|
|
}
|
|
if ( v[1] > max[1] ) {
|
|
max[1] = v[1];
|
|
}
|
|
|
|
if ( v[2] > max[2] ) {
|
|
max[2] = v[2];
|
|
}
|
|
|
|
if ( v[2] < min[2] ) {
|
|
min[2] = v[2];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#endif /* ENABLE_MINMAX */
|
|
|
|
#ifdef ENABLE_CLAMP
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Clamp
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Clamp( float *dst, const float *src, const float min, const float max, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)] > max ? max : src[(X)];
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
register vector unsigned char permVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
vector unsigned char oneVector = (vector unsigned char)(1);
|
|
register vector float minVec, maxVec;
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
|
|
}
|
|
|
|
//splat min/max into a vector
|
|
minVec = loadSplatUnalignedScalar( &min );
|
|
maxVec = loadSplatUnalignedScalar( &max );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
//apply minimum
|
|
v2 = vec_max( v0, minVec );
|
|
v3 = vec_max( v1, minVec );
|
|
|
|
//apply maximum
|
|
v4 = vec_min( v2, maxVec );
|
|
v5 = vec_min( v3, maxVec );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ClampMin
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::ClampMin( float *dst, const float *src, const float min, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)] < min ? min : src[(X)];
|
|
register vector float v0, v1, v2, v3;
|
|
register vector unsigned char permVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
register vector float constVec;
|
|
vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &min );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
|
|
v2 = vec_max( v0, constVec );
|
|
v3 = vec_max( v1, constVec );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src[i] < min ? min : src[i];
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ClampMax
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::ClampMax( float *dst, const float *src, const float max, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)] > max ? max : src[(X)];
|
|
register vector float v0, v1, v2, v3;
|
|
register vector unsigned char permVec;
|
|
register vector float constVec;
|
|
register vector float v0_low, v0_hi, v1_low, v1_hi;
|
|
vector unsigned char oneVector = (vector unsigned char)(1);
|
|
int i = 0;
|
|
|
|
//handle unaligned at start
|
|
for ( ; NOT_16BYTE_ALIGNED( dst[i] ) && ( i < count ); i++ ) {
|
|
dst[i] = src[i] < max ? max : src[i];
|
|
}
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &max );
|
|
|
|
//calculate permute and do first load
|
|
permVec = vec_add( vec_lvsl( -1, (int*) &src[i] ), oneVector );
|
|
v1_hi = vec_ld( 0, &src[i] );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count; i += 8 ) {
|
|
//load source
|
|
v0_low = v1_hi;
|
|
v0_hi = vec_ld( 15, &src[i] );
|
|
v1_low = v0_hi;
|
|
v1_hi = vec_ld( 31, &src[i] );
|
|
|
|
v0 = vec_perm( v0_low, v0_hi, permVec );
|
|
v1 = vec_perm( v1_low, v1_hi, permVec );
|
|
v2 = vec_min( v0, constVec );
|
|
v3 = vec_min( v1, constVec );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < count ; i++ ) {
|
|
dst[i] = src[i] < max ? max : src[i];
|
|
}
|
|
}
|
|
|
|
#endif /* ENABLE_CLAMP */
|
|
|
|
#ifdef ENABLE_16ROUTINES
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Zero16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Zero16( float *dst, const int count ) {
|
|
memset( dst, 0, count * sizeof( float ) );
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Negate16
|
|
|
|
Assumptions:
|
|
dst is aligned
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Negate16( float *dst, const int count ) {
|
|
//#define OPER(X) ptr[(X)] ^= ( 1 << 31 ) // IEEE 32 bits float sign bit
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
int i = 0;
|
|
vector float v0, v1, v2, v3;
|
|
|
|
//know its 16-byte aligned
|
|
for ( ; i + 7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_ld( 16, &dst[i] );
|
|
|
|
v2 = vec_sub( (vector float)(0), v0 );
|
|
v3 = vec_sub( (vector float)(0), v1 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_sub( (vector float)(0), v0 );
|
|
vec_st( v1, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Copy16
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Copy16( float *dst, const float *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src[(X)]
|
|
memcpy( dst, src, sizeof(float) * count );
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Add16
|
|
|
|
Assumptions:
|
|
Assumes dst, src1, src2 all start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Add16( float *dst, const float *src1, const float *src2, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src1[(X)] + src2[(X)]
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src1 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src1[0] ) );
|
|
// src2 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src2[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i = 0;
|
|
|
|
//know all data is 16-byte aligned, so vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
//load sources
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 16, &src1[i] );
|
|
v2 = vec_ld( 0, &src2[i] );
|
|
v3 = vec_ld( 16, &src2[i] );
|
|
v4 = vec_add( v0, v2 );
|
|
v5 = vec_add( v1, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 0, &src2[i] );
|
|
v2 = vec_add( v0, v1 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Sub16
|
|
|
|
Assumptions:
|
|
Assumes that dst, src1, and src2 all start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Sub16( float *dst, const float *src1, const float *src2, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src1[(X)] - src2[(X)]
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src1 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src1[0] ) );
|
|
// src2 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src2[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i = 0;
|
|
|
|
//know data is aligned, so vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
//load sources
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 16, &src1[i] );
|
|
v2 = vec_ld( 0, &src2[i] );
|
|
v3 = vec_ld( 16, &src2[i] );
|
|
v4 = vec_sub( v0, v2 );
|
|
v5 = vec_sub( v1, v3 );
|
|
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 0, &src2[i] );
|
|
v2 = vec_sub( v0, v1 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::Mul16
|
|
|
|
Assumptions:
|
|
Assumes that dst and src1 start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::Mul16( float *dst, const float *src1, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] = src1[(X)] * constant
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src1 is aligned
|
|
assert( IS_16BYTE_ALIGNED( src1[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float constVec;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
int i = 0;
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//know data is aligned, so vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
//load source
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_ld( 16, &src1[i] );
|
|
v2 = vec_madd( constVec, v0, zeroVector );
|
|
v3 = vec_madd( constVec, v1, zeroVector );
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src1[i] );
|
|
v1 = vec_madd( constVec, v0, zeroVector );
|
|
vec_st( v1, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::AddAssign16
|
|
|
|
Assumptions:
|
|
Assumes that dst and src start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::AddAssign16( float *dst, const float *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] += src[(X)]
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src is aligned
|
|
assert( IS_16BYTE_ALIGNED( src[0] ) );
|
|
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i = 0;
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 16, &src[i] );
|
|
v2 = vec_ld( 0, &dst[i] );
|
|
v3 = vec_ld( 16, &dst[i] );
|
|
v4 = vec_add( v0, v2 );
|
|
v5 = vec_add( v1, v3 );
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 0, &dst[i] );
|
|
v2 = vec_add( v0, v1 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::SubAssign16
|
|
|
|
Assumptions:
|
|
Assumes that dst and src start at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::SubAssign16( float *dst, const float *src, const int count ) {
|
|
//#define OPER(X) dst[(X)] -= src[(X)]
|
|
register vector float v0, v1, v2, v3, v4, v5;
|
|
int i=0;
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// src is aligned
|
|
assert( IS_16BYTE_ALIGNED( src[0] ) );
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 16, &src[i] );
|
|
v2 = vec_ld( 0, &dst[i] );
|
|
v3 = vec_ld( 16, &dst[i] );
|
|
v4 = vec_sub( v2, v0 );
|
|
v5 = vec_sub( v3, v1 );
|
|
ALIGNED_STORE2( &dst[i], v4, v5 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &src[i] );
|
|
v1 = vec_ld( 0, &dst[i] );
|
|
v2 = vec_sub( v1, v0 );
|
|
vec_st( v2, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MulAssign16
|
|
|
|
Assumptions:
|
|
Assumes that dst starts at aligned address and count is multiple of 4
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MulAssign16( float *dst, const float constant, const int count ) {
|
|
//#define OPER(X) dst[(X)] *= constant
|
|
|
|
// dst is aligned
|
|
assert( IS_16BYTE_ALIGNED( dst[0] ) );
|
|
// round count up to next 4 if needbe
|
|
int count2 = ( count + 3 ) & ~3;
|
|
|
|
register vector float v0, v1, v2, v3;
|
|
register vector float constVec;
|
|
int i = 0;
|
|
register vector float zeroVector = (vector float)(0.0);
|
|
|
|
//splat constant into a vector
|
|
constVec = loadSplatUnalignedScalar( &constant );
|
|
|
|
//vectorize!
|
|
for ( ; i+7 < count2; i += 8 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_ld( 16, &dst[i] );
|
|
v2 = vec_madd( v0, constVec, zeroVector );
|
|
v3 = vec_madd( v1, constVec, zeroVector );
|
|
ALIGNED_STORE2( &dst[i], v2, v3 );
|
|
}
|
|
|
|
for ( ; i < count2; i += 4 ) {
|
|
v0 = vec_ld( 0, &dst[i] );
|
|
v1 = vec_madd( v0, constVec, zeroVector );
|
|
vec_st( v1, 0, &dst[i] );
|
|
}
|
|
}
|
|
|
|
#endif /* ENABLE_16ROUTINES */
|
|
|
|
#ifdef ENABLE_LOWER_TRIANGULAR
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MatX_LowerTriangularSolve
|
|
|
|
solves x in L * x = b for the first n rows of L
|
|
if skip > 0 the first skip elements of x are assumed to be valid already
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
|
|
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
|
|
|
|
int i, j;
|
|
const float *lptr;
|
|
const float *lptr2;
|
|
const float *lptr3;
|
|
const float *lptr4;
|
|
float sum;
|
|
float sum2;
|
|
float sum3;
|
|
float sum4;
|
|
float tempSum;
|
|
float tempSum2;
|
|
float tempSum3;
|
|
float tempSum4;
|
|
vector float vecSum1 = (vector float)(0.0);
|
|
vector float vecSum2 = (vector float)(0.0);
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
|
|
vector float zeroVector = (vector float)(0.0);
|
|
vector float vecSum3, vecSum4, vecSum5, vecSum6, vecSum7, vecSum8;
|
|
|
|
vector unsigned char vecPermX = vec_add( vec_lvsl( -1, &x[0] ), (vector unsigned char)(1) );
|
|
|
|
// unrolled this loop a bit
|
|
for ( i = skip; i+3 < n; i+=4 ) {
|
|
sum = b[i];
|
|
sum2 = b[i+1];
|
|
sum3 = b[i+2];
|
|
sum4 = b[i+3];
|
|
|
|
vecSum1 = zeroVector;
|
|
vecSum2 = zeroVector;
|
|
vecSum3 = vecSum4 = vecSum5 = vecSum6 = vecSum7 = vecSum8 = zeroVector;
|
|
lptr = L[i];
|
|
lptr2 = L[i+1];
|
|
lptr3 = L[i+2];
|
|
lptr4 = L[i+3];
|
|
|
|
vector unsigned char vecPermLptr1 = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
|
|
vector unsigned char vecPermLptr2 = vec_add( vec_lvsl( -1, lptr2 ), (vector unsigned char)(1) );
|
|
vector unsigned char vecPermLptr3 = vec_add( vec_lvsl( -1, lptr3 ), (vector unsigned char)(1) );
|
|
vector unsigned char vecPermLptr4 = vec_add( vec_lvsl( -1, lptr4 ), (vector unsigned char)(1) );
|
|
|
|
for ( j = 0 ; j+7 < i; j+=8 ) {
|
|
|
|
v0 = vec_ld( 0, &x[j] );
|
|
v1 = vec_ld( 15, &x[j] );
|
|
vector float vecExtraX = vec_ld( 31, &x[j] );
|
|
v0 = vec_perm( v0, v1, vecPermX );
|
|
v1 = vec_perm( v1, vecExtraX, vecPermX );
|
|
|
|
v2 = vec_ld( 0, lptr + j );
|
|
v3 = vec_ld( 15, lptr + j );
|
|
vector float vecExtra1 = vec_ld( 31, lptr + j );
|
|
v2 = vec_perm( v2, v3, vecPermLptr1 );
|
|
v3 = vec_perm( v3, vecExtra1, vecPermLptr1 );
|
|
|
|
v4 = vec_ld( 0, lptr2 + j );
|
|
v5 = vec_ld( 15, lptr2 + j );
|
|
vector float vecExtra2 = vec_ld( 31, lptr2 + j );
|
|
v4 = vec_perm( v4, v5, vecPermLptr2 );
|
|
v5 = vec_perm( v5, vecExtra2, vecPermLptr2 );
|
|
|
|
v6 = vec_ld( 0, lptr3 + j );
|
|
v7 = vec_ld( 15, lptr3 + j );
|
|
vector float vecExtra3 = vec_ld( 31, lptr3 + j );
|
|
v6 = vec_perm( v6, v7, vecPermLptr3 );
|
|
v7 = vec_perm( v7, vecExtra3, vecPermLptr3 );
|
|
|
|
v8 = vec_ld( 0, lptr4 + j );
|
|
v9 = vec_ld( 15, lptr4 + j );
|
|
vector float vecExtra4 = vec_ld( 31, lptr4 + j );
|
|
v8 = vec_perm( v8, v9, vecPermLptr4 );
|
|
v9 = vec_perm( v9, vecExtra4, vecPermLptr4 );
|
|
|
|
vecSum1 = vec_madd( v2, v0, vecSum1 );
|
|
vecSum2 = vec_madd( v3, v1, vecSum2 );
|
|
|
|
vecSum3 = vec_madd( v4, v0, vecSum3 );
|
|
vecSum4 = vec_madd( v5, v1, vecSum4 );
|
|
|
|
vecSum5 = vec_madd( v6, v0, vecSum5 );
|
|
vecSum6 = vec_madd( v7, v1, vecSum6 );
|
|
|
|
vecSum7 = vec_madd( v8, v0, vecSum7 );
|
|
vecSum8 = vec_madd( v9, v1, vecSum8 );
|
|
}
|
|
|
|
// if we ran the unrolled code, we need to sum accross the vectors
|
|
// to find out how much to subtract from sum
|
|
if ( j > 0 ) {
|
|
vecSum1 = vec_add( vecSum1, vecSum2 );
|
|
vecSum3 = vec_add( vecSum3, vecSum4 );
|
|
vecSum5 = vec_add( vecSum5, vecSum6 );
|
|
vecSum7 = vec_add( vecSum7, vecSum8 );
|
|
//sum accross the vectors
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
|
|
|
|
vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 8 ) );
|
|
vecSum3 = vec_add( vecSum3, vec_sld( vecSum3, vecSum3, 4 ) );
|
|
|
|
vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 8 ) );
|
|
vecSum5 = vec_add( vecSum5, vec_sld( vecSum5, vecSum5, 4 ) );
|
|
|
|
vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 8 ) );
|
|
vecSum7 = vec_add( vecSum7, vec_sld( vecSum7, vecSum7, 4 ) );
|
|
|
|
//move the result to the FPU
|
|
vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
|
|
vec_ste( vec_splat( vecSum3, 0 ), 0, &tempSum2 );
|
|
vec_ste( vec_splat( vecSum5, 0 ), 0, &tempSum3 );
|
|
vec_ste( vec_splat( vecSum7, 0 ), 0, &tempSum4 );
|
|
|
|
sum -= tempSum;
|
|
sum2 -= tempSum2;
|
|
sum3 -= tempSum3;
|
|
sum4 -= tempSum4;
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; j < i; j++ ) {
|
|
sum -= lptr[j] * x[j];
|
|
sum2 -= lptr2[j] * x[j];
|
|
sum3 -= lptr3[j] * x[j];
|
|
sum4 -= lptr4[j] * x[j];
|
|
}
|
|
|
|
// store the 4 results at a time
|
|
sum2 -= ( lptr2[i] * sum );
|
|
sum3 = sum3 - ( lptr3[i+1] * sum2 ) - ( lptr3[i] * sum );
|
|
sum4 = sum4 - ( lptr4[i+2] * sum3 ) - ( lptr4[i+1] * sum2 ) - ( lptr4[i] * sum );
|
|
|
|
x[i] = sum;
|
|
x[i+1] = sum2;
|
|
x[i+2] = sum3;
|
|
x[i+3] = sum4;
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < n; i++ ) {
|
|
sum = b[i];
|
|
vecSum1 = zeroVector;
|
|
vecSum2 = zeroVector;
|
|
lptr = L[i];
|
|
vector unsigned char vecPermLptr = vec_add( vec_lvsl( -1, lptr ), (vector unsigned char)(1) );
|
|
|
|
for ( j = 0 ; j+7 < i; j+=8 ) {
|
|
|
|
v0 = vec_ld( 0, &x[j] );
|
|
v2 = vec_ld( 15, &x[j] );
|
|
vector float vecExtraX = vec_ld( 31, &x[j] );
|
|
v0 = vec_perm( v0, v2, vecPermX );
|
|
v2 = vec_perm( v2, vecExtraX, vecPermX );
|
|
|
|
v1 = vec_ld( 0, lptr + j );
|
|
v3 = vec_ld( 15, lptr + j );
|
|
vector float vecExtra = vec_ld( 31, lptr + j );
|
|
v1 = vec_perm( v1, v3, vecPermLptr );
|
|
v3 = vec_perm( v3, vecExtra, vecPermLptr );
|
|
|
|
vecSum1 = vec_madd( v1, v0, vecSum1 );
|
|
vecSum2 = vec_madd( v3, v2, vecSum2 );
|
|
}
|
|
|
|
// if we ran the unrolled code, we need to sum accross the vectors
|
|
// to find out how much to subtract from sum
|
|
if ( j > 0 ) {
|
|
//sum accross the vectors
|
|
vecSum1 = vec_add( vecSum1, vecSum2 );
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 8 ) );
|
|
vecSum1 = vec_add( vecSum1, vec_sld( vecSum1, vecSum1, 4 ) );
|
|
|
|
//move the result to the FPU
|
|
vec_ste( vec_splat( vecSum1, 0 ), 0, &tempSum );
|
|
sum -= tempSum;
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; j < i; j++ ) {
|
|
sum -= lptr[j] * x[j];
|
|
}
|
|
x[i] = sum;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose
|
|
|
|
solves x in L.Transpose() * x = b for the first n rows of L
|
|
L has to be a lower triangular matrix with (implicit) ones on the diagonal
|
|
x == b is allowed
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
|
|
|
|
int nc;
|
|
const float *lptr;
|
|
|
|
lptr = L.ToFloatPtr();
|
|
nc = L.GetNumColumns();
|
|
|
|
float x0, x1, x2, x3, x4, x5, x6;
|
|
// unrolled cases for n < 8
|
|
if ( n < 8 ) {
|
|
switch( n ) {
|
|
// using local variables to avoid aliasing issues
|
|
case 0:
|
|
return;
|
|
case 1:
|
|
x[0] = b[0];
|
|
return;
|
|
case 2:
|
|
x1 = b[1];
|
|
x0 = b[0] - lptr[1*nc+0] * x1;
|
|
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
case 3:
|
|
x2 = b[2];
|
|
x1 = b[1] - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
case 4:
|
|
x3 = b[3];
|
|
x2 = b[2] - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
|
|
return;
|
|
case 5:
|
|
x4 = b[4];
|
|
x3 = b[3] - lptr[4*nc+3] * x4;
|
|
x2 = b[2] - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[4] = x4;
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
case 6:
|
|
x5 = b[5];
|
|
x4 = b[4] - lptr[5*nc+4] * x5;
|
|
x3 = b[3] - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
|
|
x2 = b[2] - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[5] = x5;
|
|
x[4] = x4;
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
|
|
return;
|
|
case 7:
|
|
x6 = b[6];
|
|
x5 = b[5] - lptr[6*nc+5] * x6;
|
|
x4 = b[4] - lptr[6*nc+4] * x6 - lptr[5*nc+4] * x5;
|
|
x3 = b[3] - lptr[6*nc+3] * x6 - lptr[5*nc+3] * x5 - lptr[4*nc+3] * x4;
|
|
x2 = b[2] - lptr[6*nc+2] * x6 - lptr[5*nc+2] * x5 - lptr[4*nc+2] * x4 - lptr[3*nc+2] * x3;
|
|
x1 = b[1] - lptr[6*nc+1] * x6 - lptr[5*nc+1] * x5 - lptr[4*nc+1] * x4 - lptr[3*nc+1] * x3 - lptr[2*nc+1] * x2;
|
|
x0 = b[0] - lptr[6*nc+0] * x6 - lptr[5*nc+0] * x5 - lptr[4*nc+0] * x4 - lptr[3*nc+0] * x3 - lptr[2*nc+0] * x2 - lptr[1*nc+0] * x1;
|
|
|
|
x[6] = x6;
|
|
x[5] = x5;
|
|
x[4] = x4;
|
|
x[3] = x3;
|
|
x[2] = x2;
|
|
x[1] = x1;
|
|
x[0] = x0;
|
|
return;
|
|
}
|
|
return;
|
|
}
|
|
|
|
int i, j;
|
|
register float s0, s1, s2, s3;
|
|
float *xptr;
|
|
|
|
lptr = L.ToFloatPtr() + n * nc + n - 4;
|
|
xptr = x + n;
|
|
|
|
// process 4 rows at a time
|
|
for ( i = n; i >= 4; i -= 4 ) {
|
|
s0 = b[i-4];
|
|
s1 = b[i-3];
|
|
s2 = b[i-2];
|
|
s3 = b[i-1];
|
|
// process 4x4 blocks
|
|
for ( j = 0; j < n-i; j += 4 ) {
|
|
s0 -= lptr[(j+0)*nc+0] * xptr[j+0];
|
|
s1 -= lptr[(j+0)*nc+1] * xptr[j+0];
|
|
s2 -= lptr[(j+0)*nc+2] * xptr[j+0];
|
|
s3 -= lptr[(j+0)*nc+3] * xptr[j+0];
|
|
s0 -= lptr[(j+1)*nc+0] * xptr[j+1];
|
|
s1 -= lptr[(j+1)*nc+1] * xptr[j+1];
|
|
s2 -= lptr[(j+1)*nc+2] * xptr[j+1];
|
|
s3 -= lptr[(j+1)*nc+3] * xptr[j+1];
|
|
s0 -= lptr[(j+2)*nc+0] * xptr[j+2];
|
|
s1 -= lptr[(j+2)*nc+1] * xptr[j+2];
|
|
s2 -= lptr[(j+2)*nc+2] * xptr[j+2];
|
|
s3 -= lptr[(j+2)*nc+3] * xptr[j+2];
|
|
s0 -= lptr[(j+3)*nc+0] * xptr[j+3];
|
|
s1 -= lptr[(j+3)*nc+1] * xptr[j+3];
|
|
s2 -= lptr[(j+3)*nc+2] * xptr[j+3];
|
|
s3 -= lptr[(j+3)*nc+3] * xptr[j+3];
|
|
}
|
|
// process left over of the 4 rows
|
|
s0 -= lptr[0-1*nc] * s3;
|
|
s1 -= lptr[1-1*nc] * s3;
|
|
s2 -= lptr[2-1*nc] * s3;
|
|
s0 -= lptr[0-2*nc] * s2;
|
|
s1 -= lptr[1-2*nc] * s2;
|
|
s0 -= lptr[0-3*nc] * s1;
|
|
// store result
|
|
xptr[-4] = s0;
|
|
xptr[-3] = s1;
|
|
xptr[-2] = s2;
|
|
xptr[-1] = s3;
|
|
// update pointers for next four rows
|
|
lptr -= 4 + 4 * nc;
|
|
xptr -= 4;
|
|
}
|
|
// process left over rows
|
|
for ( i--; i >= 0; i-- ) {
|
|
s0 = b[i];
|
|
lptr = L[0] + i;
|
|
for ( j = i + 1; j < n; j++ ) {
|
|
s0 -= lptr[j*nc] * x[j];
|
|
}
|
|
x[i] = s0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MatX_LDLT_Factor
|
|
============
|
|
*/
|
|
unsigned char VPCALL idSIMD_AltiVec::MatX_LDLT_Factor( idMatX &mat, idVecX &invDiag, const int n ) {
|
|
int i, j, k, nc;
|
|
float *v, *diag, *mptr;
|
|
float s0, s1, s2, s3, sum, d;
|
|
float s0_2, s1_2, s2_2, s3_2, sum_2;
|
|
float *mptr2;
|
|
|
|
v = (float *) _alloca16( n * sizeof( float ) );
|
|
diag = (float *) _alloca16( n * sizeof( float ) );
|
|
|
|
nc = mat.GetNumColumns();
|
|
|
|
if ( n <= 0 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
|
|
sum = mptr[0];
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
diag[0] = sum;
|
|
invDiag[0] = d = 1.0f / sum;
|
|
|
|
if ( n <= 1 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 1; j < n; j++ ) {
|
|
mptr[j*nc+0] = ( mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[1];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
sum = mptr[1] - s0;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[1][1] = sum;
|
|
diag[1] = sum;
|
|
invDiag[1] = d = 1.0f / sum;
|
|
|
|
if ( n <= 2 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 2; j < n; j++ ) {
|
|
mptr[j*nc+1] = ( mptr[j*nc+1] - v[0] * mptr[j*nc+0] ) * d;
|
|
}
|
|
|
|
mptr = mat[2];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
sum = mptr[2] - s0 - s1;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[2][2] = sum;
|
|
diag[2] = sum;
|
|
invDiag[2] = d = 1.0f / sum;
|
|
|
|
if ( n <= 3 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 3; j < n; j++ ) {
|
|
mptr[j*nc+2] = ( mptr[j*nc+2] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] ) * d;
|
|
}
|
|
|
|
mptr = mat[3];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
sum = mptr[3] - s0 - s1 - s2;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[3][3] = sum;
|
|
diag[3] = sum;
|
|
invDiag[3] = d = 1.0f / sum;
|
|
|
|
if ( n <= 4 ) {
|
|
return true;
|
|
}
|
|
|
|
mptr = mat[0];
|
|
for ( j = 4; j < n; j++ ) {
|
|
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
|
|
}
|
|
|
|
for ( i = 4; i < n; i++ ) {
|
|
|
|
mptr = mat[i];
|
|
|
|
v[0] = diag[0] * mptr[0]; s0 = v[0] * mptr[0];
|
|
v[1] = diag[1] * mptr[1]; s1 = v[1] * mptr[1];
|
|
v[2] = diag[2] * mptr[2]; s2 = v[2] * mptr[2];
|
|
v[3] = diag[3] * mptr[3]; s3 = v[3] * mptr[3];
|
|
for ( k = 4; k < i-3; k += 4 ) {
|
|
v[k+0] = diag[k+0] * mptr[k+0]; s0 += v[k+0] * mptr[k+0];
|
|
v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
v[k+2] = diag[k+2] * mptr[k+2]; s2 += v[k+2] * mptr[k+2];
|
|
v[k+3] = diag[k+3] * mptr[k+3]; s3 += v[k+3] * mptr[k+3];
|
|
}
|
|
switch( i - k ) {
|
|
case 3: v[k+2] = diag[k+2] * mptr[k+2]; s0 += v[k+2] * mptr[k+2];
|
|
case 2: v[k+1] = diag[k+1] * mptr[k+1]; s1 += v[k+1] * mptr[k+1];
|
|
case 1: v[k+0] = diag[k+0] * mptr[k+0]; s2 += v[k+0] * mptr[k+0];
|
|
}
|
|
sum = s3;
|
|
sum += s2;
|
|
sum += s1;
|
|
sum += s0;
|
|
sum = mptr[i] - sum;
|
|
|
|
if ( sum == 0.0f ) {
|
|
return false;
|
|
}
|
|
|
|
mat[i][i] = sum;
|
|
diag[i] = sum;
|
|
invDiag[i] = d = 1.0f / sum;
|
|
|
|
if ( i + 1 >= n ) {
|
|
return true;
|
|
}
|
|
|
|
// unrolling madness!
|
|
mptr = mat[i+1];
|
|
mptr2 = mat[i+1] + nc;
|
|
|
|
for ( j = i+1; j+1 < n; j+=2 ) {
|
|
s0 = mptr[0] * v[0];
|
|
s1 = mptr[1] * v[1];
|
|
s2 = mptr[2] * v[2];
|
|
s3 = mptr[3] * v[3];
|
|
|
|
s0_2 = mptr2[0] * v[0];
|
|
s1_2 = mptr2[1] * v[1];
|
|
s2_2 = mptr2[2] * v[2];
|
|
s3_2 = mptr2[3] * v[3];
|
|
|
|
for ( k = 4; k < i-7; k += 8 ) {
|
|
s0 += mptr[k+0] * v[k+0];
|
|
s1 += mptr[k+1] * v[k+1];
|
|
s2 += mptr[k+2] * v[k+2];
|
|
s3 += mptr[k+3] * v[k+3];
|
|
s0 += mptr[k+4] * v[k+4];
|
|
s1 += mptr[k+5] * v[k+5];
|
|
s2 += mptr[k+6] * v[k+6];
|
|
s3 += mptr[k+7] * v[k+7];
|
|
|
|
s0_2 += mptr2[k+0] * v[k+0];
|
|
s1_2 += mptr2[k+1] * v[k+1];
|
|
s2_2 += mptr2[k+2] * v[k+2];
|
|
s3_2 += mptr2[k+3] * v[k+3];
|
|
s0_2 += mptr2[k+4] * v[k+4];
|
|
s1_2 += mptr2[k+5] * v[k+5];
|
|
s2_2 += mptr2[k+6] * v[k+6];
|
|
s3_2 += mptr2[k+7] * v[k+7];
|
|
}
|
|
|
|
switch( i - k ) {
|
|
case 7: s0 += mptr[k+6] * v[k+6]; s0_2 += mptr2[k+6] * v[k+6];
|
|
case 6: s1 += mptr[k+5] * v[k+5]; s1_2 += mptr2[k+5] * v[k+5];
|
|
case 5: s2 += mptr[k+4] * v[k+4]; s2_2 += mptr2[k+4] * v[k+4];
|
|
case 4: s3 += mptr[k+3] * v[k+3]; s3_2 += mptr2[k+3] * v[k+3];
|
|
case 3: s0 += mptr[k+2] * v[k+2]; s0_2 += mptr2[k+2] * v[k+2];
|
|
case 2: s1 += mptr[k+1] * v[k+1]; s1_2 += mptr2[k+1] * v[k+1];
|
|
case 1: s2 += mptr[k+0] * v[k+0]; s2_2 += mptr2[k+0] * v[k+0];
|
|
}
|
|
// disassociate these adds
|
|
s3 += s2;
|
|
s1 += s0;
|
|
sum = s1 + s3;
|
|
|
|
s3_2 += s2_2;
|
|
s1_2 += s0_2;
|
|
sum_2 = s1_2 + s3_2;
|
|
|
|
mptr[i] = ( mptr[i] - sum ) * d;
|
|
mptr2[i] = ( mptr2[i] - sum_2 ) * d;
|
|
|
|
mptr += nc*2;
|
|
mptr2 += nc*2;
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; j < n; j++ ) {
|
|
s0 = mptr[0] * v[0];
|
|
s1 = mptr[1] * v[1];
|
|
s2 = mptr[2] * v[2];
|
|
s3 = mptr[3] * v[3];
|
|
for ( k = 4; k < i-7; k += 8 ) {
|
|
s0 += mptr[k+0] * v[k+0];
|
|
s1 += mptr[k+1] * v[k+1];
|
|
s2 += mptr[k+2] * v[k+2];
|
|
s3 += mptr[k+3] * v[k+3];
|
|
s0 += mptr[k+4] * v[k+4];
|
|
s1 += mptr[k+5] * v[k+5];
|
|
s2 += mptr[k+6] * v[k+6];
|
|
s3 += mptr[k+7] * v[k+7];
|
|
}
|
|
switch( i - k ) {
|
|
case 7: s0 += mptr[k+6] * v[k+6];
|
|
case 6: s1 += mptr[k+5] * v[k+5];
|
|
case 5: s2 += mptr[k+4] * v[k+4];
|
|
case 4: s3 += mptr[k+3] * v[k+3];
|
|
case 3: s0 += mptr[k+2] * v[k+2];
|
|
case 2: s1 += mptr[k+1] * v[k+1];
|
|
case 1: s2 += mptr[k+0] * v[k+0];
|
|
}
|
|
// disassociate these adds
|
|
s3 += s2;
|
|
s1 += s0;
|
|
sum = s1 + s3;
|
|
mptr[i] = ( mptr[i] - sum ) * d;
|
|
mptr += nc;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
#endif /* ENABLE_LOWER_TRIANGULAR */
|
|
|
|
|
|
#ifdef LIVE_VICARIOUSLY
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::BlendJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::BlendJoints( idJointQuat *joints, const idJointQuat *blendJoints, const float lerp, const int *index, const int numJoints ) {
|
|
int i;
|
|
|
|
// since lerp is a constant, we can special case the two cases if they're true
|
|
if ( lerp <= 0.0f ) {
|
|
// this sets joints back to joints. No sense in doing no work, so just return
|
|
return;
|
|
}
|
|
|
|
if ( lerp >= 1.0f ) {
|
|
// this copies each q from blendJoints to joints and copies each t from blendJoints to joints
|
|
memcpy( joints[0].q.ToFloatPtr(), blendJoints[0].q.ToFloatPtr(), sizeof(idJointQuat) * numJoints );
|
|
return;
|
|
}
|
|
|
|
vector float vecLerp = loadSplatUnalignedScalar( &lerp );
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
for ( i = 0; i+3 < numJoints; i+=4 ) {
|
|
int j = index[i];
|
|
int j2 = index[i+1];
|
|
int j3 = index[i+2];
|
|
int j4 = index[i+3];
|
|
|
|
// slerp
|
|
const float *jointPtr = joints[j].q.ToFloatPtr();
|
|
const float *blendPtr = blendJoints[j].q.ToFloatPtr();
|
|
const float *jointPtr2 = joints[j2].q.ToFloatPtr();
|
|
const float *blendPtr2 = blendJoints[j2].q.ToFloatPtr();
|
|
const float *jointPtr3 = joints[j3].q.ToFloatPtr();
|
|
const float *blendPtr3 = blendJoints[j3].q.ToFloatPtr();
|
|
const float *jointPtr4 = joints[j4].q.ToFloatPtr();
|
|
const float *blendPtr4 = blendJoints[j4].q.ToFloatPtr();
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, jointPtr2 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec3 = vec_add( vec_lvsl( -1, jointPtr3 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec4 = vec_add( vec_lvsl( -1, jointPtr4 ), (vector unsigned char)(1) );
|
|
|
|
vector unsigned char permVec5 = vec_add( vec_lvsl( -1, blendPtr ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec6 = vec_add( vec_lvsl( -1, blendPtr2 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec7 = vec_add( vec_lvsl( -1, blendPtr3 ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec8 = vec_add( vec_lvsl( -1, blendPtr4 ), (vector unsigned char)(1) );
|
|
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
|
|
vector float v12, v13, v14, v15, v16;
|
|
vector float vecFromX, vecFromY, vecFromZ, vecFromW;
|
|
vector float vecToX, vecToY, vecToZ, vecToW;
|
|
|
|
// load up the the idJointQuats from joints
|
|
v0 = vec_ld( 0, jointPtr );
|
|
v1 = vec_ld( 15, jointPtr );
|
|
v2 = vec_perm( v0, v1, permVec );
|
|
|
|
v3 = vec_ld( 0, jointPtr2 );
|
|
v4 = vec_ld( 15, jointPtr2 );
|
|
v5 = vec_perm( v3, v4, permVec2 );
|
|
|
|
v6 = vec_ld( 0, jointPtr3 );
|
|
v7 = vec_ld( 15, jointPtr3 );
|
|
v8 = vec_perm( v6, v7, permVec3 );
|
|
|
|
v9 = vec_ld( 0, jointPtr4 );
|
|
v10 = vec_ld( 15, jointPtr4 );
|
|
v11 = vec_perm( v9, v10, permVec4 );
|
|
|
|
// planarizing, so put each x y z w into its own vector
|
|
v0 = vec_mergeh( v2, v8 );
|
|
v1 = vec_mergeh( v5, v11 );
|
|
v3 = vec_mergel( v2, v8 );
|
|
v4 = vec_mergel( v5, v11 );
|
|
|
|
vecFromX = vec_mergeh( v0, v1 );
|
|
vecFromY = vec_mergel( v0, v1 );
|
|
vecFromZ = vec_mergeh( v3, v4 );
|
|
vecFromW = vec_mergel( v3, v4 );
|
|
|
|
// load up idJointQuats from blendJoints
|
|
v5 = vec_ld( 0, blendPtr );
|
|
v6 = vec_ld( 15, blendPtr );
|
|
v7 = vec_perm( v5, v6, permVec5 );
|
|
|
|
v8 = vec_ld( 0, blendPtr2 );
|
|
v9 = vec_ld( 15, blendPtr2 );
|
|
v10 = vec_perm( v8, v9, permVec6 );
|
|
|
|
v11 = vec_ld( 0, blendPtr3 );
|
|
v12 = vec_ld( 15, blendPtr3 );
|
|
v13 = vec_perm( v11, v12, permVec7 );
|
|
|
|
v14 = vec_ld( 0, blendPtr4 );
|
|
v15 = vec_ld( 15, blendPtr4 );
|
|
v16 = vec_perm( v14, v15, permVec8 );
|
|
|
|
// put these into their own vectors too
|
|
v5 = vec_mergeh( v7, v13 );
|
|
v6 = vec_mergeh( v10, v16 );
|
|
v8 = vec_mergel( v7, v13 );
|
|
v9 = vec_mergel( v10, v16 );
|
|
|
|
vecToX = vec_mergeh( v5, v6 );
|
|
vecToY = vec_mergel( v5, v6 );
|
|
vecToZ = vec_mergeh( v8, v9 );
|
|
vecToW = vec_mergel( v8, v9 );
|
|
|
|
// calculate cosom
|
|
vector float vecCosom = vec_madd( vecFromX, vecToX, (vector float)(0) );
|
|
vecCosom = vec_madd( vecFromY, vecToY, vecCosom );
|
|
vecCosom = vec_madd( vecFromZ, vecToZ, vecCosom );
|
|
vecCosom = vec_madd( vecFromW, vecToW, vecCosom );
|
|
|
|
// if cosom is < 0, negate it and set temp to negated elements in to. otherwise, set temp to
|
|
// to
|
|
vector bool int vecCmp, vecCmp2;
|
|
vecCmp = vec_cmplt( vecCosom, zeroVector );
|
|
|
|
// negate if needed
|
|
vecToX = vec_sel( vecToX, vec_madd( vecToX, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecToY = vec_sel( vecToY, vec_madd( vecToY, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecToZ = vec_sel( vecToZ, vec_madd( vecToZ, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecToW = vec_sel( vecToW, vec_madd( vecToW, (vector float)(-1), zeroVector ), vecCmp );
|
|
vecCosom = vec_sel( vecCosom, vec_madd( vecCosom, (vector float)(-1), zeroVector ), vecCmp );
|
|
|
|
// check if we need to calculate scale
|
|
vecCmp2 = vec_cmpgt( vec_sub( (vector float)(1), vecCosom ), (vector float)(1e-6f) );
|
|
vector float vecScale0 = vec_sub( (vector float)(1), vecLerp );
|
|
vector float vecScale1 = vec_splat( vecLerp, 0 );
|
|
|
|
vector float vecWork1 = vec_sub( (vector float)(1), vec_madd( vecCosom, vecCosom, zeroVector ) );
|
|
vector float vecWork2 = ReciprocalSquareRoot( vecWork1 );
|
|
vector float vecWork3 = VectorATan16( vec_madd( vecWork1, vecWork2, zeroVector ), vecCosom );
|
|
|
|
vecWork1 = vec_madd( VectorSin16( vec_madd( vecScale0, vecWork3, zeroVector ) ), vecWork2, zeroVector );
|
|
vecWork2 = vec_madd( VectorSin16( vec_madd( vecLerp, vecWork3, zeroVector ) ), vecWork2, zeroVector );
|
|
|
|
// see which ones we have to insert into our scale0 and scale1 vectors
|
|
vecScale0 = vec_sel( vecScale0, vecWork1, vecCmp2 );
|
|
vecScale1 = vec_sel( vecScale1, vecWork2, vecCmp2 );
|
|
|
|
// multiply each element by the scale
|
|
vecFromX = vec_madd( vecFromX, vecScale0, zeroVector );
|
|
vecFromY = vec_madd( vecFromY, vecScale0, zeroVector );
|
|
vecFromZ = vec_madd( vecFromZ, vecScale0, zeroVector );
|
|
vecFromW = vec_madd( vecFromW, vecScale0, zeroVector );
|
|
|
|
// multiply temp by scale and add to result
|
|
vecFromX = vec_madd( vecToX, vecScale1, vecFromX );
|
|
vecFromY = vec_madd( vecToY, vecScale1, vecFromY );
|
|
vecFromZ = vec_madd( vecToZ, vecScale1, vecFromZ );
|
|
vecFromW = vec_madd( vecToW, vecScale1, vecFromW );
|
|
|
|
// do a transform again to get the results back to vectors we can store out
|
|
v5 = vec_mergeh( vecFromX, vecFromZ );
|
|
v6 = vec_mergeh( vecFromY, vecFromW );
|
|
v8 = vec_mergel( vecFromX, vecFromZ );
|
|
v9 = vec_mergel( vecFromY, vecFromW );
|
|
|
|
vecToX = vec_mergeh( v5, v6 );
|
|
vecToY = vec_mergel( v5, v6 );
|
|
vecToZ = vec_mergeh( v8, v9 );
|
|
vecToW = vec_mergel( v8, v9 );
|
|
|
|
vector unsigned char storePerm1 = vec_lvsr( 0, jointPtr );
|
|
vector unsigned char storePerm2 = vec_lvsr( 0, jointPtr2 );
|
|
vector unsigned char storePerm3 = vec_lvsr( 0, jointPtr3 );
|
|
vector unsigned char storePerm4 = vec_lvsr( 0, jointPtr4 );
|
|
|
|
// right rotate the input data
|
|
vecToX = vec_perm( vecToX, vecToX, storePerm1 );
|
|
vecToY = vec_perm( vecToY, vecToY, storePerm2 );
|
|
vecToZ = vec_perm( vecToZ, vecToZ, storePerm3 );
|
|
vecToW = vec_perm( vecToW, vecToW, storePerm4 );
|
|
|
|
vec_ste( vecToX, 0, (float*) jointPtr );
|
|
vec_ste( vecToX, 4, (float*) jointPtr );
|
|
vec_ste( vecToX, 8, (float*) jointPtr );
|
|
vec_ste( vecToX, 12, (float*) jointPtr );
|
|
|
|
vec_ste( vecToY, 0, (float*) jointPtr2 );
|
|
vec_ste( vecToY, 4, (float*) jointPtr2 );
|
|
vec_ste( vecToY, 8, (float*) jointPtr2 );
|
|
vec_ste( vecToY, 12, (float*) jointPtr2 );
|
|
|
|
vec_ste( vecToZ, 0, (float*) jointPtr3 );
|
|
vec_ste( vecToZ, 4, (float*) jointPtr3 );
|
|
vec_ste( vecToZ, 8, (float*) jointPtr3 );
|
|
vec_ste( vecToZ, 12, (float*) jointPtr3 );
|
|
|
|
vec_ste( vecToW, 0, (float*) jointPtr4 );
|
|
vec_ste( vecToW, 4, (float*) jointPtr4 );
|
|
vec_ste( vecToW, 8, (float*) jointPtr4 );
|
|
vec_ste( vecToW, 12, (float*) jointPtr4 );
|
|
|
|
// lerp is v1 + l * ( v2 - v1 );
|
|
// the idVec3 T is going to be 12 bytes after the Q, so we can do this without calling ToFloatPtr() again. since its
|
|
float *jointVecPtr = (float*)( jointPtr + 4 );
|
|
float *jointVecPtr2 = (float*)( jointPtr2 + 4 );
|
|
float *jointVecPtr3 = (float*)( jointPtr3 + 4 );
|
|
float *jointVecPtr4 = (float*)( jointPtr4 + 4 );
|
|
|
|
v0 = vec_ld( 0, jointVecPtr );
|
|
v1 = vec_ld( 11, jointVecPtr );
|
|
vector float vecLd1 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, jointVecPtr ), (vector unsigned char)(1) ) );
|
|
|
|
v2 = vec_ld( 0, jointVecPtr2 );
|
|
v3 = vec_ld( 11, jointVecPtr2 );
|
|
vector float vecLd2 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, jointVecPtr2 ), (vector unsigned char)(1) ) );
|
|
|
|
v4 = vec_ld( 0, jointVecPtr3 );
|
|
v5 = vec_ld( 11, jointVecPtr3 );
|
|
vector float vecLd3 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, jointVecPtr3 ), (vector unsigned char)(1) ) );
|
|
|
|
v6 = vec_ld( 0, jointVecPtr4 );
|
|
v7 = vec_ld( 11, jointVecPtr4 );
|
|
vector float vecLd4 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, jointVecPtr4 ), (vector unsigned char)(1) ) );
|
|
|
|
vector float vecVecX, vecVecY, vecVecZ;
|
|
vecVecX = vecVecY = vecVecZ = zeroVector;
|
|
|
|
// planarize
|
|
v0 = vec_mergeh( vecLd1, vecLd3 );
|
|
v1 = vec_mergeh( vecLd2, vecLd4 );
|
|
v3 = vec_mergel( vecLd1, vecLd3 );
|
|
v4 = vec_mergel( vecLd2, vecLd4 );
|
|
|
|
vecVecX = vec_mergeh( v0, v1 );
|
|
vecVecY = vec_mergel( v0, v1 );
|
|
vecVecZ = vec_mergeh( v3, v4 );
|
|
|
|
// load blend joint idvec3's
|
|
float *blendVecPtr = (float*)( blendPtr + 4 );
|
|
float *blendVecPtr2 =(float*)( blendPtr2 + 4 );
|
|
float *blendVecPtr3 = (float*)( blendPtr3 + 4 );
|
|
float *blendVecPtr4 = (float*)( blendPtr4 + 4 );
|
|
|
|
v0 = vec_ld( 0, blendVecPtr );
|
|
v1 = vec_ld( 11, blendVecPtr );
|
|
vector float vecLd5 = vec_perm( v0, v1, vec_add( vec_lvsl( -1, blendVecPtr ), (vector unsigned char)(1) ) );
|
|
|
|
v2 = vec_ld( 0, blendVecPtr2 );
|
|
v3 = vec_ld( 11, blendVecPtr2 );
|
|
vector float vecLd6 = vec_perm( v2, v3, vec_add( vec_lvsl( -1, blendVecPtr2 ), (vector unsigned char)(1) ) );
|
|
|
|
v4 = vec_ld( 0, blendVecPtr3 );
|
|
v5 = vec_ld( 11, blendVecPtr3 );
|
|
vector float vecLd7 = vec_perm( v4, v5, vec_add( vec_lvsl( -1, blendVecPtr3 ), (vector unsigned char)(1) ) );
|
|
|
|
v6 = vec_ld( 0, blendVecPtr4 );
|
|
v7 = vec_ld( 11, blendVecPtr4 );
|
|
vector float vecLd8 = vec_perm( v6, v7, vec_add( vec_lvsl( -1, blendVecPtr4 ), (vector unsigned char)(1) ) );
|
|
|
|
vector float vecBlendX, vecBlendY, vecBlendZ;
|
|
vecBlendX = vecBlendY = vecBlendZ = zeroVector;
|
|
|
|
// planarize
|
|
v0 = vec_mergeh( vecLd5, vecLd7 );
|
|
v1 = vec_mergeh( vecLd6, vecLd8 );
|
|
v3 = vec_mergel( vecLd5, vecLd7 );
|
|
v4 = vec_mergel( vecLd6, vecLd8 );
|
|
|
|
vecBlendX = vec_mergeh( v0, v1 );
|
|
vecBlendY = vec_mergel( v0, v1 );
|
|
vecBlendZ = vec_mergeh( v3, v4 );
|
|
|
|
// do subtraction
|
|
vecWork1 = vec_sub( vecBlendX, vecVecX );
|
|
vecWork2 = vec_sub( vecBlendY, vecVecY );
|
|
vecWork3 = vec_sub( vecBlendZ, vecVecZ );
|
|
|
|
// multiply by lerp and add to v1
|
|
vecVecX = vec_madd( vecWork1, vecLerp, vecVecX );
|
|
vecVecY = vec_madd( vecWork2, vecLerp, vecVecY );
|
|
vecVecZ = vec_madd( vecWork3, vecLerp, vecVecZ );
|
|
|
|
// put it back in original form
|
|
v0 = vec_mergeh( vecVecX, vecVecZ );
|
|
v1 = vec_mergeh( vecVecY, zeroVector );
|
|
v3 = vec_mergel( vecVecX, vecVecZ );
|
|
v4 = vec_mergel( vecVecY, zeroVector );
|
|
|
|
// generate vectors to store
|
|
vecWork1 = vec_mergeh( v0, v1 );
|
|
vecWork2 = vec_mergel( v0, v1 );
|
|
vecWork3 = vec_mergeh( v3, v4 );
|
|
vector float vecWork4 = vec_mergel( v3, v4 );
|
|
|
|
// store the T values
|
|
storePerm1 = vec_lvsr( 0, jointVecPtr );
|
|
storePerm2 = vec_lvsr( 0, jointVecPtr2 );
|
|
storePerm3 = vec_lvsr( 0, jointVecPtr3 );
|
|
storePerm4 = vec_lvsr( 0, jointVecPtr4 );
|
|
|
|
// right rotate the input data
|
|
vecWork1 = vec_perm( vecWork1, vecWork1, storePerm1 );
|
|
vecWork2 = vec_perm( vecWork2, vecWork2, storePerm2 );
|
|
vecWork3 = vec_perm( vecWork3, vecWork3, storePerm3 );
|
|
vecWork4 = vec_perm( vecWork4, vecWork4, storePerm4 );
|
|
|
|
vec_ste( vecWork1, 0, (float*) jointVecPtr );
|
|
vec_ste( vecWork1, 4, (float*) jointVecPtr );
|
|
vec_ste( vecWork1, 8, (float*) jointVecPtr );
|
|
|
|
vec_ste( vecWork2, 0, (float*) jointVecPtr2 );
|
|
vec_ste( vecWork2, 4, (float*) jointVecPtr2 );
|
|
vec_ste( vecWork2, 8, (float*) jointVecPtr2 );
|
|
|
|
vec_ste( vecWork3, 0, (float*) jointVecPtr3 );
|
|
vec_ste( vecWork3, 4, (float*) jointVecPtr3 );
|
|
vec_ste( vecWork3, 8, (float*) jointVecPtr3 );
|
|
|
|
vec_ste( vecWork4, 0, (float*) jointVecPtr4 );
|
|
vec_ste( vecWork4, 4, (float*) jointVecPtr4 );
|
|
vec_ste( vecWork4, 8, (float*) jointVecPtr4 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numJoints; i++ ) {
|
|
int j = index[i];
|
|
joints[j].q.Slerp( joints[j].q, blendJoints[j].q, lerp );
|
|
joints[j].t.Lerp( joints[j].t, blendJoints[j].t, lerp );
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ConvertJointQuatsToJointMats
|
|
============
|
|
*/
|
|
|
|
// SSE doesn't vectorize this, and I don't think we should either. Its mainly just copying data, there's very little math involved and
|
|
// it's not easily parallelizable
|
|
void VPCALL idSIMD_AltiVec::ConvertJointQuatsToJointMats( idJointMat *jointMats, const idJointQuat *jointQuats, const int numJoints ) {
|
|
|
|
for ( int i = 0; i < numJoints; i++ ) {
|
|
|
|
const float *q = jointQuats[i].q.ToFloatPtr();
|
|
float *m = jointMats[i].ToFloatPtr();
|
|
|
|
m[0*4+3] = q[4];
|
|
m[1*4+3] = q[5];
|
|
m[2*4+3] = q[6];
|
|
|
|
float x2 = q[0] + q[0];
|
|
float y2 = q[1] + q[1];
|
|
float z2 = q[2] + q[2];
|
|
|
|
{
|
|
float xx = q[0] * x2;
|
|
float yy = q[1] * y2;
|
|
float zz = q[2] * z2;
|
|
|
|
m[0*4+0] = 1.0f - yy - zz;
|
|
m[1*4+1] = 1.0f - xx - zz;
|
|
m[2*4+2] = 1.0f - xx - yy;
|
|
}
|
|
|
|
{
|
|
float yz = q[1] * z2;
|
|
float wx = q[3] * x2;
|
|
|
|
m[2*4+1] = yz - wx;
|
|
m[1*4+2] = yz + wx;
|
|
}
|
|
|
|
{
|
|
float xy = q[0] * y2;
|
|
float wz = q[3] * z2;
|
|
|
|
m[1*4+0] = xy - wz;
|
|
m[0*4+1] = xy + wz;
|
|
}
|
|
|
|
{
|
|
float xz = q[0] * z2;
|
|
float wy = q[3] * y2;
|
|
|
|
m[0*4+2] = xz - wy;
|
|
m[2*4+0] = xz + wy;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::ConvertJointMatsToJointQuats
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::ConvertJointMatsToJointQuats( idJointQuat *jointQuats, const idJointMat *jointMats, const int numJoints ) {
|
|
|
|
int index;
|
|
|
|
// Since we use very little of the data we have to pull in for the altivec version, we end up with
|
|
// a lot of wasted math. Rather than try to force it to use altivec, I wrote an optimized version
|
|
// of InvSqrt for the G5, and made it use that instead. With only this change, we get a little
|
|
// bigger than 50% speedup, which is not too shabby. Should really replace idMath::InvSqrt with
|
|
// my function so everyone can benefit on G5.
|
|
|
|
for ( index = 0; index < numJoints; index++ ) {
|
|
|
|
idJointQuat jq;
|
|
float trace;
|
|
float s;
|
|
float t;
|
|
int i;
|
|
int j;
|
|
int k;
|
|
|
|
static int next[3] = { 1, 2, 0 };
|
|
|
|
float *mat = (float*)( jointMats[index].ToFloatPtr() );
|
|
trace = mat[0 * 4 + 0] + mat[1 * 4 + 1] + mat[2 * 4 + 2];
|
|
|
|
if ( trace > 0.0f ) {
|
|
|
|
t = trace + 1.0f;
|
|
//s = idMath::InvSqrt( t ) * 0.5f;
|
|
s = FastScalarInvSqrt( t ) * 0.5f;
|
|
|
|
jq.q[3] = s * t;
|
|
jq.q[0] = ( mat[1 * 4 + 2] - mat[2 * 4 + 1] ) * s;
|
|
jq.q[1] = ( mat[2 * 4 + 0] - mat[0 * 4 + 2] ) * s;
|
|
jq.q[2] = ( mat[0 * 4 + 1] - mat[1 * 4 + 0] ) * s;
|
|
|
|
} else {
|
|
|
|
i = 0;
|
|
if ( mat[1 * 4 + 1] > mat[0 * 4 + 0] ) {
|
|
i = 1;
|
|
}
|
|
if ( mat[2 * 4 + 2] > mat[i * 4 + i] ) {
|
|
i = 2;
|
|
}
|
|
j = next[i];
|
|
k = next[j];
|
|
|
|
t = ( mat[i * 4 + i] - ( mat[j * 4 + j] + mat[k * 4 + k] ) ) + 1.0f;
|
|
//s = idMath::InvSqrt( t ) * 0.5f;
|
|
s = FastScalarInvSqrt( t ) * 0.5f;
|
|
|
|
jq.q[i] = s * t;
|
|
jq.q[3] = ( mat[j * 4 + k] - mat[k * 4 + j] ) * s;
|
|
jq.q[j] = ( mat[i * 4 + j] + mat[j * 4 + i] ) * s;
|
|
jq.q[k] = ( mat[i * 4 + k] + mat[k * 4 + i] ) * s;
|
|
}
|
|
|
|
jq.t[0] = mat[0 * 4 + 3];
|
|
jq.t[1] = mat[1 * 4 + 3];
|
|
jq.t[2] = mat[2 * 4 + 3];
|
|
jointQuats[index] = jq;
|
|
}
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::TransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::TransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
|
|
int i;
|
|
#if 0
|
|
for( i = firstJoint; i <= lastJoint; i++ ) {
|
|
assert( parents[i] < i );
|
|
jointMats[i] *= jointMats[parents[i]];
|
|
}
|
|
#else
|
|
|
|
// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
|
|
// on what the parents array looks like. This is true in the test code.
|
|
for ( i = firstJoint; i <= lastJoint; i++ ) {
|
|
assert( parents[i] < i );
|
|
float *jointPtr = jointMats[i].ToFloatPtr();
|
|
float *parentPtr = jointMats[parents[i]].ToFloatPtr();
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
// we need to load up 12 float elements that make up the Mat
|
|
v0 = vec_ld( 0, jointPtr );
|
|
v1 = vec_ld( 15, jointPtr );
|
|
v2 = vec_ld( 31, jointPtr );
|
|
v3 = vec_ld( 47, jointPtr );
|
|
|
|
// load parents
|
|
v4 = vec_ld( 0, parentPtr );
|
|
v5 = vec_ld( 15, parentPtr );
|
|
v6 = vec_ld( 31, parentPtr );
|
|
v7 = vec_ld( 47, parentPtr );
|
|
|
|
// permute into vectors
|
|
vector float vecJointMat1 = vec_perm( v0, v1, permVec );
|
|
vector float vecJointMat2 = vec_perm( v1, v2, permVec );
|
|
vector float vecJointMat3 = vec_perm( v2, v3, permVec );
|
|
|
|
vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
|
|
vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
|
|
vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
|
|
|
|
vector float zero = (vector float)(0);
|
|
vector float C1, C2, C3;
|
|
|
|
// matrix multiply
|
|
C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero ); // m(0 to 3) * a(0)
|
|
C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat2, 0 ), zero ); // m(4 to 7) * a(4)
|
|
C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat3, 0 ), zero ); // m(8 to 11) * a(8)
|
|
|
|
C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat1, 1 ), C1 ); // add in m(4 to 7) * a(1)
|
|
C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 ); // add in m(4 to 7) * a(5)
|
|
C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat3, 1 ), C3 ); // add in m(4 to 7) * a(9)
|
|
|
|
C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat1, 2 ), C1 );
|
|
C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat2, 2 ), C2 );
|
|
C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
|
|
|
|
// do the addition at the end
|
|
vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
|
|
C1 = vec_add( C1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
|
|
C2 = vec_add( C2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
|
|
C3 = vec_add( C3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
|
|
|
|
// store results
|
|
UNALIGNED_STORE3( (float*) jointPtr, C1, C2, C3 );
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UntransformJoints
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::UntransformJoints( idJointMat *jointMats, const int *parents, const int firstJoint, const int lastJoint ) {
|
|
int i;
|
|
#if 0
|
|
for( i = lastJoint; i >= firstJoint; i-- ) {
|
|
assert( parents[i] < i );
|
|
jointMats[i] /= jointMats[parents[i]];
|
|
}
|
|
#else
|
|
// I don't think you can unroll this since the next iteration of the loop might depending on the previous iteration, depending
|
|
// on what the parents array looks like. This is true in the test code.
|
|
for ( i = lastJoint; i >= firstJoint; i-- ) {
|
|
assert( parents[i] < i );
|
|
float *jointPtr = jointMats[i].ToFloatPtr();
|
|
float *parentPtr = jointMats[parents[i]].ToFloatPtr();
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, jointPtr ), (vector unsigned char)(1) );
|
|
vector unsigned char permVec2 = vec_add( vec_lvsl( -1, parentPtr ), (vector unsigned char)(1) );
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
// we need to load up 12 float elements that make up the Mat
|
|
v0 = vec_ld( 0, jointPtr );
|
|
v1 = vec_ld( 15, jointPtr );
|
|
v2 = vec_ld( 31, jointPtr );
|
|
v3 = vec_ld( 47, jointPtr );
|
|
|
|
// load parents
|
|
v4 = vec_ld( 0, parentPtr );
|
|
v5 = vec_ld( 15, parentPtr );
|
|
v6 = vec_ld( 31, parentPtr );
|
|
v7 = vec_ld( 47, parentPtr );
|
|
|
|
// permute into vectors
|
|
vector float vecJointMat1 = vec_perm( v0, v1, permVec );
|
|
vector float vecJointMat2 = vec_perm( v1, v2, permVec );
|
|
vector float vecJointMat3 = vec_perm( v2, v3, permVec );
|
|
|
|
vector float vecParentMat1 = vec_perm( v4, v5, permVec2 );
|
|
vector float vecParentMat2 = vec_perm( v5, v6, permVec2 );
|
|
vector float vecParentMat3 = vec_perm( v6, v7, permVec2 );
|
|
|
|
vector float zero = (vector float)(0);
|
|
vector float C1, C2, C3;
|
|
|
|
// do subtraction at the beginning
|
|
vector unsigned char permZeroAndLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,28,29,30,31);
|
|
vecJointMat1 = vec_sub( vecJointMat1, vec_perm( zero, vecParentMat1, permZeroAndLast ) );
|
|
vecJointMat2 = vec_sub( vecJointMat2, vec_perm( zero, vecParentMat2, permZeroAndLast ) );
|
|
vecJointMat3 = vec_sub( vecJointMat3, vec_perm( zero, vecParentMat3, permZeroAndLast ) );
|
|
|
|
// matrix multiply
|
|
C1 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 0 ), zero );
|
|
C2 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 1 ), zero );
|
|
C3 = vec_madd( vecJointMat1, vec_splat( vecParentMat1, 2 ), zero );
|
|
|
|
C1 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 0 ), C1 );
|
|
C2 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 1 ), C2 );
|
|
C3 = vec_madd( vecJointMat2, vec_splat( vecParentMat2, 2 ), C3 );
|
|
|
|
C1 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 0 ), C1 );
|
|
C2 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 1 ), C2 );
|
|
C3 = vec_madd( vecJointMat3, vec_splat( vecParentMat3, 2 ), C3 );
|
|
|
|
// store results back
|
|
vector unsigned char storePerm = vec_lvsr( 0, jointPtr );
|
|
|
|
// right rotate the input data
|
|
C1 = vec_perm( C1, C1, storePerm );
|
|
C2 = vec_perm( C2, C2, storePerm );
|
|
C3 = vec_perm( C3, C3, storePerm );
|
|
|
|
vec_ste( C1, 0, (float*) jointPtr );
|
|
vec_ste( C1, 4, (float*) jointPtr );
|
|
vec_ste( C1, 8, (float*) jointPtr );
|
|
vec_ste( C1, 12, (float*) jointPtr );
|
|
|
|
vec_ste( C2, 16, (float*) jointPtr );
|
|
vec_ste( C2, 20, (float*) jointPtr );
|
|
vec_ste( C2, 24, (float*) jointPtr );
|
|
vec_ste( C2, 28, (float*) jointPtr );
|
|
|
|
vec_ste( C3, 32, (float*) jointPtr );
|
|
vec_ste( C3, 36, (float*) jointPtr );
|
|
vec_ste( C3, 40, (float*) jointPtr );
|
|
vec_ste( C3, 44, (float*) jointPtr );
|
|
}
|
|
|
|
#endif
|
|
}
|
|
|
|
#endif /* LIVE_VICARIOUSLY */
|
|
|
|
#ifdef ENABLE_CULL
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::TracePointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
byte tOr;
|
|
tOr = 0;
|
|
|
|
// pointers
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
|
|
vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
|
|
vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
|
|
vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
|
|
vector unsigned char vecPerm;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
vector float zeroVector = (vector float)(0);
|
|
vector float vecRadius;
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
|
|
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
|
|
vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
|
|
vector unsigned int vecTotals;
|
|
vector unsigned int tempIntSum;
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
|
|
// populate planes
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 4 );
|
|
v3 = vec_ld( 15, planePtr + 4 );
|
|
vecPlane1 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 8 );
|
|
v1 = vec_ld( 15, planePtr + 8 );
|
|
vecPlane2 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 12 );
|
|
v3 = vec_ld( 15, planePtr + 12 );
|
|
vecPlane3 = vec_perm( v2, v3, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane2 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane3 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane2 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane3 );
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
// load constants
|
|
vecRadius = loadSplatUnalignedScalar( &radius );
|
|
|
|
unsigned int cullBitVal[4];
|
|
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
|
|
int i = 0;
|
|
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < numVerts ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
|
|
vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
|
|
vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
|
|
vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
|
|
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
|
|
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
|
|
vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
|
|
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
|
|
vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
|
|
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
|
|
vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
|
|
|
|
// vec1Sum1 now holds d0, d1, d2, d3. calculate the
|
|
// difference with +radius and -radius
|
|
vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
|
|
vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
|
|
vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
|
|
vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
|
|
vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
|
|
vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
|
|
vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
|
|
vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
|
|
|
|
// do compare
|
|
vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
|
|
vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
|
|
vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
|
|
vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
|
|
vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
|
|
vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
|
|
vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
vecCmp3 = vec_and( vecCmp3, oneIntVector );
|
|
vecCmp4 = vec_and( vecCmp4, oneIntVector );
|
|
vecCmp5 = vec_and( vecCmp5, oneIntVector );
|
|
vecCmp6 = vec_and( vecCmp6, oneIntVector );
|
|
vecCmp7 = vec_and( vecCmp7, oneIntVector );
|
|
vecCmp8 = vec_and( vecCmp8, oneIntVector );
|
|
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
|
|
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
|
|
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
|
|
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
|
|
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
|
|
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
|
|
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
|
|
|
|
// OR (add) them all together
|
|
vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
|
|
vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
|
|
vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
|
|
vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
|
|
|
|
vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
|
|
vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
|
|
tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_mergeh( vecTotals, tempIntSum );
|
|
tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
|
|
tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
|
|
|
|
// store out results
|
|
vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
|
|
tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
|
|
vec_ste( tempSt, 0, &cullBitVal[0] );
|
|
vec_ste( tempSt, 4, &cullBitVal[0] );
|
|
vec_ste( tempSt, 8, &cullBitVal[0] );
|
|
vec_ste( tempSt, 12, &cullBitVal[0] );
|
|
|
|
tOr |= cullBitVal[0];
|
|
tOr |= cullBitVal[1];
|
|
tOr |= cullBitVal[2];
|
|
tOr |= cullBitVal[3];
|
|
|
|
cullBits[i] = cullBitVal[0];
|
|
cullBits[i+1] = cullBitVal[1];
|
|
cullBits[i+2] = cullBitVal[2];
|
|
cullBits[i+3] = cullBitVal[3];
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, t;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
|
|
t = d0 + radius;
|
|
bits = FLOATSIGNBITSET( t ) << 0;
|
|
t = d1 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 1;
|
|
t = d2 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 2;
|
|
t = d3 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 3;
|
|
|
|
t = d0 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 4;
|
|
t = d1 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 5;
|
|
t = d2 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 6;
|
|
t = d3 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 7;
|
|
|
|
bits ^= 0x0F; // flip lower four bits
|
|
|
|
tOr |= bits;
|
|
cullBits[i] = bits;
|
|
}
|
|
|
|
totalOr = tOr;
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::TracePointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::TracePointCull( byte *cullBits, byte &totalOr, const float radius, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
byte tOr;
|
|
tOr = 0;
|
|
|
|
// pointers
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
|
|
vector unsigned int vecShift1 = (vector unsigned int)(0,1,2,3);
|
|
vector unsigned int vecShift2 = (vector unsigned int)(4,5,6,7);
|
|
vector unsigned int vecFlipBits = (vector unsigned int)(0x0F);
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
|
|
vector unsigned char vecPerm;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
vector float zeroVector = (vector float)(0);
|
|
vector float vecRadius;
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector float vec1Sum1, vec1Sum2, vec1Sum3, vec1Sum4;
|
|
vector unsigned char vecPermLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
|
|
vector unsigned char vecPermHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector float vecDPlusRadius1, vecDPlusRadius2, vecDPlusRadius3, vecDPlusRadius4;
|
|
vector float vecDMinusRadius1, vecDMinusRadius2, vecDMinusRadius3, vecDMinusRadius4;
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4, vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
|
|
vector unsigned int vecTotals;
|
|
vector unsigned int tempIntSum;
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
|
|
// populate planes
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 4 );
|
|
v3 = vec_ld( 15, planePtr + 4 );
|
|
vecPlane1 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 8 );
|
|
v1 = vec_ld( 15, planePtr + 8 );
|
|
vecPlane2 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 12 );
|
|
v3 = vec_ld( 15, planePtr + 12 );
|
|
vecPlane3 = vec_perm( v2, v3, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane2 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane3 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane2 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane3 );
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
// load constants
|
|
vecRadius = loadSplatUnalignedScalar( &radius );
|
|
|
|
unsigned int cullBitVal[4];
|
|
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
|
|
int i = 0;
|
|
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
vecXYZ1 = vec_ld( 0, vertPtr );
|
|
vecXYZ2 = vec_ld( 0, vertPtr2 );
|
|
vecXYZ3 = vec_ld( 0, vertPtr3 );
|
|
vecXYZ4 = vec_ld( 0, vertPtr4 );
|
|
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
|
|
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
|
|
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec1Sum2 );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec1Sum2 );
|
|
vec1Sum2 = vec_add( vec1Sum2, vecPlane3 );
|
|
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec1Sum3 );
|
|
vec1Sum3 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec1Sum3 );
|
|
vec1Sum3 = vec_add( vec1Sum3, vecPlane3 );
|
|
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec1Sum4 );
|
|
vec1Sum4 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec1Sum4 );
|
|
vec1Sum4 = vec_add( vec1Sum4, vecPlane3 );
|
|
|
|
// vec1Sum1 now holds d0, d1, d2, d3. calculate the
|
|
// difference with +radius and -radius
|
|
vecDPlusRadius1 = vec_add( vec1Sum1, vecRadius );
|
|
vecDMinusRadius1 = vec_sub( vec1Sum1, vecRadius );
|
|
vecDPlusRadius2 = vec_add( vec1Sum2, vecRadius );
|
|
vecDMinusRadius2 = vec_sub( vec1Sum2, vecRadius );
|
|
vecDPlusRadius3 = vec_add( vec1Sum3, vecRadius );
|
|
vecDMinusRadius3 = vec_sub( vec1Sum3, vecRadius );
|
|
vecDPlusRadius4 = vec_add( vec1Sum4, vecRadius );
|
|
vecDMinusRadius4 = vec_sub( vec1Sum4, vecRadius );
|
|
|
|
// do compare
|
|
vecCmp1 = vec_cmplt( vecDPlusRadius1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vecDMinusRadius1, zeroVector );
|
|
vecCmp3 = vec_cmplt( vecDPlusRadius2, zeroVector );
|
|
vecCmp4 = vec_cmplt( vecDMinusRadius2, zeroVector );
|
|
vecCmp5 = vec_cmplt( vecDPlusRadius3, zeroVector );
|
|
vecCmp6 = vec_cmplt( vecDMinusRadius3, zeroVector );
|
|
vecCmp7 = vec_cmplt( vecDPlusRadius4, zeroVector );
|
|
vecCmp8 = vec_cmplt( vecDMinusRadius4, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
vecCmp3 = vec_and( vecCmp3, oneIntVector );
|
|
vecCmp4 = vec_and( vecCmp4, oneIntVector );
|
|
vecCmp5 = vec_and( vecCmp5, oneIntVector );
|
|
vecCmp6 = vec_and( vecCmp6, oneIntVector );
|
|
vecCmp7 = vec_and( vecCmp7, oneIntVector );
|
|
vecCmp8 = vec_and( vecCmp8, oneIntVector );
|
|
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
|
|
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
|
|
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
|
|
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
|
|
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
|
|
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
|
|
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
|
|
|
|
// OR (add) them all together
|
|
vecBitShifted1 = vec_add( vecBitShifted1, vecBitShifted2 );
|
|
vecBitShifted3 = vec_add( vecBitShifted3, vecBitShifted4 );
|
|
vecBitShifted5 = vec_add( vecBitShifted5, vecBitShifted6 );
|
|
vecBitShifted7 = vec_add( vecBitShifted7, vecBitShifted8 );
|
|
|
|
vecTotals = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
|
|
vecTotals = vec_add( vecTotals, vec_sld( vecTotals, vecTotals, 4 ) );
|
|
tempIntSum = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_mergeh( vecTotals, tempIntSum );
|
|
tempIntSum = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermHalves );
|
|
tempIntSum = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
|
|
tempIntSum = vec_add( tempIntSum, vec_sld( tempIntSum, tempIntSum, 4 ) );
|
|
vecTotals = vec_perm( vecTotals, tempIntSum, vecPermLast );
|
|
|
|
// store out results
|
|
vector unsigned int tempSt = vec_xor( vecTotals, vecFlipBits );
|
|
tempSt = vec_perm( tempSt, tempSt, cullBitPerm );
|
|
vec_ste( tempSt, 0, &cullBitVal[0] );
|
|
vec_ste( tempSt, 4, &cullBitVal[0] );
|
|
vec_ste( tempSt, 8, &cullBitVal[0] );
|
|
vec_ste( tempSt, 12, &cullBitVal[0] );
|
|
|
|
tOr |= cullBitVal[0];
|
|
tOr |= cullBitVal[1];
|
|
tOr |= cullBitVal[2];
|
|
tOr |= cullBitVal[3];
|
|
|
|
cullBits[i] = cullBitVal[0];
|
|
cullBits[i+1] = cullBitVal[1];
|
|
cullBits[i+2] = cullBitVal[2];
|
|
cullBits[i+3] = cullBitVal[3];
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, t;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
|
|
t = d0 + radius;
|
|
bits = FLOATSIGNBITSET( t ) << 0;
|
|
t = d1 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 1;
|
|
t = d2 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 2;
|
|
t = d3 + radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 3;
|
|
|
|
t = d0 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 4;
|
|
t = d1 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 5;
|
|
t = d2 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 6;
|
|
t = d3 - radius;
|
|
bits |= FLOATSIGNBITSET( t ) << 7;
|
|
|
|
bits ^= 0x0F; // flip lower four bits
|
|
|
|
tOr |= bits;
|
|
cullBits[i] = bits;
|
|
}
|
|
|
|
totalOr = tOr;
|
|
}
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DecalPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
|
|
vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vecPerm;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
|
|
// populate planes
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 4 );
|
|
v3 = vec_ld( 15, planePtr + 4 );
|
|
vecPlane1 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 8 );
|
|
v1 = vec_ld( 15, planePtr + 8 );
|
|
vecPlane2 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 12 );
|
|
v3 = vec_ld( 15, planePtr + 12 );
|
|
vecPlane3 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 16 );
|
|
v1 = vec_ld( 15, planePtr + 16 );
|
|
vecPlane4 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 20 );
|
|
v3 = vec_ld( 15, planePtr + 20 );
|
|
vecPlane5 = vec_perm( v2, v3, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane2 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane3 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane2 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane3 );
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
v0 = vec_mergeh( vecPlane4, zeroVector );
|
|
v1 = vec_mergeh( vecPlane5, zeroVector );
|
|
v2 = vec_mergel( vecPlane4, zeroVector );
|
|
v3 = vec_mergel( vecPlane5, zeroVector );
|
|
|
|
vecPlane4 = vec_mergeh( v0, v1 );
|
|
vecPlane5 = vec_mergel( v0, v1 );
|
|
vecPlane6 = vec_mergeh( v2, v3 );
|
|
vecPlane7 = vec_mergel( v2, v3 );
|
|
|
|
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
|
|
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
|
|
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
|
|
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
|
|
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
|
|
vector unsigned int vecR1, vecR2, vecR3, vecR4;
|
|
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
unsigned int vBits[4];
|
|
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
|
|
|
|
i = 0;
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < numVerts ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
|
|
vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
|
|
vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
|
|
vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
|
|
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
|
|
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
|
|
vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
|
|
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
|
|
vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
|
|
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
|
|
vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
|
|
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
|
|
vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
|
|
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
|
|
vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
|
|
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
|
|
vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
|
|
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
|
|
vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
|
|
|
|
vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
|
|
vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
|
|
vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
|
|
vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
|
|
vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
|
|
vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
|
|
vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
vecCmp3 = vec_and( vecCmp3, oneIntVector );
|
|
vecCmp4 = vec_and( vecCmp4, oneIntVector );
|
|
vecCmp5 = vec_and( vecCmp5, oneIntVector );
|
|
vecCmp6 = vec_and( vecCmp6, oneIntVector );
|
|
vecCmp7 = vec_and( vecCmp7, oneIntVector );
|
|
vecCmp8 = vec_and( vecCmp8, oneIntVector );
|
|
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
|
|
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
|
|
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
|
|
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
|
|
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
|
|
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
|
|
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
|
|
|
|
//OR them all together (this is the same as adding them, since they're all only 1 bit set)
|
|
vecR1 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
|
|
vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
|
|
vecR1 = vec_add(vecR1, vecBitShifted2 );
|
|
vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
|
|
|
|
vecR2 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
|
|
vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
|
|
vecR2 = vec_add(vecR2, vecBitShifted4 );
|
|
vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
|
|
|
|
vecR3 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
|
|
vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
|
|
vecR3 = vec_add(vecR3, vecBitShifted6 );
|
|
vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
|
|
|
|
vecR4 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
|
|
vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
|
|
vecR4 = vec_add(vecR4, vecBitShifted8 );
|
|
vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
|
|
|
|
// take the first element from each vector and put them into vecR1
|
|
vecR1 = vec_mergeh( vecR1, vecR2 );
|
|
vecR3 = vec_mergeh( vecR3, vecR4 );
|
|
vecR1 = vec_perm( vecR1, vecR3, permHalves );
|
|
|
|
// XOR with 0x3F to flip lower 6 bits
|
|
vecR1 = vec_xor( vecR1, vecFlipBits );
|
|
|
|
// store out results. don't have 16 at a time so let's just
|
|
// do this and avoid alignment concerns
|
|
vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
|
|
vec_ste( vecR1, 0, &vBits[0] );
|
|
vec_ste( vecR1, 4, &vBits[0] );
|
|
vec_ste( vecR1, 8, &vBits[0] );
|
|
vec_ste( vecR1, 12, &vBits[0] );
|
|
|
|
cullBits[i] = vBits[0];
|
|
cullBits[i+1] = vBits[1];
|
|
cullBits[i+2] = vBits[2];
|
|
cullBits[i+3] = vBits[3];
|
|
}
|
|
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, d4, d5;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
d4 = planes[4].Distance( v );
|
|
d5 = planes[5].Distance( v );
|
|
|
|
// they check if the sign bit is set by casting as long and shifting right 31 places.
|
|
bits = FLOATSIGNBITSET( d0 ) << 0;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 1;
|
|
bits |= FLOATSIGNBITSET( d2 ) << 2;
|
|
bits |= FLOATSIGNBITSET( d3 ) << 3;
|
|
bits |= FLOATSIGNBITSET( d4 ) << 4;
|
|
bits |= FLOATSIGNBITSET( d5 ) << 5;
|
|
|
|
cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DecalPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::DecalPointCull( byte *cullBits, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3, vecPlane4, vecPlane5, vecPlane6, vecPlane7;
|
|
vector float zeroVector = (vector float)(0.0);
|
|
vector unsigned char vecPerm;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
|
|
// populate planes
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 4 );
|
|
v3 = vec_ld( 15, planePtr + 4 );
|
|
vecPlane1 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 8 );
|
|
v1 = vec_ld( 15, planePtr + 8 );
|
|
vecPlane2 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 12 );
|
|
v3 = vec_ld( 15, planePtr + 12 );
|
|
vecPlane3 = vec_perm( v2, v3, vecPerm );
|
|
|
|
v0 = vec_ld( 0, planePtr + 16 );
|
|
v1 = vec_ld( 15, planePtr + 16 );
|
|
vecPlane4 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 0, planePtr + 20 );
|
|
v3 = vec_ld( 15, planePtr + 20 );
|
|
vecPlane5 = vec_perm( v2, v3, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane2 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane3 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane2 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane3 );
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
v0 = vec_mergeh( vecPlane4, zeroVector );
|
|
v1 = vec_mergeh( vecPlane5, zeroVector );
|
|
v2 = vec_mergel( vecPlane4, zeroVector );
|
|
v3 = vec_mergel( vecPlane5, zeroVector );
|
|
|
|
vecPlane4 = vec_mergeh( v0, v1 );
|
|
vecPlane5 = vec_mergel( v0, v1 );
|
|
vecPlane6 = vec_mergeh( v2, v3 );
|
|
vecPlane7 = vec_mergel( v2, v3 );
|
|
|
|
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector float vec1Sum1, vec1Sum2, vec2Sum1, vec2Sum2, vec3Sum1, vec3Sum2, vec4Sum1, vec4Sum2;
|
|
vector unsigned int vecShift1 = (vector unsigned int)(0, 1, 2, 3 );
|
|
vector unsigned int vecShift2 = (vector unsigned int)(4, 5, 0, 0 );
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp3, vecCmp4, vecCmp5, vecCmp6, vecCmp7, vecCmp8;
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted3, vecBitShifted4;
|
|
vector unsigned int vecBitShifted5, vecBitShifted6, vecBitShifted7, vecBitShifted8;
|
|
vector unsigned int vecFlipBits = (vector unsigned int)( 0x3F, 0x3F, 0x3F, 0x3F );
|
|
vector unsigned int vecR1, vecR2, vecR3, vecR4;
|
|
vector unsigned char permHalves = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
unsigned int vBits[4];
|
|
vector unsigned char vBitPerm = vec_lvsr( 0, &vBits[4] );
|
|
|
|
i = 0;
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane0, zeroVector );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane1, vec1Sum1 );
|
|
vec1Sum1 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane2, vec1Sum1 );
|
|
vec1Sum1 = vec_add( vec1Sum1, vecPlane3 );
|
|
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 0 ), vecPlane4, zeroVector );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 1 ), vecPlane5, vec1Sum2 );
|
|
vec1Sum2 = vec_madd( vec_splat( vecXYZ1, 2 ), vecPlane6, vec1Sum2 );
|
|
vec1Sum2 = vec_add( vec1Sum2, vecPlane7 );
|
|
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane0, zeroVector );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane1, vec2Sum1 );
|
|
vec2Sum1 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane2, vec2Sum1 );
|
|
vec2Sum1 = vec_add( vec2Sum1, vecPlane3 );
|
|
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 0 ), vecPlane4, zeroVector );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 1 ), vecPlane5, vec2Sum2 );
|
|
vec2Sum2 = vec_madd( vec_splat( vecXYZ2, 2 ), vecPlane6, vec2Sum2 );
|
|
vec2Sum2 = vec_add( vec2Sum2, vecPlane7 );
|
|
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane0, zeroVector );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane1, vec3Sum1 );
|
|
vec3Sum1 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane2, vec3Sum1 );
|
|
vec3Sum1 = vec_add( vec3Sum1, vecPlane3 );
|
|
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 0 ), vecPlane4, zeroVector );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 1 ), vecPlane5, vec3Sum2 );
|
|
vec3Sum2 = vec_madd( vec_splat( vecXYZ3, 2 ), vecPlane6, vec3Sum2 );
|
|
vec3Sum2 = vec_add( vec3Sum2, vecPlane7 );
|
|
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane0, zeroVector );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane1, vec4Sum1 );
|
|
vec4Sum1 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane2, vec4Sum1 );
|
|
vec4Sum1 = vec_add( vec4Sum1, vecPlane3 );
|
|
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 0 ), vecPlane4, zeroVector );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 1 ), vecPlane5, vec4Sum2 );
|
|
vec4Sum2 = vec_madd( vec_splat( vecXYZ4, 2 ), vecPlane6, vec4Sum2 );
|
|
vec4Sum2 = vec_add( vec4Sum2, vecPlane7 );
|
|
|
|
vecCmp1 = vec_cmplt( vec1Sum1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vec1Sum2, zeroVector );
|
|
vecCmp3 = vec_cmplt( vec2Sum1, zeroVector );
|
|
vecCmp4 = vec_cmplt( vec2Sum2, zeroVector );
|
|
vecCmp5 = vec_cmplt( vec3Sum1, zeroVector );
|
|
vecCmp6 = vec_cmplt( vec3Sum2, zeroVector );
|
|
vecCmp7 = vec_cmplt( vec4Sum1, zeroVector );
|
|
vecCmp8 = vec_cmplt( vec4Sum2, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
vecCmp3 = vec_and( vecCmp3, oneIntVector );
|
|
vecCmp4 = vec_and( vecCmp4, oneIntVector );
|
|
vecCmp5 = vec_and( vecCmp5, oneIntVector );
|
|
vecCmp6 = vec_and( vecCmp6, oneIntVector );
|
|
vecCmp7 = vec_and( vecCmp7, oneIntVector );
|
|
vecCmp8 = vec_and( vecCmp8, oneIntVector );
|
|
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift1 );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift2 );
|
|
vecBitShifted3 = vec_sl( (vector unsigned int)vecCmp3, vecShift1 );
|
|
vecBitShifted4 = vec_sl( (vector unsigned int)vecCmp4, vecShift2 );
|
|
vecBitShifted5 = vec_sl( (vector unsigned int)vecCmp5, vecShift1 );
|
|
vecBitShifted6 = vec_sl( (vector unsigned int)vecCmp6, vecShift2 );
|
|
vecBitShifted7 = vec_sl( (vector unsigned int)vecCmp7, vecShift1 );
|
|
vecBitShifted8 = vec_sl( (vector unsigned int)vecCmp8, vecShift2 );
|
|
|
|
//OR them all together (this is the same as adding them, since they're all only 1 bit set)
|
|
vecR1 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR1 = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 8 ) );
|
|
vecR1 = vec_add( vecR1, vec_sld( vecR1, vecR1, 4 ) );
|
|
vecR1 = vec_add(vecR1, vecBitShifted2 );
|
|
vecR1 = vec_or( vecR1, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
|
|
|
|
vecR2 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR2 = vec_add( vecBitShifted3, vec_sld( vecBitShifted3, vecBitShifted3, 8 ) );
|
|
vecR2 = vec_add( vecR2, vec_sld( vecR2, vecR2, 4 ) );
|
|
vecR2 = vec_add(vecR2, vecBitShifted4 );
|
|
vecR2 = vec_or( vecR2, vec_sld( vecBitShifted4, vecBitShifted4, 4 ) );
|
|
|
|
vecR3 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR3 = vec_add( vecBitShifted5, vec_sld( vecBitShifted5, vecBitShifted5, 8 ) );
|
|
vecR3 = vec_add( vecR3, vec_sld( vecR3, vecR3, 4 ) );
|
|
vecR3 = vec_add(vecR3, vecBitShifted6 );
|
|
vecR3 = vec_or( vecR3, vec_sld( vecBitShifted6, vecBitShifted6, 4 ) );
|
|
|
|
vecR4 = (vector unsigned int)(0); //zeroIntVector;
|
|
vecR4 = vec_add( vecBitShifted7, vec_sld( vecBitShifted7, vecBitShifted7, 8 ) );
|
|
vecR4 = vec_add( vecR4, vec_sld( vecR4, vecR4, 4 ) );
|
|
vecR4 = vec_add(vecR4, vecBitShifted8 );
|
|
vecR4 = vec_or( vecR4, vec_sld( vecBitShifted8, vecBitShifted8, 4 ) );
|
|
|
|
// take the first element from each vector and put them into vecR1
|
|
vecR1 = vec_mergeh( vecR1, vecR2 );
|
|
vecR3 = vec_mergeh( vecR3, vecR4 );
|
|
vecR1 = vec_perm( vecR1, vecR3, permHalves );
|
|
|
|
// XOR with 0x3F to flip lower 6 bits
|
|
vecR1 = vec_xor( vecR1, vecFlipBits );
|
|
|
|
// store out results. don't have 16 at a time so let's just
|
|
// do this and avoid alignment concerns
|
|
vecR1 = vec_perm( vecR1, vecR1, vBitPerm );
|
|
vec_ste( vecR1, 0, &vBits[0] );
|
|
vec_ste( vecR1, 4, &vBits[0] );
|
|
vec_ste( vecR1, 8, &vBits[0] );
|
|
vec_ste( vecR1, 12, &vBits[0] );
|
|
|
|
cullBits[i] = vBits[0];
|
|
cullBits[i+1] = vBits[1];
|
|
cullBits[i+2] = vBits[2];
|
|
cullBits[i+3] = vBits[3];
|
|
}
|
|
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1, d2, d3, d4, d5;
|
|
const idVec3 &v = verts[i].xyz;
|
|
|
|
d0 = planes[0].Distance( v );
|
|
d1 = planes[1].Distance( v );
|
|
d2 = planes[2].Distance( v );
|
|
d3 = planes[3].Distance( v );
|
|
d4 = planes[4].Distance( v );
|
|
d5 = planes[5].Distance( v );
|
|
|
|
// they check if the sign bit is set by casting as long and shifting right 31 places.
|
|
bits = FLOATSIGNBITSET( d0 ) << 0;
|
|
bits |= FLOATSIGNBITSET( d1 ) << 1;
|
|
bits |= FLOATSIGNBITSET( d2 ) << 2;
|
|
bits |= FLOATSIGNBITSET( d3 ) << 3;
|
|
bits |= FLOATSIGNBITSET( d4 ) << 4;
|
|
bits |= FLOATSIGNBITSET( d5 ) << 5;
|
|
|
|
cullBits[i] = bits ^ 0x3F; // flip lower 6 bits
|
|
}
|
|
}
|
|
|
|
|
|
#endif /*DRAWVERT_PADDED */
|
|
|
|
#ifndef DRAWVERT_PADDED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::OverlayPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
|
|
float p0x, p0y, p0z, p0d;
|
|
float p1x, p1y, p1z, p1d;
|
|
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
const float *vertPtr = verts[0].xyz.ToFloatPtr();
|
|
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
vector unsigned char vecPerm;
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
p0x = *(planePtr + 0);
|
|
p0y = *(planePtr + 1);
|
|
p0z = *(planePtr + 2);
|
|
p0d = *(planePtr + 3);
|
|
p1x = *(planePtr + 4);
|
|
p1y = *(planePtr + 5);
|
|
p1z = *(planePtr + 6);
|
|
p1d = *(planePtr + 7);
|
|
|
|
// populate the planes
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 31, planePtr );
|
|
vecPlane1 = vec_perm( v1, v2, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane0 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane1 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane0 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane1);
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector float oneVector = (vector float)(1);
|
|
|
|
vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
|
|
vector float negTwoVector = (vector float)(-2);
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
|
|
vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
|
|
vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
|
|
vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
unsigned int cullBitVal[4];
|
|
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
|
|
|
|
i = 0;
|
|
// every fourth one will have the same alignment. Make sure we've got enough here
|
|
if ( i+3 < numVerts ) {
|
|
vertPerm1 = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm2 = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm3 = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
vertPerm4 = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
|
|
}
|
|
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
v0 = vec_ld( 0, vertPtr );
|
|
v1 = vec_ld( 15, vertPtr );
|
|
v2 = vec_ld( 0, vertPtr2 );
|
|
v3 = vec_ld( 15, vertPtr2 );
|
|
v4 = vec_ld( 0, vertPtr3 );
|
|
v5 = vec_ld( 15, vertPtr3 );
|
|
v6 = vec_ld( 0, vertPtr4 );
|
|
v7 = vec_ld( 15, vertPtr4 );
|
|
|
|
vecXYZ1 = vec_perm( v0, v1, vertPerm1 );
|
|
vecXYZ2 = vec_perm( v2, v3, vertPerm2 );
|
|
vecXYZ3 = vec_perm( v4, v5, vertPerm3 );
|
|
vecXYZ4 = vec_perm( v6, v7, vertPerm4 );
|
|
|
|
// like a splat, but only doing halves
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
|
|
vecSum1 = vec_add( vecSum1, vecPlane3 );
|
|
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
|
|
vecSum2 = vec_add( vecSum2, vecPlane3 );
|
|
|
|
// store out results
|
|
UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
|
|
|
|
// bit manipulation
|
|
vecCmp1 = vec_cmplt( vecSum1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vecSum2, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
|
|
// store out and write to cullBits
|
|
// finally, a use for algebra! 1-x = x + 1 - 2x
|
|
vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
|
|
vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
|
|
vecSum1Inv = vec_add( vecSum1Inv, oneVector );
|
|
vecSum2Inv = vec_add( vecSum2Inv, oneVector );
|
|
|
|
// do the same comparisons for the inverted d0/d1
|
|
vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
|
|
vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
|
|
vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
|
|
|
|
// shift them as needed
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
|
|
vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
|
|
vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
|
|
|
|
// OR them all together. since only 1 bit is set for each value, thats
|
|
// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
|
|
vector unsigned int vecResult;
|
|
vector unsigned int vecResult2;
|
|
vector unsigned int vecResult3;
|
|
vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
|
|
|
|
vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
|
|
|
|
// vecResult now holds the values without the inverses yet, so add those
|
|
vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
|
|
vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
|
|
vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
|
|
vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
|
|
|
|
vecResult = vec_add( vecResult, vecResult2 );
|
|
|
|
//store out results
|
|
vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
|
|
vec_ste( vecResult, 0, &cullBitVal[0] );
|
|
vec_ste( vecResult, 4, &cullBitVal[0] );
|
|
vec_ste( vecResult, 8, &cullBitVal[0] );
|
|
vec_ste( vecResult, 12, &cullBitVal[0] );
|
|
|
|
cullBits[i] = cullBitVal[0];
|
|
cullBits[i+1] = cullBitVal[1];
|
|
cullBits[i+2] = cullBitVal[2];
|
|
cullBits[i+3] = cullBitVal[3];
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1;
|
|
float vx, vy, vz;
|
|
|
|
vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
|
|
vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
|
|
vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
|
|
|
|
d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
|
|
d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
|
|
texCoords[i][0] = d0;
|
|
texCoords[i][1] = d1;
|
|
|
|
bits = ( d0 >= 0 ) ? 0 : 1;
|
|
d0 = 1.0f - d0;
|
|
bits |= ( d1 >= 0 ) ? 0 : 1*2;
|
|
d1 = 1.0f - d1;
|
|
|
|
bits |= ( d0 >= 0 ) ? 0: 1*4;
|
|
bits |= ( d1 >= 0 ) ? 0: 1*8;
|
|
|
|
cullBits[i] = bits;
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::OverlayPointCull
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::OverlayPointCull( byte *cullBits, idVec2 *texCoords, const idPlane *planes, const idDrawVert *verts, const int numVerts ) {
|
|
|
|
// idDrawVert size
|
|
assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
|
|
|
|
int i;
|
|
|
|
float p0x, p0y, p0z, p0d;
|
|
float p1x, p1y, p1z, p1d;
|
|
|
|
const float *planePtr = planes[0].ToFloatPtr();
|
|
const float *vertPtr = verts[0].xyz.ToFloatPtr();
|
|
|
|
vector float vecPlane0, vecPlane1, vecPlane2, vecPlane3;
|
|
vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
vector unsigned char vecPerm;
|
|
vector float zeroVector = (vector float)(0);
|
|
|
|
p0x = *(planePtr + 0);
|
|
p0y = *(planePtr + 1);
|
|
p0z = *(planePtr + 2);
|
|
p0d = *(planePtr + 3);
|
|
p1x = *(planePtr + 4);
|
|
p1y = *(planePtr + 5);
|
|
p1z = *(planePtr + 6);
|
|
p1d = *(planePtr + 7);
|
|
|
|
// populate the planes
|
|
vecPerm = vec_add( vec_lvsl( -1, planePtr ), (vector unsigned char)(1) );
|
|
v0 = vec_ld( 0, planePtr );
|
|
v1 = vec_ld( 15, planePtr );
|
|
vecPlane0 = vec_perm( v0, v1, vecPerm );
|
|
|
|
v2 = vec_ld( 31, planePtr );
|
|
vecPlane1 = vec_perm( v1, v2, vecPerm );
|
|
|
|
// transpose
|
|
v0 = vec_mergeh( vecPlane0, vecPlane0 );
|
|
v1 = vec_mergeh( vecPlane1, vecPlane1 );
|
|
v2 = vec_mergel( vecPlane0, vecPlane0 );
|
|
v3 = vec_mergel( vecPlane1, vecPlane1);
|
|
|
|
vecPlane0 = vec_mergeh( v0, v1 );
|
|
vecPlane1 = vec_mergel( v0, v1 );
|
|
vecPlane2 = vec_mergeh( v2, v3 );
|
|
vecPlane3 = vec_mergel( v2, v3 );
|
|
|
|
vector float vecXYZ1, vecXYZ2, vecXYZ3, vecXYZ4;
|
|
vector float oneVector = (vector float)(1);
|
|
|
|
vector float vecSum1, vecSum2, vecSum1Inv,vecSum2Inv;
|
|
|
|
vector bool int vecCmp1, vecCmp2, vecCmp1Inv, vecCmp2Inv;
|
|
vector float negTwoVector = (vector float)(-2);
|
|
vector unsigned int vecBitShifted1, vecBitShifted2, vecBitShifted1Inv, vecBitShifted2Inv;
|
|
vector unsigned int vecShift = (vector unsigned int)( 0, 1, 0, 1 );
|
|
vector unsigned int vecShiftInv = (vector unsigned int)( 2, 3, 2, 3 );
|
|
vector unsigned char vecPermFirstThird = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);
|
|
vector bool int oneIntVector = (vector bool int)(1);
|
|
vector unsigned char vertPerm1, vertPerm2, vertPerm3, vertPerm4;
|
|
unsigned int cullBitVal[4];
|
|
vector unsigned char cullBitPerm = vec_lvsr( 0, &cullBitVal[0] );
|
|
|
|
i = 0;
|
|
|
|
for ( ; i+3 < numVerts; i+=4 ) {
|
|
const float *vertPtr = verts[i].xyz.ToFloatPtr();
|
|
const float *vertPtr2 = verts[i+1].xyz.ToFloatPtr();
|
|
const float *vertPtr3 = verts[i+2].xyz.ToFloatPtr();
|
|
const float *vertPtr4 = verts[i+3].xyz.ToFloatPtr();
|
|
|
|
vecXYZ1 = vec_ld( 0, vertPtr );
|
|
vecXYZ2 = vec_ld( 0, vertPtr2 );
|
|
vecXYZ3 = vec_ld( 0, vertPtr3 );
|
|
vecXYZ4 = vec_ld( 0, vertPtr4 );
|
|
|
|
// like a splat, but only doing halves
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum1 );
|
|
vecSum1 = vec_madd( vec_perm( vecXYZ1, vecXYZ2, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum1 );
|
|
vecSum1 = vec_add( vecSum1, vecPlane3 );
|
|
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(0,1,2,3,0,1,2,3,16,17,18,19,16,17,18,19) ), vecPlane0, zeroVector );
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(4,5,6,7,4,5,6,7,20,21,22,23,20,21,22,23) ) , vecPlane1, vecSum2 );
|
|
vecSum2 = vec_madd( vec_perm( vecXYZ3, vecXYZ4, (vector unsigned char)(8,9,10,11,8,9,10,11,24,25,26,27,24,25,26,27) ), vecPlane2, vecSum2 );
|
|
vecSum2 = vec_add( vecSum2, vecPlane3 );
|
|
|
|
// store out results
|
|
UNALIGNED_STORE2( &texCoords[i][0], vecSum1, vecSum2 );
|
|
|
|
// bit manipulation
|
|
vecCmp1 = vec_cmplt( vecSum1, zeroVector );
|
|
vecCmp2 = vec_cmplt( vecSum2, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1 = vec_and( vecCmp1, oneIntVector );
|
|
vecCmp2 = vec_and( vecCmp2, oneIntVector );
|
|
|
|
// store out and write to cullBits
|
|
// finally, a use for algebra! 1-x = x + 1 - 2x
|
|
vecSum1Inv = vec_madd( vecSum1, negTwoVector, vecSum1 );
|
|
vecSum2Inv = vec_madd( vecSum2, negTwoVector, vecSum2 );
|
|
vecSum1Inv = vec_add( vecSum1Inv, oneVector );
|
|
vecSum2Inv = vec_add( vecSum2Inv, oneVector );
|
|
|
|
// do the same comparisons for the inverted d0/d1
|
|
vecCmp1Inv = vec_cmplt( vecSum1Inv, zeroVector );
|
|
vecCmp2Inv = vec_cmplt( vecSum2Inv, zeroVector );
|
|
|
|
//and it with 1 so we multiply by 1 not 1111's
|
|
vecCmp1Inv = vec_and( vecCmp1Inv, oneIntVector );
|
|
vecCmp2Inv = vec_and( vecCmp2Inv, oneIntVector );
|
|
|
|
// shift them as needed
|
|
vecBitShifted1 = vec_sl( (vector unsigned int)vecCmp1, vecShift );
|
|
vecBitShifted2 = vec_sl( (vector unsigned int)vecCmp2, vecShift );
|
|
vecBitShifted1Inv = vec_sl( (vector unsigned int)vecCmp1Inv, vecShiftInv );
|
|
vecBitShifted2Inv = vec_sl( (vector unsigned int)vecCmp2Inv, vecShiftInv );
|
|
|
|
// OR them all together. since only 1 bit is set for each value, thats
|
|
// the same as adding them. add up d0 + d1 + d0Inv + d1Inv
|
|
vector unsigned int vecResult;
|
|
vector unsigned int vecResult2;
|
|
vector unsigned int vecResult3;
|
|
vecResult = vec_add( vecBitShifted1, vec_sld( vecBitShifted1, vecBitShifted1, 4 ) );
|
|
|
|
vecResult2 = vec_add( vecBitShifted2, vec_sld( vecBitShifted2, vecBitShifted2, 4 ) );
|
|
|
|
// vecResult now holds the values without the inverses yet, so add those
|
|
vecResult = vec_perm( vecResult, vecResult2, vecPermFirstThird );
|
|
vecResult2 = vec_add( vecBitShifted1Inv, vec_sld( vecBitShifted1Inv, vecBitShifted1Inv, 4 ) );
|
|
vecResult3 = vec_add( vecBitShifted2Inv, vec_sld( vecBitShifted2Inv, vecBitShifted2Inv, 4 ) );
|
|
vecResult2 = vec_perm( vecResult2, vecResult3, vecPermFirstThird );
|
|
|
|
vecResult = vec_add( vecResult, vecResult2 );
|
|
|
|
//store out results
|
|
vecResult = vec_perm( vecResult, vecResult, cullBitPerm );
|
|
vec_ste( vecResult, 0, &cullBitVal[0] );
|
|
vec_ste( vecResult, 4, &cullBitVal[0] );
|
|
vec_ste( vecResult, 8, &cullBitVal[0] );
|
|
vec_ste( vecResult, 12, &cullBitVal[0] );
|
|
|
|
cullBits[i] = cullBitVal[0];
|
|
cullBits[i+1] = cullBitVal[1];
|
|
cullBits[i+2] = cullBitVal[2];
|
|
cullBits[i+3] = cullBitVal[3];
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numVerts; i++ ) {
|
|
byte bits;
|
|
float d0, d1;
|
|
float vx, vy, vz;
|
|
|
|
vx = *( vertPtr + (i*DRAWVERT_OFFSET) + 0 );
|
|
vy = *( vertPtr + (i*DRAWVERT_OFFSET) + 1 );
|
|
vz = *( vertPtr + (i*DRAWVERT_OFFSET) + 2 );
|
|
|
|
d0 = p0x * vx + p0y * vy + p0z * vz + p0d;
|
|
d1 = p1x * vx + p1y * vy + p1z * vz + p1d;
|
|
texCoords[i][0] = d0;
|
|
texCoords[i][1] = d1;
|
|
|
|
bits = ( d0 >= 0 ) ? 0 : 1;
|
|
d0 = 1.0f - d0;
|
|
bits |= ( d1 >= 0 ) ? 0 : 1*2;
|
|
d1 = 1.0f - d1;
|
|
|
|
bits |= ( d0 >= 0 ) ? 0: 1*4;
|
|
bits |= ( d1 >= 0 ) ? 0: 1*8;
|
|
|
|
cullBits[i] = bits;
|
|
}
|
|
}
|
|
|
|
|
|
#endif /* DRAWVERT_PADDED */
|
|
|
|
#endif /* ENABLE_CULL */
|
|
|
|
#ifdef ENABLE_DERIVE
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::DeriveTriPlanes
|
|
|
|
Derives a plane equation for each triangle.
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::DeriveTriPlanes( idPlane *planes, const idDrawVert *verts, const int numVerts, const int *indexes, const int numIndexes ) {
	// Builds one plane per triangle. Triangle t is formed by the vertices
	// verts[indexes[3*t+0]], verts[indexes[3*t+1]] and verts[indexes[3*t+2]]; its
	// plane is written to planes[t] as the normalized normal followed by the
	// distance term -(normal . a), where a is the triangle's first vertex.
	//
	//   planes     - output, one idPlane per triangle (numIndexes / 3 entries written)
	//   verts      - vertex array the indexes refer to
	//   numVerts   - not referenced by this implementation
	//   indexes    - triangle index list, three entries per triangle
	//   numIndexes - number of entries in indexes
	//
	// The main loop handles four triangles (12 indexes) per iteration; whatever is
	// left over is handled by the scalar loop at the bottom.

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
	// idPlane size
	assert( sizeof(idPlane) == PLANE_OFFSET * sizeof(float) );
	int i;

	vector float vecD0, vecD1, vecD2, vecD3, vecD4, vecD5, vecD6, vecD7;
	vector float vecVertA, vecVertB, vecVertC;
	vector float vecVertA2, vecVertB2, vecVertC2;
	vector float vecVertA3, vecVertB3, vecVertC3;
	vector float vecVertA4, vecVertB4, vecVertC4;

	vector float vecN, vecN2, vecN3, vecN4;
	vector float vecWork1, vecWork2, vecWork3, vecWork4, vecWork5, vecWork6, vecWork7, vecWork8;
	// ( x, y, z, w ) -> ( y, z, x, w )
	vector unsigned char vecPerm1 = (vector unsigned char)(4,5,6,7,8,9,10,11,0,1,2,3,12,13,14,15);
	// ( x, y, z, w ) -> ( z, x, y, w )
	vector unsigned char vecPerm2 = (vector unsigned char)(8,9,10,11,0,1,2,3,4,5,6,7,12,13,14,15);
	vector float vecF;
	vector float vecF1, vecF2, vecF3, vecF4;
	vector float zeroVector = (vector float)(0);
	vector float vecNegOne = (vector float)(-1);
	vector float vecSecondHalf, vecFirstHalf, vecSecondHalf2, vecFirstHalf2, vecSecondHalf3, vecFirstHalf3, vecFirstHalf4, vecSecondHalf4;

	vector unsigned char vecPermA, vecPermA2, vecPermA3, vecPermA4;
	vector unsigned char vecPermB, vecPermB2, vecPermB3, vecPermB4;
	vector unsigned char vecPermC, vecPermC2, vecPermC3, vecPermC4;

	vector unsigned char oneVector = (vector unsigned char)(1);
	vector float vecLd1, vecLd2, vecLd3, vecLd4, vecLd5, vecLd6;
	// keeps the first three floats of the first operand and takes the fourth from the second
	vector unsigned char vecPermZeroLast = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);

	const float *xyzPtr = verts[0].xyz.ToFloatPtr();
	float *planePtr = planes[0].ToFloatPtr();

	int j;
	for ( j = 0, i = 0; i+11 < numIndexes; i += 12, j += 4 ) {

#ifndef DRAWVERT_PADDED
		// calculate permute vectors to load as needed. these are all
		// triangle indexes and are usually pretty close together but
		// not guaranteed to be in any particular order
		vecPermA = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC2 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC3 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermA4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermB4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) ), oneVector );
		vecPermC4 = vec_add( vec_lvsl( -1, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) ), oneVector );

		// load first A B C
		// each vertex is fetched as the two 16-byte blocks that can contain it and
		// then merged with the permute vector computed above
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

		vecVertA = vec_perm( vecLd1, vecLd2, vecPermA );
		vecVertB = vec_perm( vecLd3, vecLd4, vecPermB );
		vecVertC = vec_perm( vecLd5, vecLd6, vecPermC );

		// set the last element to 0
		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

		// load second A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

		vecVertA2 = vec_perm( vecLd1, vecLd2, vecPermA2 );
		vecVertB2 = vec_perm( vecLd3, vecLd4, vecPermB2 );
		vecVertC2 = vec_perm( vecLd5, vecLd6, vecPermC2 );

		// set the last element to 0
		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );

		// load third A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

		vecVertA3 = vec_perm( vecLd1, vecLd2, vecPermA3 );
		vecVertB3 = vec_perm( vecLd3, vecLd4, vecPermB3 );
		vecVertC3 = vec_perm( vecLd5, vecLd6, vecPermC3 );

		// set the last element to 0. this has to be done on the third triangle's
		// own vectors: the fourth lane takes part in the subtraction, the cross
		// product and the dot product below, so it must not carry the float that
		// happens to follow xyz in memory
		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );

		// load the fourth A B C
		vecLd1 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecLd2 = vec_ld( 15, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecLd3 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecLd4 = vec_ld( 15, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecLd5 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );
		vecLd6 = vec_ld( 15, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );

		vecVertA4 = vec_perm( vecLd1, vecLd2, vecPermA4 );
		vecVertB4 = vec_perm( vecLd3, vecLd4, vecPermB4 );
		vecVertC4 = vec_perm( vecLd5, vecLd6, vecPermC4 );

		// set the last element to 0
		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#else
		// NOTE(review): the single vec_ld per vertex presumably relies on padded
		// idDrawVerts starting on 16-byte boundaries - confirm against the allocator

		// load first A B C
		vecVertA = vec_ld( 0, xyzPtr + ( indexes[i+0] * DRAWVERT_OFFSET ) );
		vecVertB = vec_ld( 0, xyzPtr + ( indexes[i+1] * DRAWVERT_OFFSET ) );
		vecVertC = vec_ld( 0, xyzPtr + ( indexes[i+2] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA = vec_perm( vecVertA, zeroVector, vecPermZeroLast );
		vecVertB = vec_perm( vecVertB, zeroVector, vecPermZeroLast );
		vecVertC = vec_perm( vecVertC, zeroVector, vecPermZeroLast );

		// load second A B C
		vecVertA2 = vec_ld( 0, xyzPtr + ( indexes[i+3] * DRAWVERT_OFFSET ) );
		vecVertB2 = vec_ld( 0, xyzPtr + ( indexes[i+4] * DRAWVERT_OFFSET ) );
		vecVertC2 = vec_ld( 0, xyzPtr + ( indexes[i+5] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA2 = vec_perm( vecVertA2, zeroVector, vecPermZeroLast );
		vecVertB2 = vec_perm( vecVertB2, zeroVector, vecPermZeroLast );
		vecVertC2 = vec_perm( vecVertC2, zeroVector, vecPermZeroLast );

		// load third A B C
		vecVertA3 = vec_ld( 0, xyzPtr + ( indexes[i+6] * DRAWVERT_OFFSET ) );
		vecVertB3 = vec_ld( 0, xyzPtr + ( indexes[i+7] * DRAWVERT_OFFSET ) );
		vecVertC3 = vec_ld( 0, xyzPtr + ( indexes[i+8] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA3 = vec_perm( vecVertA3, zeroVector, vecPermZeroLast );
		vecVertB3 = vec_perm( vecVertB3, zeroVector, vecPermZeroLast );
		vecVertC3 = vec_perm( vecVertC3, zeroVector, vecPermZeroLast );

		// load the fourth A B C
		vecVertA4 = vec_ld( 0, xyzPtr + ( indexes[i+9] * DRAWVERT_OFFSET ) );
		vecVertB4 = vec_ld( 0, xyzPtr + ( indexes[i+10] * DRAWVERT_OFFSET ) );
		vecVertC4 = vec_ld( 0, xyzPtr + ( indexes[i+11] * DRAWVERT_OFFSET ) );

		// set the last element to 0
		vecVertA4 = vec_perm( vecVertA4, zeroVector, vecPermZeroLast );
		vecVertB4 = vec_perm( vecVertB4, zeroVector, vecPermZeroLast );
		vecVertC4 = vec_perm( vecVertC4, zeroVector, vecPermZeroLast );
#endif
		// calculate d0 and d1 for each
		vecD0 = vec_sub( vecVertB, vecVertA );
		vecD1 = vec_sub( vecVertC, vecVertA );

		vecD2 = vec_sub( vecVertB2, vecVertA2 );
		vecD3 = vec_sub( vecVertC2, vecVertA2 );

		vecD4 = vec_sub( vecVertB3, vecVertA3 );
		vecD5 = vec_sub( vecVertC3, vecVertA3 );

		vecD6 = vec_sub( vecVertB4, vecVertA4 );
		vecD7 = vec_sub( vecVertC4, vecVertA4 );

		// cross product n = d1 x d0, computed as
		// ( d1.yzx * d0.zxy ) - ( d0.yzx * d1.zxy )
		vecWork1 = vec_perm( vecD0, vecD0, vecPerm1 );
		vecWork2 = vec_perm( vecD1, vecD1, vecPerm2 );
		vecWork3 = vec_perm( vecD2, vecD2, vecPerm1 );
		vecWork4 = vec_perm( vecD3, vecD3, vecPerm2 );
		vecWork5 = vec_perm( vecD4, vecD4, vecPerm1 );
		vecWork6 = vec_perm( vecD5, vecD5, vecPerm2 );
		vecWork7 = vec_perm( vecD6, vecD6, vecPerm1 );
		vecWork8 = vec_perm( vecD7, vecD7, vecPerm2 );

		vecSecondHalf = vec_madd( vecWork1, vecWork2, zeroVector );
		vecSecondHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
		vecSecondHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
		vecSecondHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

		vecWork1 = vec_perm( vecD1, vecD1, vecPerm1 );
		vecWork2 = vec_perm( vecD0, vecD0, vecPerm2 );
		vecWork3 = vec_perm( vecD3, vecD3, vecPerm1 );
		vecWork4 = vec_perm( vecD2, vecD2, vecPerm2 );
		vecWork5 = vec_perm( vecD5, vecD5, vecPerm1 );
		vecWork6 = vec_perm( vecD4, vecD4, vecPerm2 );
		vecWork7 = vec_perm( vecD7, vecD7, vecPerm1 );
		vecWork8 = vec_perm( vecD6, vecD6, vecPerm2 );

		vecFirstHalf = vec_madd( vecWork1, vecWork2, zeroVector );
		vecFirstHalf2 = vec_madd( vecWork3, vecWork4, zeroVector );
		vecFirstHalf3 = vec_madd( vecWork5, vecWork6, zeroVector );
		vecFirstHalf4 = vec_madd( vecWork7, vecWork8, zeroVector );

		vecN = vec_madd( vecSecondHalf, vecNegOne, vecFirstHalf );
		vecN2 = vec_madd( vecSecondHalf2, vecNegOne, vecFirstHalf2 );
		vecN3 = vec_madd( vecSecondHalf3, vecNegOne, vecFirstHalf3 );
		vecN4 = vec_madd( vecSecondHalf4, vecNegOne, vecFirstHalf4 );

		// transpose vecNs
		// afterwards vecN holds the four x components, vecN2 the y, vecN3 the z
		// and vecN4 the fourth lanes
		vector float v0, v1, v2, v3;
		v0 = vec_mergeh( vecN, vecN3 );
		v1 = vec_mergeh( vecN2, vecN4 );
		v2 = vec_mergel( vecN, vecN3 );
		v3 = vec_mergel( vecN2, vecN4 );

		vecN = vec_mergeh( v0, v1 );
		vecN2 = vec_mergel( v0, v1 );
		vecN3 = vec_mergeh( v2, v3 );
		vecN4 = vec_mergel( v2, v3 );

		// squared length of each of the four normals
		vecF = vec_madd( vecN, vecN, zeroVector );
		vecF = vec_madd( vecN2, vecN2, vecF );
		vecF = vec_madd( vecN3, vecN3, vecF );

		// ReciprocalSquareRoot is defined elsewhere in this file; presumably a
		// per-lane 1/sqrt - confirm its precision against the scalar path
		vecF = ReciprocalSquareRoot( vecF );

		// scale the normals to unit length
		vecF1 = vec_madd( vecF, vecN, zeroVector );
		vecF2 = vec_madd( vecF, vecN2, zeroVector );
		vecF3 = vec_madd( vecF, vecN3, zeroVector );
		vecF4 = vec_madd( vecF, vecN4, zeroVector );

		vector float v8, v9, v10, v11;
		v8 = vecF1;
		v9 = vecF2;
		v10 = vecF3;
		v11 = vecF4;

		// transpose vecVerts
		v0 = vec_mergeh( vecVertA, vecVertA3 );
		v1 = vec_mergeh( vecVertA2, vecVertA4 );
		v2 = vec_mergel( vecVertA, vecVertA3 );
		v3 = vec_mergel( vecVertA2, vecVertA4 );

		vecVertA = vec_mergeh( v0, v1 );
		vecVertA2 = vec_mergel( v0, v1 );
		vecVertA3 = vec_mergeh( v2, v3 );
		vecVertA4 = vec_mergel( v2, v3 );

		// distance term for each plane: -( a . normal )
		vector float vecTotals;
		vecTotals = vec_madd( vecVertA, v8, zeroVector );
		vecTotals = vec_madd( vecVertA2, v9, vecTotals );
		vecTotals = vec_madd( vecVertA3, v10, vecTotals );
		vecTotals = vec_madd( vecVertA4, v11, vecTotals );
		vecF = vec_madd( vecTotals, vecNegOne, zeroVector );

		// transpose vecFs
		// back to one ( nx, ny, nz, d ) vector per triangle
		v0 = vec_mergeh( vecF1, vecF3 );
		v1 = vec_mergeh( vecF2, vecF );
		v2 = vec_mergel( vecF1, vecF3 );
		v3 = vec_mergel( vecF2, vecF );

		vecF1 = vec_mergeh( v0, v1 );
		vecF2 = vec_mergel( v0, v1 );
		vecF3 = vec_mergeh( v2, v3 );
		vecF4 = vec_mergel( v2, v3 );

		// store results
		UNALIGNED_STORE4( planePtr + ( j * PLANE_OFFSET ), vecF1, vecF2, vecF3, vecF4 );
	}

	// cleanup
	for ( ; i < numIndexes; i += 3, j++ ) {
		const idDrawVert *a, *b, *c;
		float d0[3], d1[3], f;
		idVec3 n;

		a = verts + indexes[i + 0];
		b = verts + indexes[i + 1];
		c = verts + indexes[i + 2];

		d0[0] = b->xyz[0] - a->xyz[0];
		d0[1] = b->xyz[1] - a->xyz[1];
		d0[2] = b->xyz[2] - a->xyz[2];

		d1[0] = c->xyz[0] - a->xyz[0];
		d1[1] = c->xyz[1] - a->xyz[1];
		d1[2] = c->xyz[2] - a->xyz[2];

		n[0] = d1[1] * d0[2] - d1[2] * d0[1];
		n[1] = d1[2] * d0[0] - d1[0] * d0[2];
		n[2] = d1[0] * d0[1] - d1[1] * d0[0];

		f = FastScalarInvSqrt( n.x * n.x + n.y * n.y + n.z * n.z );
		//idMath::RSqrt( n.x * n.x + n.y * n.y + n.z * n.z );

		n.x *= f;
		n.y *= f;
		n.z *= f;

		planes[j].SetNormal( n );
		planes[j].FitThroughPoint( a->xyz );
	}
}
|
|
|
|
|
|
#endif /* ENABLE_DERIVE */
|
|
|
|
#ifdef ENABLE_CREATE
|
|
|
|
#if 1
|
|
|
|
#ifdef VERTEXCACHE_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
	// Expands each vertex position into two consecutive idVec4 entries:
	// vertexCache[2*i] = ( x, y, z, 1 ) and vertexCache[2*i+1] = ( x, y, z, 0 ).
	// Returns the number of idVec4 entries written, numVerts * 2.
	// This variant stores with vec_st directly, so vertexCache has to start on
	// a 16-byte boundary (asserted below).

	// vertexCache aligned
	assert( IS_16BYTE_ALIGNED( vertexCache[0] ) );
	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
	// idVec4 size
	assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );

	vector float xyzA, xyzB, xyzC, xyzD;
	vector float wZero = (vector float)(0.0);
	vector float wOne = (vector float)(1);
	// first three floats from the first operand, fourth float from the second operand
	vector unsigned char replaceW = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
	vector unsigned char alignA, alignB, alignC, alignD;
	int i = 0;

#ifndef DRAWVERT_PADDED
	// four unpadded vertices span a multiple of 16 bytes, so the alignment of
	// verts[k] repeats every four vertices and these permutes serve the whole loop
	if ( numVerts >= 4 ) {
		alignA = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		alignB = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		alignC = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		alignD = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}
#endif

	// four vertices in, eight idVec4s out per pass
	for ( ; i + 4 <= numVerts; i += 4 ) {
		const float *srcA = verts[i+0].xyz.ToFloatPtr();
		const float *srcB = verts[i+1].xyz.ToFloatPtr();
		const float *srcC = verts[i+2].xyz.ToFloatPtr();
		const float *srcD = verts[i+3].xyz.ToFloatPtr();

#ifndef DRAWVERT_PADDED
		// unaligned load: fetch both candidate blocks, then merge
		vector float loA = vec_ld( 0, srcA );
		vector float hiA = vec_ld( 15, srcA );
		vector float loB = vec_ld( 0, srcB );
		vector float hiB = vec_ld( 15, srcB );
		vector float loC = vec_ld( 0, srcC );
		vector float hiC = vec_ld( 15, srcC );
		vector float loD = vec_ld( 0, srcD );
		vector float hiD = vec_ld( 15, srcD );

		xyzA = vec_perm( loA, hiA, alignA );
		xyzB = vec_perm( loB, hiB, alignB );
		xyzC = vec_perm( loC, hiC, alignC );
		xyzD = vec_perm( loD, hiD, alignD );
#else
		xyzA = vec_ld( 0, srcA );
		xyzB = vec_ld( 0, srcB );
		xyzC = vec_ld( 0, srcC );
		xyzD = vec_ld( 0, srcD );
#endif

		// eight consecutive 16-byte entries starting at vertexCache[i*2]
		float *out = &vertexCache[i*2][0];

		vec_st( vec_perm( xyzA, wOne, replaceW ), 0, out );
		vec_st( vec_perm( xyzA, wZero, replaceW ), 16, out );
		vec_st( vec_perm( xyzB, wOne, replaceW ), 32, out );
		vec_st( vec_perm( xyzB, wZero, replaceW ), 48, out );
		vec_st( vec_perm( xyzC, wOne, replaceW ), 64, out );
		vec_st( vec_perm( xyzC, wZero, replaceW ), 80, out );
		vec_st( vec_perm( xyzD, wOne, replaceW ), 96, out );
		vec_st( vec_perm( xyzD, wZero, replaceW ), 112, out );
	}

	// scalar path for the last zero to three vertices
	for ( ; i < numVerts; i++ ) {
		const float *xyz = verts[i].xyz.ToFloatPtr();
		idVec4 &withOne = vertexCache[i*2+0];
		idVec4 &withZero = vertexCache[i*2+1];

		withOne[0] = xyz[0];
		withOne[1] = xyz[1];
		withOne[2] = xyz[2];
		withOne[3] = 1.0f;

		withZero[0] = xyz[0];
		withZero[1] = xyz[1];
		withZero[2] = xyz[2];
		withZero[3] = 0.0f;
	}
	return numVerts * 2;
}
|
|
|
|
#else
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::CreateShadowCache
|
|
============
|
|
*/
|
|
int VPCALL idSIMD_AltiVec::CreateShadowCache( idVec4 *vertexCache, const idDrawVert *verts, const int numVerts ) {
	// Expands each vertex position into two consecutive idVec4 entries:
	// vertexCache[2*i] = ( x, y, z, 1 ) and vertexCache[2*i+1] = ( x, y, z, 0 ).
	// Returns the number of idVec4 entries written, numVerts * 2.
	// This variant makes no alignment assumption about vertexCache; the vector
	// loop writes through a rotate / select sequence instead of plain stores.

	// idDrawVert size
	assert( sizeof(idDrawVert) == DRAWVERT_OFFSET * sizeof(float) );
	// idVec4 size
	assert( sizeof(idVec4) == IDVEC4_OFFSET * sizeof(float) );

	vector float xyzA, xyzB, xyzC, xyzD;
	vector float e0, e1, e2, e3, e4, e5, e6, e7;
	vector float wZero = (vector float)(0.0);
	vector float wOne = (vector float)(1);
	// first three floats from the first operand, fourth float from the second operand
	vector unsigned char replaceW = (vector unsigned char)(0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19);
	vector unsigned char alignA, alignB, alignC, alignD;
	int i = 0;

#ifndef DRAWVERT_PADDED
	// four unpadded vertices span a multiple of 16 bytes, so the alignment of
	// verts[k] repeats every four vertices and these permutes serve the whole loop
	if ( numVerts >= 4 ) {
		alignA = vec_add( vec_lvsl( -1, (float*) verts[0].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		alignB = vec_add( vec_lvsl( -1, (float*) verts[1].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		alignC = vec_add( vec_lvsl( -1, (float*) verts[2].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
		alignD = vec_add( vec_lvsl( -1, (float*) verts[3].xyz.ToFloatPtr() ), (vector unsigned char)(1) );
	}
#endif

	// four vertices in, eight idVec4s out per pass
	for ( ; i + 4 <= numVerts; i += 4 ) {
		const float *srcA = verts[i+0].xyz.ToFloatPtr();
		const float *srcB = verts[i+1].xyz.ToFloatPtr();
		const float *srcC = verts[i+2].xyz.ToFloatPtr();
		const float *srcD = verts[i+3].xyz.ToFloatPtr();

#ifndef DRAWVERT_PADDED
		// unaligned load: fetch both candidate blocks, then merge
		vector float loA = vec_ld( 0, srcA );
		vector float hiA = vec_ld( 15, srcA );
		vector float loB = vec_ld( 0, srcB );
		vector float hiB = vec_ld( 15, srcB );
		vector float loC = vec_ld( 0, srcC );
		vector float hiC = vec_ld( 15, srcC );
		vector float loD = vec_ld( 0, srcD );
		vector float hiD = vec_ld( 15, srcD );

		xyzA = vec_perm( loA, hiA, alignA );
		xyzB = vec_perm( loB, hiB, alignB );
		xyzC = vec_perm( loC, hiC, alignC );
		xyzD = vec_perm( loD, hiD, alignD );
#else
		xyzA = vec_ld( 0, srcA );
		xyzB = vec_ld( 0, srcB );
		xyzC = vec_ld( 0, srcC );
		xyzD = vec_ld( 0, srcD );
#endif

		// the eight output entries in memory order
		e0 = vec_perm( xyzA, wOne, replaceW );
		e1 = vec_perm( xyzA, wZero, replaceW );
		e2 = vec_perm( xyzB, wOne, replaceW );
		e3 = vec_perm( xyzB, wZero, replaceW );
		e4 = vec_perm( xyzC, wOne, replaceW );
		e5 = vec_perm( xyzC, wZero, replaceW );
		e6 = vec_perm( xyzD, wOne, replaceW );
		e7 = vec_perm( xyzD, wZero, replaceW );

		// unaligned store setup: a right-rotate permute for the destination
		// offset, the matching byte mask, and the existing contents of the
		// first and last 16-byte blocks the stores below touch
		float *out = &vertexCache[i*2][0];
		vector unsigned char rotate = vec_sub( vec_lvsr( 15, out ), (vector unsigned char)(1) );
		vector unsigned int keep = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), rotate );
		vector float before = vec_ld( 0, out );
		vector float after = vec_ld( 127, out );

		// right rotate input data
		e0 = vec_perm( e0, e0, rotate );
		e1 = vec_perm( e1, e1, rotate );
		e2 = vec_perm( e2, e2, rotate );
		e3 = vec_perm( e3, e3, rotate );
		e4 = vec_perm( e4, e4, rotate );
		e5 = vec_perm( e5, e5, rotate );
		e6 = vec_perm( e6, e6, rotate );
		e7 = vec_perm( e7, e7, rotate );

		// every block written is stitched from the tail of one rotated entry
		// and the head of the next; the two end blocks are merged with the
		// data that was already in memory
		vec_st( vec_sel( before, e0, keep ), 0, out );
		vec_st( vec_sel( e0, e1, keep ), 15, out );
		vec_st( vec_sel( e1, e2, keep ), 31, out );
		vec_st( vec_sel( e2, e3, keep ), 47, out );
		vec_st( vec_sel( e3, e4, keep ), 63, out );
		vec_st( vec_sel( e4, e5, keep ), 79, out );
		vec_st( vec_sel( e5, e6, keep ), 95, out );
		vec_st( vec_sel( e6, e7, keep ), 111, out );
		vec_st( vec_sel( e7, after, keep ), 127, out );
	}

	// scalar path for the last zero to three vertices
	for ( ; i < numVerts; i++ ) {
		const float *xyz = verts[i].xyz.ToFloatPtr();
		idVec4 &withOne = vertexCache[i*2+0];
		idVec4 &withZero = vertexCache[i*2+1];

		withOne[0] = xyz[0];
		withOne[1] = xyz[1];
		withOne[2] = xyz[2];
		withOne[3] = 1.0f;

		withZero[0] = xyz[0];
		withZero[1] = xyz[1];
		withZero[2] = xyz[2];
		withZero[3] = 0.0f;
	}
	return numVerts * 2;
}
|
|
|
|
#endif /* VERTEXCACHE_ALIGNED */
|
|
|
|
#endif /* 0 to kill VP shader cache */
|
|
|
|
#endif /* ENABLE_CREATE */
|
|
|
|
#ifdef ENABLE_SOUND_ROUTINES
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSamplePCMTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
Assumes that dest starts at aligned address
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
|
|
|
|
// dest is aligned
|
|
assert( IS_16BYTE_ALIGNED( dest[0] ) );
|
|
|
|
vector signed short vs0, vs1;
|
|
register vector signed int vi0, vi1;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
|
|
// permute vectors
|
|
register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
|
|
register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
|
|
|
|
register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
// If this can be assumed true, we can eliminate another conditional that checks to see if we can
|
|
// load up a vector before the loop
|
|
assert( numSamples >= 12 );
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// 8 at a time
|
|
int i = 0;
|
|
|
|
vector signed short vsOld = vec_ld( 0, &src[i] );
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
|
|
|
|
for ( ; i+7 < numSamples; i+= 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// permute into vectors in the order to store
|
|
|
|
v2 = vec_splat( v0, 0 );
|
|
v3 = vec_splat( v0, 1 );
|
|
v4 = vec_splat( v0, 2 );
|
|
v5 = vec_splat( v0, 3 );
|
|
v6 = vec_splat( v1, 0 );
|
|
v7 = vec_splat( v1, 1 );
|
|
v8 = vec_splat( v1, 2 );
|
|
v9 = vec_splat( v1, 3 );
|
|
|
|
// store results
|
|
ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
|
|
}
|
|
// cleanup
|
|
for (; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i = 0;
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( ; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = v2;
|
|
v4 = vec_perm( v0, v0, vecSecondHalf );
|
|
v5 = v4;
|
|
v6 = vec_perm( v1, v1, vecFirstHalf );
|
|
v7 = v6;
|
|
v8 = vec_perm (v1, v1, vecSecondHalf );
|
|
v9 = v8;
|
|
|
|
// store results
|
|
ALIGNED_STORE8( &dest[i*4], v2, v3, v4, v5, v6, v7, v8, v9 );
|
|
}
|
|
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
|
|
dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 0, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecBottom );
|
|
v3 = vec_perm( v0, v0, vecTop );
|
|
v4 = vec_perm( v1, v1, vecBottom );
|
|
v5 = vec_perm (v1, v1, vecTop );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = vec_perm( v0, v0, vecSecondHalf );
|
|
v4 = vec_perm( v1, v1, vecFirstHalf );
|
|
v5 = vec_perm (v1, v1, vecSecondHalf );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &dest[i*2], v2, v3, v4, v5 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
|
|
dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
|
|
//unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
|
|
//convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
|
|
//store results
|
|
ALIGNED_STORE2( &dest[i], v0, v1 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i] = (float) src[i];
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSamplePCMTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSamplePCMTo44kHz( float *dest, const short *src, const int numSamples, const int kHz, const int numChannels ) {
|
|
|
|
vector signed short vs0, vs1;
|
|
register vector signed int vi0, vi1;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9;
|
|
// permute vectors
|
|
register vector unsigned char vecFirstHalf = (vector unsigned char)(0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7);
|
|
register vector unsigned char vecSecondHalf = (vector unsigned char)(8,9,10,11,12,13,14,15,8,9,10,11,12,13,14,15);
|
|
|
|
register vector unsigned char vecBottom = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
register vector unsigned char vecTop = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
// calculate perm vector and masks for stores
|
|
vector unsigned char storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
|
|
// original values of dest
|
|
vector float vecDest = vec_ld( 0, &dest[0] );
|
|
vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// 8 at a time
|
|
int i = 0;
|
|
|
|
vector signed short vsOld = vec_ld( 0, &src[i] );
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[i] ), (vector unsigned char)(1) );
|
|
|
|
for ( ; i+7 < numSamples; i+= 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// permute into vectors in the order to store
|
|
|
|
v2 = vec_splat( v0, 0 );
|
|
v3 = vec_splat( v0, 1 );
|
|
v4 = vec_splat( v0, 2 );
|
|
v5 = vec_splat( v0, 3 );
|
|
v6 = vec_splat( v1, 0 );
|
|
v7 = vec_splat( v1, 1 );
|
|
v8 = vec_splat( v1, 2 );
|
|
v9 = vec_splat( v1, 3 );
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
v8 = vec_perm( v8, v8, storePerm );
|
|
v9 = vec_perm( v9, v9, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
|
|
vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
|
|
vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
|
|
vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
|
|
vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
|
|
vecDest = vec_sel( v9, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*4] );
|
|
}
|
|
// cleanup
|
|
for (; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i = 0;
|
|
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( ; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = v2;
|
|
v4 = vec_perm( v0, v0, vecSecondHalf );
|
|
v5 = v4;
|
|
v6 = vec_perm( v1, v1, vecFirstHalf );
|
|
v7 = v6;
|
|
v8 = vec_perm (v1, v1, vecSecondHalf );
|
|
v9 = v8;
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
v8 = vec_perm( v8, v8, storePerm );
|
|
v9 = vec_perm( v9, v9, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
|
|
vec_st( vec_sel( v5, v6, mask ), 63, &dest[i*4] );
|
|
vec_st( vec_sel( v6, v7, mask ), 79, &dest[i*4] );
|
|
vec_st( vec_sel( v7, v8, mask ), 95, &dest[i*4] );
|
|
vec_st( vec_sel( v8, v9, mask ), 111, &dest[i*4] );
|
|
vecDest = vec_sel( v9, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*4] );
|
|
}
|
|
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*4+0] = dest[i*4+2] = dest[i*4+4] = dest[i*4+6] = (float) src[i+0];
|
|
dest[i*4+1] = dest[i*4+3] = dest[i*4+5] = dest[i*4+7] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 0, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecBottom );
|
|
v3 = vec_perm( v0, v0, vecTop );
|
|
v4 = vec_perm( v1, v1, vecBottom );
|
|
v5 = vec_perm (v1, v1, vecTop );
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
|
|
vecDest = vec_sel( v5, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*2] );
|
|
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = (float) src[i+0];
|
|
}
|
|
} else {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load src
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
|
|
|
|
// unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
// convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
// put into vectors in order to store
|
|
v2 = vec_perm( v0, v0, vecFirstHalf );
|
|
v3 = vec_perm( v0, v0, vecSecondHalf );
|
|
v4 = vec_perm( v1, v1, vecFirstHalf );
|
|
v5 = vec_perm (v1, v1, vecSecondHalf );
|
|
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*2] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*2] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*2] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*2] );
|
|
vecDest = vec_sel( v5, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*2] );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i += 2 ) {
|
|
dest[i*2+0] = dest[i*2+2] = (float) src[i+0];
|
|
dest[i*2+1] = dest[i*2+3] = (float) src[i+1];
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
int i;
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &src[0] ), (vector unsigned char)(1) );
|
|
vector signed short vsOld = vec_ld( 0, &src[0] );
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
//vs0 = vec_ld( 0, &src[i] );
|
|
vs1 = vec_ld( 15, &src[i] );
|
|
vs0 = vec_perm( vsOld, vs1, permVec );
|
|
vsOld = vs1;
|
|
vector float vecDestEnd = vec_ld( 31, &dest[i] );
|
|
|
|
//unpack shorts to ints
|
|
vi0 = vec_unpackh( vs0 );
|
|
vi1 = vec_unpackl( vs0 );
|
|
|
|
//convert ints to floats
|
|
v0 = vec_ctf( vi0, 0 );
|
|
v1 = vec_ctf( vi1, 0 );
|
|
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
|
|
vecDest = vec_sel( v1, vecDestEnd, mask );
|
|
vec_st( vecDest, 31, &dest[i] );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i] = (float) src[i];
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSampleOGGTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
Assumes that dest starts at aligned address
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
|
|
// dest is aligned
|
|
assert( IS_16BYTE_ALIGNED( dest[0] ) );
|
|
|
|
register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
|
|
register vector float constVec, zeroVector;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
|
|
vector unsigned char vecPerm1;
|
|
vector unsigned char vecPerm2;
|
|
|
|
vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
|
|
vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
|
|
vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
|
|
vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
|
|
|
|
constVec = (vector float)(32768.0f);
|
|
zeroVector = (vector float)(0.0);
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
v0 = vec_madd( oggVec1, constVec, zeroVector );
|
|
v1 = vec_madd( oggVec2, constVec, zeroVector );
|
|
v2 = vec_madd( oggVec3, constVec, zeroVector );
|
|
v3 = vec_madd( oggVec4, constVec, zeroVector );
|
|
v4 = vec_madd( oggVec5, constVec, zeroVector );
|
|
v5 = vec_madd( oggVec6, constVec, zeroVector );
|
|
v6 = vec_madd( oggVec7, constVec, zeroVector );
|
|
v7 = vec_madd( oggVec8, constVec, zeroVector );
|
|
|
|
//store results
|
|
ALIGNED_STORE8( &dest[i*4], v0, v1, v2, v3, v4, v5, v6, v7 );
|
|
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
|
|
}
|
|
|
|
} else {
|
|
|
|
// calculate perm vec for ogg
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
|
|
// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
|
|
// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
|
|
oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
|
|
oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
|
|
oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
|
|
oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
|
|
oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
|
|
oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
|
|
oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
|
|
|
|
//merge generates the interleaved pattern that we want and it
|
|
//doesn't require a permute vector, so use that instead
|
|
v0 = vec_mergeh( oggVec1, oggVec5 );
|
|
v1 = vec_mergel( oggVec1, oggVec5 );
|
|
v2 = vec_mergeh( oggVec2, oggVec6 );
|
|
v3 = vec_mergel( oggVec2, oggVec6 );
|
|
|
|
v4 = vec_mergeh( oggVec3, oggVec7 );
|
|
v5 = vec_mergel( oggVec3, oggVec7 );
|
|
v6 = vec_mergeh( oggVec4, oggVec8 );
|
|
v10 = vec_mergel( oggVec4, oggVec8 );
|
|
|
|
//store results
|
|
ALIGNED_STORE8( &dest[i*8], v0, v1, v2, v3, v4, v5, v6, v10 );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
|
|
dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load values from ogg
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// permute into results vectors to store
|
|
v5 = vec_perm( v0, v0, vecOneTwo );
|
|
v6 = vec_perm( v0, v0, vecThreeFour);
|
|
v7 = vec_perm( v1, v1, vecOneTwo );
|
|
v8 = vec_perm( v1, v1, vecThreeFour );
|
|
|
|
//store results
|
|
ALIGNED_STORE4( &dest[i*2], v5, v6, v7, v8 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
// load ogg[0][i] to ogg[0][i+4]
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors to store
|
|
v2 = vec_perm( v0, v1, vecFirst );
|
|
v3 = vec_perm( v0, v1, vecSecond );
|
|
v4 = vec_perm( v0, v1, vecThird );
|
|
v5 = vec_perm( v0, v1, vecFourth );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &dest[i*4], v2, v3, v4, v5 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
|
|
dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load values from ogg
|
|
v8 = v9;
|
|
v7 = vec_ld( 15, &ogg[0][i] );
|
|
v6 = v7;
|
|
v9 = vec_ld( 31, &ogg[0][i] );
|
|
|
|
v0 = vec_perm( v8, v7, vecPerm1 );
|
|
v1 = vec_perm( v6, v9, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
ALIGNED_STORE2( &dest[i], v0, v1 );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*1+0] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors
|
|
v2 = vec_mergeh( v0, v1 );
|
|
v3 = vec_mergel( v0, v1 );
|
|
|
|
// store results
|
|
ALIGNED_STORE2( &dest[i*2], v2, v3 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*2+0] = ogg[0][i] * 32768.0f;
|
|
dest[i*2+1] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::UpSampleOGGTo44kHz
|
|
|
|
Duplicate samples for 44kHz output.
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void idSIMD_AltiVec::UpSampleOGGTo44kHz( float *dest, const float * const *ogg, const int numSamples, const int kHz, const int numChannels ) {
|
|
|
|
register vector float oggVec1, oggVec2, oggVec3, oggVec4, oggVec5, oggVec6, oggVec7, oggVec8;
|
|
register vector float constVec, zeroVector;
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10;
|
|
vector unsigned char vecPerm1;
|
|
vector unsigned char vecPerm2;
|
|
|
|
vector unsigned char vecOneTwo = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
vector unsigned char vecThreeFour = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
vector unsigned char vecFirst = (vector unsigned char)(0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19);
|
|
vector unsigned char vecSecond = (vector unsigned char)(4,5,6,7,20,21,22,23,4,5,6,7,20,21,22,23);
|
|
vector unsigned char vecThird = (vector unsigned char)(8,9,10,11,24,25,26,27,8,9,10,11,24,25,26,27);
|
|
vector unsigned char vecFourth = (vector unsigned char)(12,13,14,15,28,29,30,31,12,13,14,15,28,29,30,31);
|
|
|
|
vector unsigned char storePerm;
|
|
|
|
constVec = (vector float)(32768.0f);
|
|
zeroVector = (vector float)(0.0);
|
|
|
|
// calculate perm vector and masks for stores
|
|
storePerm = vec_sub( vec_lvsr( 15, &dest[0] ), (vector unsigned char)(1) );
|
|
// original values of dest
|
|
vector float vecDest = vec_ld( 0, &dest[0] );
|
|
vector unsigned int mask = vec_perm( (vector unsigned int)(0), (vector unsigned int)(-1), storePerm );
|
|
|
|
if ( kHz == 11025 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// as it happens, ogg[0][i] through ogg[0][i+3] are contiguous in memory
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*4] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
v0 = vec_madd( oggVec1, constVec, zeroVector );
|
|
v1 = vec_madd( oggVec2, constVec, zeroVector );
|
|
v2 = vec_madd( oggVec3, constVec, zeroVector );
|
|
v3 = vec_madd( oggVec4, constVec, zeroVector );
|
|
v4 = vec_madd( oggVec5, constVec, zeroVector );
|
|
v5 = vec_madd( oggVec6, constVec, zeroVector );
|
|
v6 = vec_madd( oggVec7, constVec, zeroVector );
|
|
v7 = vec_madd( oggVec8, constVec, zeroVector );
|
|
|
|
// rotate input data
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*4] );
|
|
vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*4] );
|
|
vec_st( vec_sel( v6, v7, mask ), 111, &dest[i*4] );
|
|
vecDest = vec_sel( v7, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*4] );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*4+0] = dest[i*4+1] = dest[i*4+2] = dest[i*4+3] = ogg[0][i] * 32768.0f;
|
|
}
|
|
|
|
} else {
|
|
|
|
// calculate perm vec for ogg
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i+=4 ) { // +1 += 2
|
|
// load and splat from the array ( ogg[0][i] to ogg[0][i+3] )
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 127, &dest[i*8] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec1 = vec_splat( v0, 0 );
|
|
oggVec2 = vec_splat( v0, 1 );
|
|
oggVec3 = vec_splat( v0, 2 );
|
|
oggVec4 = vec_splat( v0, 3 );
|
|
|
|
// load and splat from the array ( ogg[1][i] to ogg[1][i+3] )
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// now we have the elements in a vector, we want
|
|
// to splat them each accross their own vector
|
|
oggVec5 = vec_splat( v1, 0 );
|
|
oggVec6 = vec_splat( v1, 1 );
|
|
oggVec7 = vec_splat( v1, 2 );
|
|
oggVec8 = vec_splat( v1, 3 );
|
|
|
|
oggVec1 = vec_madd( oggVec1, constVec, zeroVector ); // ogg[0][i] * 32768
|
|
oggVec2 = vec_madd( oggVec2, constVec, zeroVector ); // ogg[0][i+1] * 32768
|
|
oggVec3 = vec_madd( oggVec3, constVec, zeroVector ); // ogg[0][i+2] * 32768
|
|
oggVec4 = vec_madd( oggVec4, constVec, zeroVector ); // ogg[0][i+3] * 32768
|
|
oggVec5 = vec_madd( oggVec5, constVec, zeroVector ); // ogg[1][i] * 32768
|
|
oggVec6 = vec_madd( oggVec6, constVec, zeroVector ); // ogg[1][i+1] * 32768
|
|
oggVec7 = vec_madd( oggVec7, constVec, zeroVector ); // ogg[1][i+2] * 32768
|
|
oggVec8 = vec_madd( oggVec8, constVec, zeroVector ); // ogg[1][i+3] * 32768
|
|
|
|
//merge generates the interleaved pattern that we want and it
|
|
//doesn't require a permute vector, so use that instead
|
|
v0 = vec_mergeh( oggVec1, oggVec5 );
|
|
v1 = vec_mergel( oggVec1, oggVec5 );
|
|
v2 = vec_mergeh( oggVec2, oggVec6 );
|
|
v3 = vec_mergel( oggVec2, oggVec6 );
|
|
|
|
v4 = vec_mergeh( oggVec3, oggVec7 );
|
|
v5 = vec_mergel( oggVec3, oggVec7 );
|
|
v6 = vec_mergeh( oggVec4, oggVec8 );
|
|
v10 = vec_mergel( oggVec4, oggVec8 );
|
|
|
|
// rotate input data
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v10 = vec_perm( v10, v10, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i*8] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i*8] );
|
|
vec_st( vec_sel( v1, v2, mask ), 31, &dest[i*8] );
|
|
vec_st( vec_sel( v2, v3, mask ), 47, &dest[i*8] );
|
|
vec_st( vec_sel( v3, v4, mask ), 63, &dest[i*8] );
|
|
vec_st( vec_sel( v4, v5, mask ), 79, &dest[i*8] );
|
|
vec_st( vec_sel( v5, v6, mask ), 95, &dest[i*8] );
|
|
vec_st( vec_sel( v6, v10, mask ), 111, &dest[i*8] );
|
|
vecDest = vec_sel( v10, vecDestEnd, mask );
|
|
vec_st( vecDest, 127, &dest[i*8] );
|
|
}
|
|
|
|
//cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*8+0] = dest[i*8+2] = dest[i*8+4] = dest[i*8+6] = ogg[0][i] * 32768.0f;
|
|
dest[i*8+1] = dest[i*8+3] = dest[i*8+5] = dest[i*8+7] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 22050 ) {
|
|
if ( numChannels == 1 ) {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
v10 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
|
|
// load values from ogg
|
|
v8 = v10;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v10 = vec_ld( 31, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*2] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
v1 = vec_perm( v9, v10, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// permute into results vectors to store
|
|
v5 = vec_perm( v0, v0, vecOneTwo );
|
|
v6 = vec_perm( v0, v0, vecThreeFour);
|
|
v7 = vec_perm( v1, v1, vecOneTwo );
|
|
v8 = vec_perm( v1, v1, vecThreeFour );
|
|
|
|
// rotate input data
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
v6 = vec_perm( v6, v6, storePerm );
|
|
v7 = vec_perm( v7, v7, storePerm );
|
|
v8 = vec_perm( v8, v8, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v5, mask ), 0, &dest[i*2] );
|
|
vec_st( vec_sel( v5, v6, mask ), 15, &dest[i*2] );
|
|
vec_st( vec_sel( v6, v7, mask ), 31, &dest[i*2] );
|
|
vec_st( vec_sel( v7, v8, mask ), 47, &dest[i*2] );
|
|
vecDest = vec_sel( v8, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*2] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*2+0] = dest[i*2+1] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
|
|
int i;
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
// load ogg[0][i] to ogg[0][i+4]
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 63, &dest[i*4] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors to store
|
|
v2 = vec_perm( v0, v1, vecFirst );
|
|
v3 = vec_perm( v0, v1, vecSecond );
|
|
v4 = vec_perm( v0, v1, vecThird );
|
|
v5 = vec_perm( v0, v1, vecFourth );
|
|
|
|
// rotate input data
|
|
v2 = vec_perm( v2, v2, storePerm );
|
|
v3 = vec_perm( v3, v3, storePerm );
|
|
v4 = vec_perm( v4, v4, storePerm );
|
|
v5 = vec_perm( v5, v5, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v2, mask ), 0, &dest[i*4] );
|
|
vec_st( vec_sel( v2, v3, mask ), 15, &dest[i*4] );
|
|
vec_st( vec_sel( v3, v4, mask ), 31, &dest[i*4] );
|
|
vec_st( vec_sel( v4, v5, mask ), 47, &dest[i*4] );
|
|
vecDest = vec_sel( v5, vecDestEnd, mask );
|
|
vec_st( vecDest, 63, &dest[i*4] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*4+0] = dest[i*4+2] = ogg[0][i] * 32768.0f;
|
|
dest[i*4+1] = dest[i*4+3] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else if ( kHz == 44100 ) {
|
|
if ( numChannels == 1 ) {
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+7 < numSamples; i += 8 ) {
|
|
// load values from ogg
|
|
v8 = v9;
|
|
v7 = vec_ld( 15, &ogg[0][i] );
|
|
v6 = v7;
|
|
v9 = vec_ld( 31, &ogg[0][i] );
|
|
vector float vecDestEnd = vec_ld( 31, &dest[i] );
|
|
|
|
v0 = vec_perm( v8, v7, vecPerm1 );
|
|
v1 = vec_perm( v6, v9, vecPerm1 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// rotate data
|
|
v0 = vec_perm( v0, v0, storePerm );
|
|
v1 = vec_perm( v1, v1, storePerm );
|
|
|
|
// store results
|
|
vec_st( vec_sel( vecDest, v0, mask ), 0, &dest[i] );
|
|
vec_st( vec_sel( v0, v1, mask ), 15, &dest[i] );
|
|
vecDest = vec_sel( v1, vecDestEnd, mask );
|
|
vec_st( vecDest, 31, &dest[i] );
|
|
}
|
|
|
|
// cleanup
|
|
for ( ; i < numSamples; i++ ) {
|
|
dest[i*1+0] = ogg[0][i] * 32768.0f;
|
|
}
|
|
} else {
|
|
|
|
// calculate perm vector and do first load
|
|
vecPerm1 = vec_add( vec_lvsl( -1, (int*) &ogg[0][0] ), (vector unsigned char)(1) );
|
|
vecPerm2 = vec_add( vec_lvsl( -1, (int*) &ogg[1][0] ), (vector unsigned char)(1) );
|
|
v7 = vec_ld( 0, &ogg[1][0] );
|
|
v9 = vec_ld( 0, &ogg[0][0] );
|
|
int i;
|
|
|
|
for ( i = 0; i+3 < numSamples >> 1; i += 4 ) {
|
|
v8 = v9;
|
|
v9 = vec_ld( 15, &ogg[0][i] );
|
|
v0 = vec_perm( v8, v9, vecPerm1 );
|
|
|
|
// load ogg[1][i] to ogg[1][i+3]
|
|
v6 = v7;
|
|
v7 = vec_ld( 15, &ogg[1][i] );
|
|
v1 = vec_perm( v6, v7, vecPerm2 );
|
|
|
|
// multiply
|
|
v0 = vec_madd( v0, constVec, zeroVector );
|
|
v1 = vec_madd( v1, constVec, zeroVector );
|
|
|
|
// generate result vectors
|
|
v2 = vec_mergeh( v0, v1 );
|
|
v3 = vec_mergel( v0, v1 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE2( &dest[i*2], v2, v3 );
|
|
}
|
|
// cleanup
|
|
for ( ; i < numSamples >> 1; i++ ) {
|
|
dest[i*2+0] = ogg[0][i] * 32768.0f;
|
|
dest[i*2+1] = ogg[1][i] * 32768.0f;
|
|
}
|
|
}
|
|
} else {
|
|
assert( 0 );
|
|
}
|
|
}
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerMono
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
int i;
|
|
float inc[2];
|
|
float spkr[4];
|
|
|
|
register vector float vecInc;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
register vector float vecSamplesLd1, vecSamplesLd2;
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
|
|
register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
|
|
register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
|
|
register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
|
|
register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
|
|
|
|
//constants
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] *= 2;
|
|
inc[1] *= 2;
|
|
|
|
//load data into registers
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
|
|
//load samples and mix buffers
|
|
vecSamplesLd1 = vecSamplesLast; //vec_ld( 0, &samples[i] );
|
|
vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLast = vec_ld( 31, &samples[i] );
|
|
|
|
vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
|
|
vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
|
|
|
|
vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
|
|
vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
|
|
vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
|
|
vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
// store results
|
|
ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
|
|
//add for next iteration
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
}
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerMono
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
|
|
int i;
|
|
float inc[2];
|
|
float spkr[4];
|
|
|
|
register vector float vecInc;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
register vector float vecSamplesLd1, vecSamplesLd2;
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
|
|
register vector unsigned char permVec1 = (vector unsigned char)(0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7); //0,0,1,1
|
|
register vector unsigned char permVec2 = (vector unsigned char)(8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15); //2,2,3,3
|
|
register vector unsigned char permVec3 = (vector unsigned char)(16,17,18,19,16,17,18,19,20,21,22,23,20,21,22,23); //4,4,5,5
|
|
register vector unsigned char permVec4 = (vector unsigned char)(24,25,26,27,24,25,26,27,28,29,30,31,28,29,30,31); //6,6,7,7
|
|
|
|
//constants
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] *= 2;
|
|
inc[1] *= 2;
|
|
|
|
//load data into registers
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0]), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i=0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
|
|
//load samples and mix buffers
|
|
vecSamplesLd1 = vecSamplesLast;
|
|
vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLast = vec_ld( 31, &samples[i] );
|
|
|
|
vecSamplesLd1 = vec_perm( vecSamplesLd1, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLd2 = vec_perm( vecSamplesLd2, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
|
|
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
|
|
vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
|
|
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
|
|
|
|
vecSamples1 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec1 );
|
|
vecSamples2 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec2 );
|
|
vecSamples3 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec3 );
|
|
vecSamples4 = vec_perm( vecSamplesLd1, vecSamplesLd2, permVec4 );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
|
|
//add for next iteration
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
}
|
|
}
|
|
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerStereo
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
int i, k;
|
|
float inc[2];
|
|
float spkr[4];
|
|
|
|
// loading buffers
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
// loading buffers
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecInc;
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
for ( k = 0; k < 2; k++ ) {
|
|
inc[k] *= 2;
|
|
}
|
|
|
|
// load data in vectors
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
// load mix buffers and samples
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*2] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[i*2+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[i*2+8] );
|
|
vecMixBuffer4 = vec_ld( 0, &mixBuffer[i*2+12] );
|
|
|
|
vecSamples1 = vecSamplesLast;
|
|
vecSamples2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamples3 = vec_ld( 31, &samples[i*2] );
|
|
vecSamples4 = vec_ld( 47, &samples[i*2] );
|
|
vecSamplesLast = vec_ld( 63, &samples[i*2] );
|
|
|
|
vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
|
|
vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
|
|
vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
|
|
vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
|
|
//store results
|
|
ALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundTwoSpeakerStereo
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundTwoSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[2], const float currentV[2] ) {
|
|
|
|
int i, k;
|
|
float inc[2];
|
|
float spkr[4];
|
|
// loading buffers
|
|
register vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4;
|
|
// loading buffers
|
|
register vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4;
|
|
register vector float vecSpeaker1, vecSpeaker2, vecSpeaker3, vecSpeaker4;
|
|
register vector float vecInc;
|
|
vector float fourVec = (vector float)(4.0);
|
|
vector float zeroVec = (vector float)(0.0);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
|
|
inc[0] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
inc[1] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
|
|
spkr[0] = lastV[0];
|
|
spkr[1] = lastV[1];
|
|
spkr[2] = lastV[0] + inc[0];
|
|
spkr[3] = lastV[1] + inc[1];
|
|
|
|
for ( k = 0; k < 2; k++ ) {
|
|
inc[k] *= 2;
|
|
}
|
|
|
|
// load data in vectors
|
|
vector float v0 = loadSplatUnalignedScalar( &inc[0] );
|
|
vector float v1 = loadSplatUnalignedScalar( &inc[1] );
|
|
vecInc = vec_mergeh( v0, v1 );
|
|
|
|
vector float v2 = loadSplatUnalignedScalar( &spkr[0] );
|
|
vector float v3 = loadSplatUnalignedScalar( &spkr[1] );
|
|
vector float v4 = loadSplatUnalignedScalar( &spkr[2] );
|
|
vector float v5 = loadSplatUnalignedScalar( &spkr[3] );
|
|
|
|
// load spkr array
|
|
v0 = vec_mergeh( v2, v4 );
|
|
v1 = vec_mergeh( v3, v5 );
|
|
vecSpeaker1 = vec_mergeh( v0, v1 );
|
|
|
|
vecSpeaker2 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker3, vecInc );
|
|
vecInc = vec_madd( vecInc, fourVec, zeroVec );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 8, we don't
|
|
//need a cleanup loop
|
|
for( i = 0 ; i+7 < MIXBUFFER_SAMPLES; i += 8 ) {
|
|
// load mix buffers and samples
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*2] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*2] );
|
|
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*2] );
|
|
vector float vecDestEnd = vec_ld( 63, &mixBuffer[i*2] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
|
|
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecDestEnd, mixBufferPerm );
|
|
|
|
vecSamples1 = vecSamplesLast;
|
|
vecSamples2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamples3 = vec_ld( 31, &samples[i*2] );
|
|
vecSamples4 = vec_ld( 47, &samples[i*2] );
|
|
vecSamplesLast = vec_ld( 63, &samples[i*2] );
|
|
|
|
vecSamples1 = vec_perm( vecSamples1, vecSamples2, samplesPerm );
|
|
vecSamples2 = vec_perm( vecSamples2, vecSamples3, samplesPerm );
|
|
vecSamples3 = vec_perm( vecSamples3, vecSamples4, samplesPerm );
|
|
vecSamples4 = vec_perm( vecSamples4, vecSamplesLast, samplesPerm );
|
|
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSpeaker1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSpeaker2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSpeaker3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSpeaker4, vecMixBuffer4 );
|
|
|
|
vecSpeaker1 = vec_add( vecSpeaker1, vecInc );
|
|
vecSpeaker2 = vec_add( vecSpeaker2, vecInc );
|
|
vecSpeaker3 = vec_add( vecSpeaker3, vecInc );
|
|
vecSpeaker4 = vec_add( vecSpeaker4, vecInc );
|
|
|
|
// store results
|
|
UNALIGNED_STORE4( &mixBuffer[i*2], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4 );
|
|
}
|
|
}
|
|
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerMono
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
float incL[24];
|
|
float sL[24];
|
|
int i, k;
|
|
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
|
|
// permute vectors for sample
|
|
vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
for ( k = 0; k < 6; k++ ) {
|
|
sL[k] = lastV[k];
|
|
}
|
|
for ( k = 6; k < 12; k++ ) {
|
|
sL[k] = lastV[k-6] + incL[k];
|
|
}
|
|
for ( k = 12; k < 18; k++ ) {
|
|
sL[k] = lastV[k-12] + incL[k] + incL[k];
|
|
}
|
|
for ( k = 18; k < 24; k++ ) {
|
|
sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
|
|
}
|
|
|
|
// multiply by 2 since doing 12 at a time
|
|
for ( k = 0; k < 24; k++ ) {
|
|
incL[k] *= 4;
|
|
}
|
|
|
|
//load the data
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
vecIncl5 = vec_ld( 63, &incL[0] );
|
|
vecIncl6 = vec_ld( 79, &incL[0] );
|
|
vecIncl7 = vec_ld( 95, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
|
|
vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
|
|
vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
vecSL5 = vec_ld( 63, &sL[0] );
|
|
vecSL6 = vec_ld( 79, &sL[0] );
|
|
vecSL7 = vec_ld( 95, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
|
|
vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
|
|
vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
|
|
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
|
|
//need a cleanup loop
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
|
|
//load mix buffer into vectors, assume aligned
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
|
|
vecMixBuffer4 = vec_ld( 0, &mixBuffer[(i*6)+12] );
|
|
vecMixBuffer5 = vec_ld( 0, &mixBuffer[(i*6)+16] );
|
|
vecMixBuffer6 = vec_ld( 0, &mixBuffer[(i*6)+20] );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want
|
|
vecSamples1 = vec_splat( vecSamplesLd, 0 );
|
|
vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
|
|
vecSamples3 = vec_splat( vecSamplesLd, 1 );
|
|
vecSamples4 = vec_splat( vecSamplesLd, 2 );
|
|
vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
|
|
vecSamples6 = vec_splat( vecSamplesLd, 3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
|
|
vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
|
|
vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
|
|
|
|
//store out results
|
|
ALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
vecSL4 = vec_add( vecSL4, vecIncl4 );
|
|
vecSL5 = vec_add( vecSL5, vecIncl5 );
|
|
vecSL6 = vec_add( vecSL6, vecIncl6 );
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerMono
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerMono( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
float incL[24];
|
|
float sL[24];
|
|
int i, k;
|
|
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4, vecIncl5, vecIncl6, vecIncl7;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4, vecSL5, vecSL6, vecSL7;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3, vecSamples4, vecSamples5, vecSamples6;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6;
|
|
// permute vectors for sample
|
|
register vector unsigned char samplePerm2 = (vector unsigned char)( 0,1,2,3,0,1,2,3,4,5,6,7,4,5,6,7);
|
|
register vector unsigned char samplePerm5 = (vector unsigned char)( 8,9,10,11,8,9,10,11,12,13,14,15,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = incL[12] = incL[18] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = incL[13] = incL[19] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = incL[14] = incL[20] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = incL[15] = incL[21] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = incL[16] = incL[22] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = incL[17] = incL[23] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
for ( k = 0; k < 6; k++ ) {
|
|
sL[k] = lastV[k];
|
|
}
|
|
for ( k = 6; k < 12; k++ ) {
|
|
sL[k] = lastV[k-6] + incL[k];
|
|
}
|
|
for ( k = 12; k < 18; k++ ) {
|
|
sL[k] = lastV[k-12] + incL[k] + incL[k];
|
|
}
|
|
for ( k = 18; k < 24; k++ ) {
|
|
sL[k] = lastV[k-18] + incL[k] + incL[k] + incL[k];
|
|
}
|
|
|
|
// multiply by 2 since doing 12 at a time
|
|
for ( k = 0; k < 24; k++ ) {
|
|
incL[k] *= 4;
|
|
}
|
|
|
|
// load the data
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
vecIncl5 = vec_ld( 63, &incL[0] );
|
|
vecIncl6 = vec_ld( 79, &incL[0] );
|
|
vecIncl7 = vec_ld( 95, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
vecIncl4 = vec_perm( vecIncl4, vecIncl5, incPerm );
|
|
vecIncl5 = vec_perm( vecIncl5, vecIncl6, incPerm );
|
|
vecIncl6 = vec_perm( vecIncl6, vecIncl7, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
vecSL5 = vec_ld( 63, &sL[0] );
|
|
vecSL6 = vec_ld( 79, &sL[0] );
|
|
vecSL7 = vec_ld( 95, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
vecSL4 = vec_perm( vecSL4, vecSL5, slPerm );
|
|
vecSL5 = vec_perm( vecSL5, vecSL6, slPerm );
|
|
vecSL6 = vec_perm( vecSL6, vecSL7, slPerm );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
//since MIXBUFFER_SAMPLES is a multiple of 4, we don't
|
|
//need a cleanup loop
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 4; i += 4 ) {
|
|
//load mix buffer into vectors
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
|
|
vecMixBuffer4 = vec_ld( 47, &mixBuffer[i*6] );
|
|
vecMixBuffer5 = vec_ld( 63, &mixBuffer[i*6] );
|
|
vecMixBuffer6 = vec_ld( 79, &mixBuffer[i*6] );
|
|
vector float vecDestEnd = vec_ld( 95, &mixBuffer[i*6] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecMixBuffer4, mixBufferPerm );
|
|
vecMixBuffer4 = vec_perm( vecMixBuffer4, vecMixBuffer5, mixBufferPerm );
|
|
vecMixBuffer5 = vec_perm( vecMixBuffer5, vecMixBuffer6, mixBufferPerm );
|
|
vecMixBuffer6 = vec_perm( vecMixBuffer6, vecDestEnd, mixBufferPerm );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want
|
|
vecSamples1 = vec_splat( vecSamplesLd, 0 );
|
|
vecSamples2 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm2 );
|
|
vecSamples3 = vec_splat( vecSamplesLd, 1 );
|
|
vecSamples4 = vec_splat( vecSamplesLd, 2 );
|
|
vecSamples5 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm5 );
|
|
vecSamples6 = vec_splat( vecSamplesLd, 3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
vecMixBuffer4 = vec_madd( vecSamples4, vecSL4, vecMixBuffer4 );
|
|
vecMixBuffer5 = vec_madd( vecSamples5, vecSL5, vecMixBuffer5 );
|
|
vecMixBuffer6 = vec_madd( vecSamples6, vecSL6, vecMixBuffer6 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE6( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3, vecMixBuffer4, vecMixBuffer5, vecMixBuffer6 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
vecSL4 = vec_add( vecSL4, vecIncl4 );
|
|
vecSL5 = vec_add( vecSL5, vecIncl5 );
|
|
vecSL6 = vec_add( vecSL6, vecIncl6 );
|
|
}
|
|
}
|
|
|
|
#endif /* SOUND_DEST_ALIGNED */
|
|
|
|
#ifdef SOUND_DEST_ALIGNED
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerStereo
|
|
|
|
Assumptions:
|
|
Assumes that mixBuffer starts at aligned address
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
// mixBuffer is aligned
|
|
assert( IS_16BYTE_ALIGNED( mixBuffer[0] ) );
|
|
|
|
float incL[12];
|
|
float sL[12];
|
|
int i;
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
|
|
// permute vectors for sample
|
|
vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
|
|
vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
sL[0] = lastV[0];
|
|
sL[1] = lastV[1];
|
|
sL[2] = lastV[2];
|
|
sL[3] = lastV[3];
|
|
sL[4] = lastV[4];
|
|
sL[5] = lastV[5];
|
|
sL[6] = lastV[0] + incL[0];
|
|
sL[7] = lastV[1] + incL[1];
|
|
sL[8] = lastV[2] + incL[2];
|
|
sL[9] = lastV[3] + incL[3];
|
|
sL[10] = lastV[4] + incL[4];
|
|
sL[11] = lastV[5] + incL[5];
|
|
|
|
// multiply by 2 since doing 12 at a time
|
|
incL[0] *= 2;
|
|
incL[1] *= 2;
|
|
incL[2] *= 2;
|
|
incL[3] *= 2;
|
|
incL[4] *= 2;
|
|
incL[5] *= 2;
|
|
incL[6] *= 2;
|
|
incL[7] *= 2;
|
|
incL[8] *= 2;
|
|
incL[9] *= 2;
|
|
incL[10] *= 2;
|
|
incL[11] *= 2;
|
|
|
|
//we aligned this data, so load it up
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
|
|
|
|
//load mix buffer into vectors, assume aligned
|
|
vecMixBuffer1 = vec_ld( 0, &mixBuffer[i*6] );
|
|
vecMixBuffer2 = vec_ld( 0, &mixBuffer[(i*6)+4] );
|
|
vecMixBuffer3 = vec_ld( 0, &mixBuffer[(i*6)+8] );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want. For the 2nd vector,
|
|
//the order happens to be the same as the order we loaded them
|
|
//in, so there's no need to permute that one
|
|
vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
|
|
vecSamples2 = vecSamplesLd;
|
|
vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
|
|
//store out results
|
|
ALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
}
|
|
}
|
|
#else
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixSoundSixSpeakerStereo
|
|
|
|
Assumptions:
|
|
No assumptions
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixSoundSixSpeakerStereo( float *mixBuffer, const float *samples, const int numSamples, const float lastV[6], const float currentV[6] ) {
|
|
|
|
float incL[12];
|
|
float sL[12];
|
|
|
|
int i;
|
|
vector float vecIncl1, vecIncl2, vecIncl3, vecIncl4;
|
|
vector float vecSL1, vecSL2, vecSL3, vecSL4;
|
|
vector float vecSamplesLd;
|
|
vector float vecSamples1, vecSamples2, vecSamples3;
|
|
vector float vecMixBuffer1, vecMixBuffer2, vecMixBuffer3;
|
|
// permute vectors for sample
|
|
vector unsigned char samplePerm1 = (vector unsigned char)( 0,1,2,3,4,5,6,7,0,1,2,3,0,1,2,3);
|
|
vector unsigned char samplePerm3 = (vector unsigned char)( 8,9,10,11,8,9,10,11,8,9,10,11,12,13,14,15);
|
|
|
|
assert( numSamples == MIXBUFFER_SAMPLES );
|
|
assert( SPEAKER_RIGHT == 1 );
|
|
assert( SPEAKER_BACKRIGHT == 5 );
|
|
|
|
// incL array, 6 elements repeated
|
|
incL[0] = incL[6] = ( currentV[0] - lastV[0] ) / MIXBUFFER_SAMPLES;
|
|
incL[1] = incL[7] = ( currentV[1] - lastV[1] ) / MIXBUFFER_SAMPLES;
|
|
incL[2] = incL[8] = ( currentV[2] - lastV[2] ) / MIXBUFFER_SAMPLES;
|
|
incL[3] = incL[9] = ( currentV[3] - lastV[3] ) / MIXBUFFER_SAMPLES;
|
|
incL[4] = incL[10] = ( currentV[4] - lastV[4] ) / MIXBUFFER_SAMPLES;
|
|
incL[5] = incL[11] = ( currentV[5] - lastV[5] ) / MIXBUFFER_SAMPLES;
|
|
|
|
// sL array repeated
|
|
sL[0] = lastV[0];
|
|
sL[1] = lastV[1];
|
|
sL[2] = lastV[2];
|
|
sL[3] = lastV[3];
|
|
sL[4] = lastV[4];
|
|
sL[5] = lastV[5];
|
|
sL[6] = lastV[0] + incL[0];
|
|
sL[7] = lastV[1] + incL[1];
|
|
sL[8] = lastV[2] + incL[2];
|
|
sL[9] = lastV[3] + incL[3];
|
|
sL[10] = lastV[4] + incL[4];
|
|
sL[11] = lastV[5] + incL[5];
|
|
|
|
// multiply by 2 since doing 12 at a time
|
|
incL[0] *= 2;
|
|
incL[1] *= 2;
|
|
incL[2] *= 2;
|
|
incL[3] *= 2;
|
|
incL[4] *= 2;
|
|
incL[5] *= 2;
|
|
incL[6] *= 2;
|
|
incL[7] *= 2;
|
|
incL[8] *= 2;
|
|
incL[9] *= 2;
|
|
incL[10] *= 2;
|
|
incL[11] *= 2;
|
|
|
|
// load the data
|
|
vector unsigned char incPerm = vec_add( vec_lvsl( -1, &incL[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char slPerm = vec_add( vec_lvsl( -1, &sL[0] ), (vector unsigned char)(1) );
|
|
vecIncl1 = vec_ld( 0, &incL[0] );
|
|
vecIncl2 = vec_ld( 15, &incL[0] );
|
|
vecIncl3 = vec_ld( 31, &incL[0] );
|
|
vecIncl4 = vec_ld( 47, &incL[0] );
|
|
|
|
vecIncl1 = vec_perm( vecIncl1, vecIncl2, incPerm );
|
|
vecIncl2 = vec_perm( vecIncl2, vecIncl3, incPerm );
|
|
vecIncl3 = vec_perm( vecIncl3, vecIncl4, incPerm );
|
|
|
|
vecSL1 = vec_ld( 0, &sL[0] );
|
|
vecSL2 = vec_ld( 15, &sL[0] );
|
|
vecSL3 = vec_ld( 31, &sL[0] );
|
|
vecSL4 = vec_ld( 47, &sL[0] );
|
|
|
|
vecSL1 = vec_perm( vecSL1, vecSL2, slPerm );
|
|
vecSL2 = vec_perm( vecSL2, vecSL3, slPerm );
|
|
vecSL3 = vec_perm( vecSL3, vecSL4, slPerm );
|
|
|
|
vector unsigned char samplesPerm = vec_add( vec_lvsl( -1, &samples[0] ), (vector unsigned char)(1) );
|
|
vector unsigned char mixBufferPerm = vec_add( vec_lvsl( -1, &mixBuffer[0] ), (vector unsigned char)(1) );
|
|
vector float vecSamplesLast = vec_ld( 0, &samples[0] );
|
|
vector float vecDest = vec_ld( 0, &mixBuffer[0] );
|
|
|
|
for( i = 0; i <= MIXBUFFER_SAMPLES - 2; i += 2 ) {
|
|
|
|
//load mix buffer into vectors
|
|
vecMixBuffer1 = vecDest;
|
|
vecMixBuffer2 = vec_ld( 15, &mixBuffer[i*6] );
|
|
vecMixBuffer3 = vec_ld( 31, &mixBuffer[i*6] );
|
|
vector float vecDestEnd = vec_ld( 47, &mixBuffer[i*6] );
|
|
|
|
vecMixBuffer1 = vec_perm( vecMixBuffer1, vecMixBuffer2, mixBufferPerm );
|
|
vecMixBuffer2 = vec_perm( vecMixBuffer2, vecMixBuffer3, mixBufferPerm );
|
|
vecMixBuffer3 = vec_perm( vecMixBuffer3, vecDestEnd, mixBufferPerm );
|
|
|
|
//load samples into vector
|
|
vector float vecSamplesLd2 = vec_ld( 15, &samples[i*2] );
|
|
vecSamplesLd = vec_perm( vecSamplesLast, vecSamplesLd2, samplesPerm );
|
|
vecSamplesLast = vecSamplesLd2;
|
|
|
|
//permute to get them ordered how we want. For the 2nd vector,
|
|
//the order happens to be the same as the order we loaded them
|
|
//in, so there's no need to permute that one
|
|
vecSamples1 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm1 );
|
|
vecSamples2 = vecSamplesLd;
|
|
vecSamples3 = vec_perm( vecSamplesLd, vecSamplesLd, samplePerm3 );
|
|
|
|
//do calculation
|
|
vecMixBuffer1 = vec_madd( vecSamples1, vecSL1, vecMixBuffer1 );
|
|
vecMixBuffer2 = vec_madd( vecSamples2, vecSL2, vecMixBuffer2 );
|
|
vecMixBuffer3 = vec_madd( vecSamples3, vecSL3, vecMixBuffer3 );
|
|
|
|
// store results
|
|
UNALIGNED_STORE3( &mixBuffer[i*6], vecMixBuffer1, vecMixBuffer2, vecMixBuffer3 );
|
|
|
|
// add for next iteration
|
|
vecSL1 = vec_add( vecSL1, vecIncl1 );
|
|
vecSL2 = vec_add( vecSL2, vecIncl2 );
|
|
vecSL3 = vec_add( vecSL3, vecIncl3 );
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
============
|
|
idSIMD_AltiVec::MixedSoundToSamples
|
|
============
|
|
*/
|
|
void VPCALL idSIMD_AltiVec::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
|
|
//this is basically a clamp for sound mixing
|
|
register vector float v0, v1, v2, v3, v4, v5, v6, v7;
|
|
register vector signed int vi0, vi1, vi2, vi3;
|
|
register vector signed short vs0, vs1;
|
|
register vector float minVec, maxVec, constVec;
|
|
int i = 0;
|
|
|
|
//unaligned at start, since samples is not 16-byte aligned
|
|
for ( ; NOT_16BYTE_ALIGNED( samples[i] ) && ( i < numSamples ); i++ ) {
|
|
samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
|
|
}
|
|
|
|
constVec = (vector float)(65536.0f);
|
|
|
|
//splat min/max into a vector
|
|
minVec = (vector float)(-32768.0f);
|
|
maxVec = (vector float)(32767.0f);
|
|
|
|
vector float vecOld = vec_ld( 0, &mixBuffer[i] );
|
|
vector unsigned char permVec = vec_add( vec_lvsl( -1, &mixBuffer[i] ), (vector unsigned char)(1) );
|
|
|
|
//vectorize!
|
|
for ( ; i+15 < numSamples; i += 16 ) {
|
|
//load source
|
|
v0 = vecOld;
|
|
v1 = vec_ld( 15, &mixBuffer[i] );
|
|
v2 = vec_ld( 31, &mixBuffer[i] );
|
|
v3 = vec_ld( 31, &mixBuffer[i] );
|
|
vecOld = vec_ld( 47, &mixBuffer[i] );
|
|
|
|
v0 = vec_perm( v0, v1, permVec );
|
|
v1 = vec_perm( v1, v2, permVec );
|
|
v2 = vec_perm( v2, v3, permVec );
|
|
v3 = vec_perm( v3, vecOld, permVec );
|
|
|
|
//apply minimum
|
|
v4 = vec_max( v0, minVec );
|
|
v5 = vec_max( v1, minVec );
|
|
v6 = vec_max( v2, minVec );
|
|
v7 = vec_max( v3, minVec );
|
|
|
|
//apply maximum
|
|
v4 = vec_min( v4, maxVec );
|
|
v5 = vec_min( v5, maxVec );
|
|
v6 = vec_min( v6, maxVec );
|
|
v7 = vec_min( v7, maxVec );
|
|
|
|
// convert floats to ints
|
|
vi0 = vec_cts( v4, 0 );
|
|
vi1 = vec_cts( v5, 0 );
|
|
vi2 = vec_cts( v6, 0 );
|
|
vi3 = vec_cts( v7, 0 );
|
|
|
|
// pack ints into shorts
|
|
vs0 = vec_pack( vi0, vi1 );
|
|
vs1 = vec_pack( vi2, vi3 );
|
|
ALIGNED_STORE2( &samples[i], vs0, vs1 );
|
|
}
|
|
|
|
//handle cleanup
|
|
for ( ; i < numSamples ; i++ ) {
|
|
samples[i] = mixBuffer[i] <= -32768.0f ? -32768 : mixBuffer[i] >= 32767.0f ? 32767 : (short) mixBuffer[i];
|
|
}
|
|
}
|
|
#endif /* ENABLE_SOUND_ROUTINES */
|
|
|
|
#endif /* MACOS_X */
|