mirror of
https://github.com/id-Software/DOOM-3-BFG.git
synced 2024-11-24 13:01:27 +00:00
Restored generic (non-SIMD) code
This commit is contained in:
parent
be311f42e1
commit
9c37079c16
23 changed files with 3328 additions and 24 deletions
|
@ -193,6 +193,7 @@ Assumes input is in the range [-1, 1]
|
||||||
ID_INLINE void VertexFloatToByte( const float & x, const float & y, const float & z, byte * bval ) {
|
ID_INLINE void VertexFloatToByte( const float & x, const float & y, const float & z, byte * bval ) {
|
||||||
assert_4_byte_aligned( bval ); // for __stvebx
|
assert_4_byte_aligned( bval ); // for __stvebx
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
|
const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||||
const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f };
|
const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||||
|
@ -209,6 +210,13 @@ ID_INLINE void VertexFloatToByte( const float & x, const float & y, const float
|
||||||
bval[1] = (byte)_mm_extract_epi16( xyz16, 1 );
|
bval[1] = (byte)_mm_extract_epi16( xyz16, 1 );
|
||||||
bval[2] = (byte)_mm_extract_epi16( xyz16, 2 );
|
bval[2] = (byte)_mm_extract_epi16( xyz16, 2 );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
bval[0] = VERTEX_FLOAT_TO_BYTE( x );
|
||||||
|
bval[1] = VERTEX_FLOAT_TO_BYTE( y );
|
||||||
|
bval[2] = VERTEX_FLOAT_TO_BYTE( z );
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -609,6 +617,7 @@ ID_INLINE void WriteDrawVerts16( idDrawVert * destVerts, const idDrawVert * loca
|
||||||
assert_16_byte_aligned( destVerts );
|
assert_16_byte_aligned( destVerts );
|
||||||
assert_16_byte_aligned( localVerts );
|
assert_16_byte_aligned( localVerts );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
for ( int i = 0; i < numVerts; i++ ) {
|
for ( int i = 0; i < numVerts; i++ ) {
|
||||||
__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)( localVerts + i ) + 0 ) );
|
__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)( localVerts + i ) + 0 ) );
|
||||||
|
@ -617,6 +626,11 @@ ID_INLINE void WriteDrawVerts16( idDrawVert * destVerts, const idDrawVert * loca
|
||||||
_mm_stream_si128( (__m128i *)( (byte *)( destVerts + i ) + 16 ), v1 );
|
_mm_stream_si128( (__m128i *)( (byte *)( destVerts + i ) + 16 ), v1 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
memcpy( destVerts, localVerts, numVerts * sizeof( idDrawVert ) );
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -29,6 +29,7 @@ If you have questions concerning this license or the applicable additional terms
|
||||||
#ifndef __DRAWVERT_INTRINSICS_H__
|
#ifndef __DRAWVERT_INTRINSICS_H__
|
||||||
#define __DRAWVERT_INTRINSICS_H__
|
#define __DRAWVERT_INTRINSICS_H__
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
|
static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
|
||||||
static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
|
static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
|
||||||
|
@ -50,12 +51,14 @@ static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0
|
||||||
static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
|
static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
|
||||||
static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
|
static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
====================
|
====================
|
||||||
FastF32toF16
|
FastF32toF16
|
||||||
====================
|
====================
|
||||||
*/
|
*/
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
|
ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
|
||||||
__m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift );
|
__m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift );
|
||||||
|
@ -77,6 +80,7 @@ ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits ) {
|
||||||
return _mm_packs_epi32( flt16, flt16 );
|
return _mm_packs_epi32( flt16, flt16 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
|
ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
|
||||||
const int f32_sign_mask = 1U << IEEE_FLT_SIGN_BIT;
|
const int f32_sign_mask = 1U << IEEE_FLT_SIGN_BIT;
|
||||||
|
@ -115,6 +119,7 @@ ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 ) {
|
||||||
LoadSkinnedDrawVertPosition
|
LoadSkinnedDrawVertPosition
|
||||||
====================
|
====================
|
||||||
*/
|
*/
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) {
|
ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, const idJointMat * joints ) {
|
||||||
const idJointMat & j0 = joints[base.color[0]];
|
const idJointMat & j0 = joints[base.color[0]];
|
||||||
|
@ -176,6 +181,7 @@ ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert & base, co
|
||||||
return r0;
|
return r0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) {
|
ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert & vert, const idJointMat * joints ) {
|
||||||
const idJointMat & j0 = joints[vert.color[0]];
|
const idJointMat & j0 = joints[vert.color[0]];
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -44,6 +44,7 @@ const float LCP_DELTA_FORCE_EPSILON = 1e-9f;
|
||||||
#define IGNORE_UNSATISFIABLE_VARIABLES
|
#define IGNORE_UNSATISFIABLE_VARIABLES
|
||||||
|
|
||||||
|
|
||||||
|
#if defined( ID_WIN_X86_SSE_ASM ) || defined( ID_WIN_X86_SSE_INTRIN )
|
||||||
|
|
||||||
ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f };
|
ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||||
ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||||
|
@ -67,6 +68,8 @@ ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 };
|
||||||
ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 };
|
ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 };
|
||||||
ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 };
|
ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 };
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
========================
|
========================
|
||||||
Multiply_SIMD
|
Multiply_SIMD
|
||||||
|
@ -82,6 +85,7 @@ static void Multiply_SIMD( float * dst, const float * src0, const float * src1,
|
||||||
dst[i] = src0[i] * src1[i];
|
dst[i] = src0[i] * src1[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
|
|
||||||
for ( ; i + 4 <= count; i += 4 ) {
|
for ( ; i + 4 <= count; i += 4 ) {
|
||||||
assert_16_byte_aligned( &dst[i] );
|
assert_16_byte_aligned( &dst[i] );
|
||||||
|
@ -94,6 +98,20 @@ static void Multiply_SIMD( float * dst, const float * src0, const float * src1,
|
||||||
_mm_store_ps( dst + i, s0 );
|
_mm_store_ps( dst + i, s0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for ( ; i + 4 <= count; i += 4 ) {
|
||||||
|
assert_16_byte_aligned( &dst[i] );
|
||||||
|
assert_16_byte_aligned( &src0[i] );
|
||||||
|
assert_16_byte_aligned( &src1[i] );
|
||||||
|
|
||||||
|
dst[i+0] = src0[i+0] * src1[i+0];
|
||||||
|
dst[i+1] = src0[i+1] * src1[i+1];
|
||||||
|
dst[i+2] = src0[i+2] * src1[i+2];
|
||||||
|
dst[i+3] = src0[i+3] * src1[i+3];
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
for ( ; i < count; i++ ) {
|
for ( ; i < count; i++ ) {
|
||||||
dst[i] = src0[i] * src1[i];
|
dst[i] = src0[i] * src1[i];
|
||||||
|
@ -115,6 +133,7 @@ static void MultiplyAdd_SIMD( float * dst, const float constant, const float * s
|
||||||
dst[i] += constant * src[i];
|
dst[i] += constant * src[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
|
|
||||||
__m128 c = _mm_load1_ps( & constant );
|
__m128 c = _mm_load1_ps( & constant );
|
||||||
for ( ; i + 4 <= count; i += 4 ) {
|
for ( ; i + 4 <= count; i += 4 ) {
|
||||||
|
@ -127,6 +146,19 @@ static void MultiplyAdd_SIMD( float * dst, const float constant, const float * s
|
||||||
_mm_store_ps( dst + i, s );
|
_mm_store_ps( dst + i, s );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for ( ; i + 4 <= count; i += 4 ) {
|
||||||
|
assert_16_byte_aligned( &src[i] );
|
||||||
|
assert_16_byte_aligned( &dst[i] );
|
||||||
|
|
||||||
|
dst[i+0] += constant * src[i+0];
|
||||||
|
dst[i+1] += constant * src[i+1];
|
||||||
|
dst[i+2] += constant * src[i+2];
|
||||||
|
dst[i+3] += constant * src[i+3];
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
for ( ; i < count; i++ ) {
|
for ( ; i < count; i++ ) {
|
||||||
dst[i] += constant * src[i];
|
dst[i] += constant * src[i];
|
||||||
|
@ -144,7 +176,7 @@ static float DotProduct_SIMD( const float * src0, const float * src1, const int
|
||||||
assert_16_byte_aligned( src0 );
|
assert_16_byte_aligned( src0 );
|
||||||
assert_16_byte_aligned( src1 );
|
assert_16_byte_aligned( src1 );
|
||||||
|
|
||||||
#ifndef _lint
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
|
|
||||||
__m128 sum = (__m128 &) SIMD_SP_zero;
|
__m128 sum = (__m128 &) SIMD_SP_zero;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
@ -266,7 +298,7 @@ static void LowerTriangularSolve_SIMD( const idMatX & L, float * x, const float
|
||||||
|
|
||||||
int i = skip;
|
int i = skip;
|
||||||
|
|
||||||
#ifndef _lint
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
|
|
||||||
// work up to a multiple of 4 rows
|
// work up to a multiple of 4 rows
|
||||||
for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
|
for ( ; ( i & 3 ) != 0 && i < n; i++ ) {
|
||||||
|
@ -520,7 +552,7 @@ static void LowerTriangularSolveTranspose_SIMD( const idMatX & L, float * x, con
|
||||||
const float * lptr = L.ToFloatPtr() + m * nc + m - 4;
|
const float * lptr = L.ToFloatPtr() + m * nc + m - 4;
|
||||||
float * xptr = x + m;
|
float * xptr = x + m;
|
||||||
|
|
||||||
#ifndef _lint
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
// process 4 rows at a time
|
// process 4 rows at a time
|
||||||
for ( int i = m; i >= 4; i -= 4 ) {
|
for ( int i = m; i >= 4; i -= 4 ) {
|
||||||
|
@ -850,7 +882,7 @@ static bool LDLT_Factor_SIMD( idMatX & mat, idVecX & invDiag, const int n ) {
|
||||||
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
|
mptr[j*nc+3] = ( mptr[j*nc+3] - v[0] * mptr[j*nc+0] - v[1] * mptr[j*nc+1] - v[2] * mptr[j*nc+2] ) * d;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef _lint
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
__m128 vzero = _mm_setzero_ps();
|
__m128 vzero = _mm_setzero_ps();
|
||||||
for ( int i = 4; i < n; i += 4 ) {
|
for ( int i = 4; i < n; i += 4 ) {
|
||||||
|
@ -1210,6 +1242,7 @@ static void GetMaxStep_SIMD( const float * f, const float * a, const float * del
|
||||||
const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped,
|
const float * lo, const float * hi, const int * side, int numUnbounded, int numClamped,
|
||||||
int d, float dir, float & maxStep, int & limit, int & limitSide ) {
|
int d, float dir, float & maxStep, int & limit, int & limitSide ) {
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
__m128 vMaxStep;
|
__m128 vMaxStep;
|
||||||
__m128i vLimit;
|
__m128i vLimit;
|
||||||
|
@ -1332,6 +1365,65 @@ static void GetMaxStep_SIMD( const float * f, const float * a, const float * del
|
||||||
_mm_store_ss( & maxStep, vMaxStep );
|
_mm_store_ss( & maxStep, vMaxStep );
|
||||||
limit = _mm_cvtsi128_si32( vLimit );
|
limit = _mm_cvtsi128_si32( vLimit );
|
||||||
limitSide = _mm_cvtsi128_si32( vLimitSide );
|
limitSide = _mm_cvtsi128_si32( vLimitSide );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
// default to a full step for the current variable
|
||||||
|
{
|
||||||
|
float negAccel = -a[d];
|
||||||
|
float deltaAccel = delta_a[d];
|
||||||
|
int m0 = ( fabs( deltaAccel ) > LCP_DELTA_ACCEL_EPSILON );
|
||||||
|
float step = negAccel / ( m0 ? deltaAccel : 1.0f );
|
||||||
|
maxStep = m0 ? step : 0.0f;
|
||||||
|
limit = d;
|
||||||
|
limitSide = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// test the current variable
|
||||||
|
{
|
||||||
|
float deltaForce = dir;
|
||||||
|
float forceLimit = ( deltaForce < 0.0f ) ? lo[d] : hi[d];
|
||||||
|
float step = ( forceLimit - f[d] ) / deltaForce;
|
||||||
|
int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
|
||||||
|
int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
|
||||||
|
int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
|
||||||
|
int m2 = ( step < maxStep );
|
||||||
|
int m3 = ( m0 & m1 & m2 );
|
||||||
|
maxStep = m3 ? step : maxStep;
|
||||||
|
limit = m3 ? d : limit;
|
||||||
|
limitSide = m3 ? setSide : limitSide;
|
||||||
|
}
|
||||||
|
|
||||||
|
// test the clamped bounded variables
|
||||||
|
for ( int i = numUnbounded; i < numClamped; i++ ) {
|
||||||
|
float deltaForce = delta_f[i];
|
||||||
|
float forceLimit = ( deltaForce < 0.0f ) ? lo[i] : hi[i];
|
||||||
|
int m0 = ( fabs( deltaForce ) > LCP_DELTA_FORCE_EPSILON );
|
||||||
|
float step = ( forceLimit - f[i] ) / ( m0 ? deltaForce : 1.0f );
|
||||||
|
int setSide = ( deltaForce < 0.0f ) ? -1 : 1;
|
||||||
|
int m1 = ( fabs( forceLimit ) != idMath::INFINITY );
|
||||||
|
int m2 = ( step < maxStep );
|
||||||
|
int m3 = ( m0 & m1 & m2 );
|
||||||
|
maxStep = m3 ? step : maxStep;
|
||||||
|
limit = m3 ? i : limit;
|
||||||
|
limitSide = m3 ? setSide : limitSide;
|
||||||
|
}
|
||||||
|
|
||||||
|
// test the not clamped bounded variables
|
||||||
|
for ( int i = numClamped; i < d; i++ ) {
|
||||||
|
float negAccel = -a[i];
|
||||||
|
float deltaAccel = delta_a[i];
|
||||||
|
int m0 = ( side[i] * deltaAccel > LCP_DELTA_ACCEL_EPSILON );
|
||||||
|
float step = negAccel / ( m0 ? deltaAccel : 1.0f );
|
||||||
|
int m1 = ( lo[i] < -LCP_BOUND_EPSILON || hi[i] > LCP_BOUND_EPSILON );
|
||||||
|
int m2 = ( step < maxStep );
|
||||||
|
int m3 = ( m0 & m1 & m2 );
|
||||||
|
maxStep = m3 ? step : maxStep;
|
||||||
|
limit = m3 ? i : limit;
|
||||||
|
limitSide = m3 ? 0 : limitSide;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -171,6 +171,7 @@ void idMatX::CopyLowerToUpperTriangle() {
|
||||||
assert( ( GetNumColumns() & 3 ) == 0 );
|
assert( ( GetNumColumns() & 3 ) == 0 );
|
||||||
assert( GetNumColumns() >= GetNumRows() );
|
assert( GetNumColumns() >= GetNumRows() );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
|
|
||||||
const int n = GetNumColumns();
|
const int n = GetNumColumns();
|
||||||
const int m = GetNumRows();
|
const int m = GetNumRows();
|
||||||
|
@ -307,6 +308,20 @@ void idMatX::CopyLowerToUpperTriangle() {
|
||||||
_mm_store_ps( basePtr + n0, r0 );
|
_mm_store_ps( basePtr + n0, r0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
const int n = GetNumColumns();
|
||||||
|
const int m = GetNumRows();
|
||||||
|
for ( int i = 0; i < m; i++ ) {
|
||||||
|
const float * __restrict ptr = ToFloatPtr() + ( i + 1 ) * n + i;
|
||||||
|
float * __restrict dstPtr = ToFloatPtr() + i * n;
|
||||||
|
for ( int j = i + 1; j < m; j++ ) {
|
||||||
|
dstPtr[j] = ptr[0];
|
||||||
|
ptr += n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _DEBUG
|
#ifdef _DEBUG
|
||||||
for ( int i = 0; i < numRows; i++ ) {
|
for ( int i = 0; i < numRows; i++ ) {
|
||||||
|
|
|
@ -389,7 +389,7 @@ idMatX::operator=
|
||||||
ID_INLINE idMatX &idMatX::operator=( const idMatX &a ) {
|
ID_INLINE idMatX &idMatX::operator=( const idMatX &a ) {
|
||||||
SetSize( a.numRows, a.numColumns );
|
SetSize( a.numRows, a.numColumns );
|
||||||
int s = a.numRows * a.numColumns;
|
int s = a.numRows * a.numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( mat + i, _mm_load_ps( a.mat + i ) );
|
_mm_store_ps( mat + i, _mm_load_ps( a.mat + i ) );
|
||||||
}
|
}
|
||||||
|
@ -410,7 +410,7 @@ ID_INLINE idMatX idMatX::operator*( const float a ) const {
|
||||||
|
|
||||||
m.SetTempSize( numRows, numColumns );
|
m.SetTempSize( numRows, numColumns );
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
__m128 va = _mm_load1_ps( & a );
|
__m128 va = _mm_load1_ps( & a );
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( m.mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) );
|
_mm_store_ps( m.mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) );
|
||||||
|
@ -462,7 +462,7 @@ ID_INLINE idMatX idMatX::operator+( const idMatX &a ) const {
|
||||||
assert( numRows == a.numRows && numColumns == a.numColumns );
|
assert( numRows == a.numRows && numColumns == a.numColumns );
|
||||||
m.SetTempSize( numRows, numColumns );
|
m.SetTempSize( numRows, numColumns );
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( m.mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
_mm_store_ps( m.mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -485,7 +485,7 @@ ID_INLINE idMatX idMatX::operator-( const idMatX &a ) const {
|
||||||
assert( numRows == a.numRows && numColumns == a.numColumns );
|
assert( numRows == a.numRows && numColumns == a.numColumns );
|
||||||
m.SetTempSize( numRows, numColumns );
|
m.SetTempSize( numRows, numColumns );
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( m.mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
_mm_store_ps( m.mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -504,7 +504,7 @@ idMatX::operator*=
|
||||||
*/
|
*/
|
||||||
ID_INLINE idMatX &idMatX::operator*=( const float a ) {
|
ID_INLINE idMatX &idMatX::operator*=( const float a ) {
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
__m128 va = _mm_load1_ps( & a );
|
__m128 va = _mm_load1_ps( & a );
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) );
|
_mm_store_ps( mat + i, _mm_mul_ps( _mm_load_ps( mat + i ), va ) );
|
||||||
|
@ -537,7 +537,7 @@ idMatX::operator+=
|
||||||
ID_INLINE idMatX &idMatX::operator+=( const idMatX &a ) {
|
ID_INLINE idMatX &idMatX::operator+=( const idMatX &a ) {
|
||||||
assert( numRows == a.numRows && numColumns == a.numColumns );
|
assert( numRows == a.numRows && numColumns == a.numColumns );
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
_mm_store_ps( mat + i, _mm_add_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -558,7 +558,7 @@ idMatX::operator-=
|
||||||
ID_INLINE idMatX &idMatX::operator-=( const idMatX &a ) {
|
ID_INLINE idMatX &idMatX::operator-=( const idMatX &a ) {
|
||||||
assert( numRows == a.numRows && numColumns == a.numColumns );
|
assert( numRows == a.numRows && numColumns == a.numColumns );
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
_mm_store_ps( mat + i, _mm_sub_ps( _mm_load_ps( mat + i ), _mm_load_ps( a.mat + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -744,7 +744,7 @@ idMatX::Zero
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idMatX::Zero() {
|
ID_INLINE void idMatX::Zero() {
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( mat + i, _mm_setzero_ps() );
|
_mm_store_ps( mat + i, _mm_setzero_ps() );
|
||||||
}
|
}
|
||||||
|
@ -838,7 +838,7 @@ idMatX::Negate
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idMatX::Negate() {
|
ID_INLINE void idMatX::Negate() {
|
||||||
int s = numRows * numColumns;
|
int s = numRows * numColumns;
|
||||||
#ifdef MATX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(MATX_SIMD)
|
||||||
ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
|
ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
|
||||||
for ( int i = 0; i < s; i += 4 ) {
|
for ( int i = 0; i < s; i += 4 ) {
|
||||||
_mm_store_ps( mat + i, _mm_xor_ps( _mm_load_ps( mat + i ), (__m128 &) signBit[0] ) );
|
_mm_store_ps( mat + i, _mm_xor_ps( _mm_load_ps( mat + i ), (__m128 &) signBit[0] ) );
|
||||||
|
|
|
@ -51,6 +51,7 @@ const float idMath::INFINITY = 1e30f;
|
||||||
const float idMath::FLT_EPSILON = 1.192092896e-07f;
|
const float idMath::FLT_EPSILON = 1.192092896e-07f;
|
||||||
const float idMath::FLT_SMALLEST_NON_DENORMAL = * reinterpret_cast< const float * >( & SMALLEST_NON_DENORMAL ); // 1.1754944e-038f
|
const float idMath::FLT_SMALLEST_NON_DENORMAL = * reinterpret_cast< const float * >( & SMALLEST_NON_DENORMAL ); // 1.1754944e-038f
|
||||||
|
|
||||||
|
#if defined( ID_WIN_X86_SSE_INTRIN )
|
||||||
const __m128 idMath::SIMD_SP_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
|
const __m128 idMath::SIMD_SP_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||||
const __m128 idMath::SIMD_SP_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
|
const __m128 idMath::SIMD_SP_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
|
||||||
const __m128 idMath::SIMD_SP_min_char = { -128.0f, -128.0f, -128.0f, -128.0f };
|
const __m128 idMath::SIMD_SP_min_char = { -128.0f, -128.0f, -128.0f, -128.0f };
|
||||||
|
@ -61,6 +62,7 @@ const __m128 idMath::SIMD_SP_smallestNonDenorm = { FLT_SMALLEST_NON_DENORMAL, FL
|
||||||
const __m128 idMath::SIMD_SP_tiny = { 1e-4f, 1e-4f, 1e-4f, 1e-4f };
|
const __m128 idMath::SIMD_SP_tiny = { 1e-4f, 1e-4f, 1e-4f, 1e-4f };
|
||||||
const __m128 idMath::SIMD_SP_rsqrt_c0 = { 3.0f, 3.0f, 3.0f, 3.0f };
|
const __m128 idMath::SIMD_SP_rsqrt_c0 = { 3.0f, 3.0f, 3.0f, 3.0f };
|
||||||
const __m128 idMath::SIMD_SP_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f };
|
const __m128 idMath::SIMD_SP_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f };
|
||||||
|
#endif
|
||||||
|
|
||||||
bool idMath::initialized = false;
|
bool idMath::initialized = false;
|
||||||
dword idMath::iSqrt[SQRT_TABLE_SIZE]; // inverse square root lookup table
|
dword idMath::iSqrt[SQRT_TABLE_SIZE]; // inverse square root lookup table
|
||||||
|
|
|
@ -419,6 +419,7 @@ public:
|
||||||
static const float FLT_EPSILON; // smallest positive number such that 1.0+FLT_EPSILON != 1.0
|
static const float FLT_EPSILON; // smallest positive number such that 1.0+FLT_EPSILON != 1.0
|
||||||
static const float FLT_SMALLEST_NON_DENORMAL; // smallest non-denormal 32-bit floating point value
|
static const float FLT_SMALLEST_NON_DENORMAL; // smallest non-denormal 32-bit floating point value
|
||||||
|
|
||||||
|
#if defined( ID_WIN_X86_SSE_INTRIN )
|
||||||
static const __m128 SIMD_SP_zero;
|
static const __m128 SIMD_SP_zero;
|
||||||
static const __m128 SIMD_SP_255;
|
static const __m128 SIMD_SP_255;
|
||||||
static const __m128 SIMD_SP_min_char;
|
static const __m128 SIMD_SP_min_char;
|
||||||
|
@ -429,6 +430,7 @@ public:
|
||||||
static const __m128 SIMD_SP_tiny;
|
static const __m128 SIMD_SP_tiny;
|
||||||
static const __m128 SIMD_SP_rsqrt_c0;
|
static const __m128 SIMD_SP_rsqrt_c0;
|
||||||
static const __m128 SIMD_SP_rsqrt_c1;
|
static const __m128 SIMD_SP_rsqrt_c1;
|
||||||
|
#endif
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum {
|
enum {
|
||||||
|
@ -460,9 +462,15 @@ idMath::InvSqrt
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE float idMath::InvSqrt( float x ) {
|
ID_INLINE float idMath::InvSqrt( float x ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
|
|
||||||
return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
|
return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -471,9 +479,15 @@ idMath::InvSqrt16
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE float idMath::InvSqrt16( float x ) {
|
ID_INLINE float idMath::InvSqrt16( float x ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
|
|
||||||
return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
|
return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
return ( x > FLT_SMALLEST_NON_DENORMAL ) ? sqrtf( 1.0f / x ) : INFINITY;
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -482,7 +496,11 @@ idMath::Sqrt
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE float idMath::Sqrt( float x ) {
|
ID_INLINE float idMath::Sqrt( float x ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
return ( x >= 0.0f ) ? x * InvSqrt( x ) : 0.0f;
|
return ( x >= 0.0f ) ? x * InvSqrt( x ) : 0.0f;
|
||||||
|
#else
|
||||||
|
return ( x >= 0.0f ) ? sqrtf( x ) : 0.0f;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -491,7 +509,11 @@ idMath::Sqrt16
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE float idMath::Sqrt16( float x ) {
|
ID_INLINE float idMath::Sqrt16( float x ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
return ( x >= 0.0f ) ? x * InvSqrt16( x ) : 0.0f;
|
return ( x >= 0.0f ) ? x * InvSqrt16( x ) : 0.0f;
|
||||||
|
#else
|
||||||
|
return ( x >= 0.0f ) ? sqrtf( x ) : 0.0f;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -601,6 +623,7 @@ idMath::SinCos
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idMath::SinCos( float a, float &s, float &c ) {
|
ID_INLINE void idMath::SinCos( float a, float &s, float &c ) {
|
||||||
|
#if defined( ID_WIN_X86_ASM )
|
||||||
_asm {
|
_asm {
|
||||||
fld a
|
fld a
|
||||||
fsincos
|
fsincos
|
||||||
|
@ -609,6 +632,10 @@ ID_INLINE void idMath::SinCos( float a, float &s, float &c ) {
|
||||||
fstp dword ptr [ecx]
|
fstp dword ptr [ecx]
|
||||||
fstp dword ptr [edx]
|
fstp dword ptr [edx]
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
s = sinf( a );
|
||||||
|
c = cosf( a );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1128,11 +1155,24 @@ idMath::Ftoi
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE int idMath::Ftoi( float f ) {
|
ID_INLINE int idMath::Ftoi( float f ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
// If a converted result is larger than the maximum signed doubleword integer,
|
// If a converted result is larger than the maximum signed doubleword integer,
|
||||||
// the floating-point invalid exception is raised, and if this exception is masked,
|
// the floating-point invalid exception is raised, and if this exception is masked,
|
||||||
// the indefinite integer value (80000000H) is returned.
|
// the indefinite integer value (80000000H) is returned.
|
||||||
__m128 x = _mm_load_ss( &f );
|
__m128 x = _mm_load_ss( &f );
|
||||||
return _mm_cvttss_si32( x );
|
return _mm_cvttss_si32( x );
|
||||||
|
#elif 0 // round chop (C/C++ standard)
|
||||||
|
int i, s, e, m, shift;
|
||||||
|
i = *reinterpret_cast<int *>(&f);
|
||||||
|
s = i >> IEEE_FLT_SIGN_BIT;
|
||||||
|
e = ( ( i >> IEEE_FLT_MANTISSA_BITS ) & ( ( 1 << IEEE_FLT_EXPONENT_BITS ) - 1 ) ) - IEEE_FLT_EXPONENT_BIAS;
|
||||||
|
m = ( i & ( ( 1 << IEEE_FLT_MANTISSA_BITS ) - 1 ) ) | ( 1 << IEEE_FLT_MANTISSA_BITS );
|
||||||
|
shift = e - IEEE_FLT_MANTISSA_BITS;
|
||||||
|
return ( ( ( ( m >> -shift ) | ( m << shift ) ) & ~( e >> INT32_SIGN_BIT ) ) ^ s ) - s;
|
||||||
|
#else
|
||||||
|
// If a converted result is larger than the maximum signed doubleword integer the result is undefined.
|
||||||
|
return C_FLOAT_TO_INT( f );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1141,10 +1181,21 @@ idMath::Ftoi8
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE char idMath::Ftoi8( float f ) {
|
ID_INLINE char idMath::Ftoi8( float f ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
__m128 x = _mm_load_ss( &f );
|
__m128 x = _mm_load_ss( &f );
|
||||||
x = _mm_max_ss( x, SIMD_SP_min_char );
|
x = _mm_max_ss( x, SIMD_SP_min_char );
|
||||||
x = _mm_min_ss( x, SIMD_SP_max_char );
|
x = _mm_min_ss( x, SIMD_SP_max_char );
|
||||||
return static_cast<char>( _mm_cvttss_si32( x ) );
|
return static_cast<char>( _mm_cvttss_si32( x ) );
|
||||||
|
#else
|
||||||
|
// The converted result is clamped to the range [-128,127].
|
||||||
|
int i = C_FLOAT_TO_INT( f );
|
||||||
|
if ( i < -128 ) {
|
||||||
|
return -128;
|
||||||
|
} else if ( i > 127 ) {
|
||||||
|
return 127;
|
||||||
|
}
|
||||||
|
return static_cast<char>( i );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1153,10 +1204,21 @@ idMath::Ftoi16
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE short idMath::Ftoi16( float f ) {
|
ID_INLINE short idMath::Ftoi16( float f ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
__m128 x = _mm_load_ss( &f );
|
__m128 x = _mm_load_ss( &f );
|
||||||
x = _mm_max_ss( x, SIMD_SP_min_short );
|
x = _mm_max_ss( x, SIMD_SP_min_short );
|
||||||
x = _mm_min_ss( x, SIMD_SP_max_short );
|
x = _mm_min_ss( x, SIMD_SP_max_short );
|
||||||
return static_cast<short>( _mm_cvttss_si32( x ) );
|
return static_cast<short>( _mm_cvttss_si32( x ) );
|
||||||
|
#else
|
||||||
|
// The converted result is clamped to the range [-32768,32767].
|
||||||
|
int i = C_FLOAT_TO_INT( f );
|
||||||
|
if ( i < -32768 ) {
|
||||||
|
return -32768;
|
||||||
|
} else if ( i > 32767 ) {
|
||||||
|
return 32767;
|
||||||
|
}
|
||||||
|
return static_cast<short>( i );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1183,12 +1245,23 @@ idMath::Ftob
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE byte idMath::Ftob( float f ) {
|
ID_INLINE byte idMath::Ftob( float f ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE_INTRIN
|
||||||
// If a converted result is negative the value (0) is returned and if the
|
// If a converted result is negative the value (0) is returned and if the
|
||||||
// converted result is larger than the maximum byte the value (255) is returned.
|
// converted result is larger than the maximum byte the value (255) is returned.
|
||||||
__m128 x = _mm_load_ss( &f );
|
__m128 x = _mm_load_ss( &f );
|
||||||
x = _mm_max_ss( x, SIMD_SP_zero );
|
x = _mm_max_ss( x, SIMD_SP_zero );
|
||||||
x = _mm_min_ss( x, SIMD_SP_255 );
|
x = _mm_min_ss( x, SIMD_SP_255 );
|
||||||
return static_cast<byte>( _mm_cvttss_si32( x ) );
|
return static_cast<byte>( _mm_cvttss_si32( x ) );
|
||||||
|
#else
|
||||||
|
// The converted result is clamped to the range [0,255].
|
||||||
|
int i = C_FLOAT_TO_INT( f );
|
||||||
|
if ( i < 0 ) {
|
||||||
|
return 0;
|
||||||
|
} else if ( i > 255 ) {
|
||||||
|
return 255;
|
||||||
|
}
|
||||||
|
return static_cast<byte>( i );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -213,7 +213,7 @@ ID_INLINE idVecX idVecX::operator-() const {
|
||||||
idVecX m;
|
idVecX m;
|
||||||
|
|
||||||
m.SetTempSize( size );
|
m.SetTempSize( size );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
ALIGN16( unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
|
ALIGN16( unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( m.p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) );
|
_mm_store_ps( m.p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) );
|
||||||
|
@ -233,7 +233,7 @@ idVecX::operator=
|
||||||
*/
|
*/
|
||||||
ID_INLINE idVecX &idVecX::operator=( const idVecX &a ) {
|
ID_INLINE idVecX &idVecX::operator=( const idVecX &a ) {
|
||||||
SetSize( a.size );
|
SetSize( a.size );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
for ( int i = 0; i < a.size; i += 4 ) {
|
for ( int i = 0; i < a.size; i += 4 ) {
|
||||||
_mm_store_ps( p + i, _mm_load_ps( a.p + i ) );
|
_mm_store_ps( p + i, _mm_load_ps( a.p + i ) );
|
||||||
}
|
}
|
||||||
|
@ -254,7 +254,7 @@ ID_INLINE idVecX idVecX::operator+( const idVecX &a ) const {
|
||||||
|
|
||||||
assert( size == a.size );
|
assert( size == a.size );
|
||||||
m.SetTempSize( size );
|
m.SetTempSize( size );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( m.p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
_mm_store_ps( m.p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -276,7 +276,7 @@ ID_INLINE idVecX idVecX::operator-( const idVecX &a ) const {
|
||||||
|
|
||||||
assert( size == a.size );
|
assert( size == a.size );
|
||||||
m.SetTempSize( size );
|
m.SetTempSize( size );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( m.p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
_mm_store_ps( m.p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -295,7 +295,7 @@ idVecX::operator+=
|
||||||
*/
|
*/
|
||||||
ID_INLINE idVecX &idVecX::operator+=( const idVecX &a ) {
|
ID_INLINE idVecX &idVecX::operator+=( const idVecX &a ) {
|
||||||
assert( size == a.size );
|
assert( size == a.size );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
_mm_store_ps( p + i, _mm_add_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -315,7 +315,7 @@ idVecX::operator-=
|
||||||
*/
|
*/
|
||||||
ID_INLINE idVecX &idVecX::operator-=( const idVecX &a ) {
|
ID_INLINE idVecX &idVecX::operator-=( const idVecX &a ) {
|
||||||
assert( size == a.size );
|
assert( size == a.size );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
_mm_store_ps( p + i, _mm_sub_ps( _mm_load_ps( p + i ), _mm_load_ps( a.p + i ) ) );
|
||||||
}
|
}
|
||||||
|
@ -337,7 +337,7 @@ ID_INLINE idVecX idVecX::operator*( const float a ) const {
|
||||||
idVecX m;
|
idVecX m;
|
||||||
|
|
||||||
m.SetTempSize( size );
|
m.SetTempSize( size );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
__m128 va = _mm_load1_ps( & a );
|
__m128 va = _mm_load1_ps( & a );
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( m.p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) );
|
_mm_store_ps( m.p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) );
|
||||||
|
@ -356,7 +356,7 @@ idVecX::operator*=
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE idVecX &idVecX::operator*=( const float a ) {
|
ID_INLINE idVecX &idVecX::operator*=( const float a ) {
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
__m128 va = _mm_load1_ps( & a );
|
__m128 va = _mm_load1_ps( & a );
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) );
|
_mm_store_ps( p + i, _mm_mul_ps( _mm_load_ps( p + i ), va ) );
|
||||||
|
@ -551,7 +551,7 @@ idVecX::Zero
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idVecX::Zero() {
|
ID_INLINE void idVecX::Zero() {
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( p + i, _mm_setzero_ps() );
|
_mm_store_ps( p + i, _mm_setzero_ps() );
|
||||||
}
|
}
|
||||||
|
@ -567,7 +567,7 @@ idVecX::Zero
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idVecX::Zero( int length ) {
|
ID_INLINE void idVecX::Zero( int length ) {
|
||||||
SetSize( length );
|
SetSize( length );
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
for ( int i = 0; i < length; i += 4 ) {
|
for ( int i = 0; i < length; i += 4 ) {
|
||||||
_mm_store_ps( p + i, _mm_setzero_ps() );
|
_mm_store_ps( p + i, _mm_setzero_ps() );
|
||||||
}
|
}
|
||||||
|
@ -611,7 +611,7 @@ idVecX::Negate
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idVecX::Negate() {
|
ID_INLINE void idVecX::Negate() {
|
||||||
#ifdef VECX_SIMD
|
#if defined(ID_WIN_X86_SSE_INTRIN) && defined(VECX_SIMD)
|
||||||
ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
|
ALIGN16( const unsigned int signBit[4] ) = { IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK, IEEE_FLT_SIGN_MASK };
|
||||||
for ( int i = 0; i < size; i += 4 ) {
|
for ( int i = 0; i < size; i += 4 ) {
|
||||||
_mm_store_ps( p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) );
|
_mm_store_ps( p + i, _mm_xor_ps( _mm_load_ps( p + i ), (__m128 &) signBit[0] ) );
|
||||||
|
|
|
@ -451,6 +451,10 @@ ID_INLINE idVec3 operator*( const float a, const idVec3 b ) {
|
||||||
return idVec3( b.x * a, b.y * a, b.z * a );
|
return idVec3( b.x * a, b.y * a, b.z * a );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ID_INLINE idVec3 operator/( const float a, const idVec3 b ) {
|
||||||
|
return idVec3( a / b.x, a / b.y, a / b.z );
|
||||||
|
}
|
||||||
|
|
||||||
ID_INLINE idVec3 idVec3::operator+( const idVec3 &a ) const {
|
ID_INLINE idVec3 idVec3::operator+( const idVec3 &a ) const {
|
||||||
return idVec3( x + a.x, y + a.y, z + a.z );
|
return idVec3( x + a.x, y + a.y, z + a.z );
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,58 @@ If you have questions concerning this license or the applicable additional terms
|
||||||
#ifndef SYS_DEFINES_H
|
#ifndef SYS_DEFINES_H
|
||||||
#define SYS_DEFINES_H
|
#define SYS_DEFINES_H
|
||||||
|
|
||||||
|
/*
|
||||||
|
================================================================================================
|
||||||
|
|
||||||
|
Platform Specific ID_ Defines
|
||||||
|
|
||||||
|
The ID_ defines are the only platform defines we should be using.
|
||||||
|
|
||||||
|
================================================================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
#undef ID_PC
|
||||||
|
#undef ID_PC_WIN
|
||||||
|
#undef ID_PC_WIN64
|
||||||
|
#undef ID_CONSOLE
|
||||||
|
#undef ID_WIN32
|
||||||
|
#undef ID_LITTLE_ENDIAN
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
// _WIN32 always defined
|
||||||
|
// _WIN64 also defined for x64 target
|
||||||
|
/*
|
||||||
|
#if !defined( _MANAGED )
|
||||||
|
#if !defined( _WIN64 )
|
||||||
|
#define ID_WIN_X86_ASM
|
||||||
|
#define ID_WIN_X86_MMX_ASM
|
||||||
|
#define ID_WIN_X86_MMX_INTRIN
|
||||||
|
#define ID_WIN_X86_SSE_ASM
|
||||||
|
#define ID_WIN_X86_SSE_INTRIN
|
||||||
|
#define ID_WIN_X86_SSE2_ASM
|
||||||
|
#define ID_WIN_X86_SSE2_INTRIN
|
||||||
|
// the 32 bit build is now as close to the console builds as possible
|
||||||
|
#define ID_CONSOLE
|
||||||
|
#else
|
||||||
|
#define ID_PC_WIN64
|
||||||
|
#define ID_WIN_X86_MMX_INTRIN
|
||||||
|
#define ID_WIN_X86_SSE_INTRIN
|
||||||
|
#define ID_WIN_X86_SSE2_INTRIN
|
||||||
|
#define ID_WIN_X86_SSE3_INTRIN
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define ID_PC
|
||||||
|
#define ID_PC_WIN
|
||||||
|
#define ID_WIN32
|
||||||
|
#define ID_LITTLE_ENDIAN
|
||||||
|
#else
|
||||||
|
#error Unknown Platform
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define ID_OPENGL
|
||||||
|
|
||||||
/*
|
/*
|
||||||
================================================================================================
|
================================================================================================
|
||||||
|
|
||||||
|
@ -36,6 +88,7 @@ If you have questions concerning this license or the applicable additional terms
|
||||||
================================================================================================
|
================================================================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifdef ID_PC_WIN
|
||||||
|
|
||||||
#define CPUSTRING "x86"
|
#define CPUSTRING "x86"
|
||||||
|
|
||||||
|
@ -69,6 +122,8 @@ If you have questions concerning this license or the applicable additional terms
|
||||||
#define WIN32
|
#define WIN32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
================================================================================================
|
================================================================================================
|
||||||
|
|
||||||
|
@ -108,6 +163,8 @@ bulk of the codebase, so it is the best place for analyze pragmas.
|
||||||
================================================================================================
|
================================================================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#if defined( ID_WIN32 )
|
||||||
|
|
||||||
// disable some /analyze warnings here
|
// disable some /analyze warnings here
|
||||||
#pragma warning( disable: 6255 ) // warning C6255: _alloca indicates failure by raising a stack overflow exception. Consider using _malloca instead. (Note: _malloca requires _freea.)
|
#pragma warning( disable: 6255 ) // warning C6255: _alloca indicates failure by raising a stack overflow exception. Consider using _malloca instead. (Note: _malloca requires _freea.)
|
||||||
#pragma warning( disable: 6262 ) // warning C6262: Function uses '36924' bytes of stack: exceeds /analyze:stacksize'32768'. Consider moving some data to heap
|
#pragma warning( disable: 6262 ) // warning C6262: Function uses '36924' bytes of stack: exceeds /analyze:stacksize'32768'. Consider moving some data to heap
|
||||||
|
@ -135,6 +192,7 @@ bulk of the codebase, so it is the best place for analyze pragmas.
|
||||||
// guaranteed to be false in the following code
|
// guaranteed to be false in the following code
|
||||||
#define NO_RETURN __declspec(noreturn)
|
#define NO_RETURN __declspec(noreturn)
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
// I don't want to disable "warning C6031: Return value ignored" from /analyze
|
// I don't want to disable "warning C6031: Return value ignored" from /analyze
|
||||||
// but there are several cases with sprintf where we pre-initialized the variables
|
// but there are several cases with sprintf where we pre-initialized the variables
|
||||||
|
|
|
@ -56,6 +56,8 @@ ID_INLINE_EXTERN float __frndz( float x ) { return (float)( (int)( x ) ); }
|
||||||
================================================================================================
|
================================================================================================
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
// The code below assumes that a cache line is 64 bytes.
|
// The code below assumes that a cache line is 64 bytes.
|
||||||
// We specify the cache line size as 128 here to make the code consistent with the consoles.
|
// We specify the cache line size as 128 here to make the code consistent with the consoles.
|
||||||
#define CACHE_LINE_SIZE 128
|
#define CACHE_LINE_SIZE 128
|
||||||
|
@ -84,6 +86,24 @@ ID_FORCE_INLINE void FlushCacheLine( const void * ptr, int offset ) {
|
||||||
_mm_clflush( bytePtr + 64 );
|
_mm_clflush( bytePtr + 64 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
================================================
|
||||||
|
Other
|
||||||
|
================================================
|
||||||
|
*/
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define CACHE_LINE_SIZE 128
|
||||||
|
|
||||||
|
ID_INLINE void Prefetch( const void * ptr, int offset ) {}
|
||||||
|
ID_INLINE void ZeroCacheLine( void * ptr, int offset ) {
|
||||||
|
byte * bytePtr = (byte *)( ( ( (UINT_PTR) ( ptr ) ) + ( offset ) ) & ~( CACHE_LINE_SIZE - 1 ) );
|
||||||
|
memset( bytePtr, 0, CACHE_LINE_SIZE );
|
||||||
|
}
|
||||||
|
ID_INLINE void FlushCacheLine( const void * ptr, int offset ) {}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
================================================
|
================================================
|
||||||
Block Clear Macros
|
Block Clear Macros
|
||||||
|
|
|
@ -72,6 +72,7 @@ void UnbindBufferObjects() {
|
||||||
qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
|
qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
|
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
|
||||||
assert_16_byte_aligned( dst );
|
assert_16_byte_aligned( dst );
|
||||||
|
@ -109,6 +110,15 @@ void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
|
||||||
_mm_sfence();
|
_mm_sfence();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
|
||||||
|
assert_16_byte_aligned( dst );
|
||||||
|
assert_16_byte_aligned( src );
|
||||||
|
memcpy( dst, src, numBytes );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
================================================================================================
|
================================================================================================
|
||||||
|
|
|
@ -258,7 +258,11 @@ idDxtEncoder::CompressImageDXT1Fast
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height );
|
CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height );
|
||||||
|
#else
|
||||||
|
CompressImageDXT1Fast_Generic( inBuf, outBuf, width, height );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -267,7 +271,11 @@ idDxtEncoder::CompressImageDXT1AlphaFast
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height );
|
CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height );
|
||||||
|
#else
|
||||||
|
CompressImageDXT1AlphaFast_Generic( inBuf, outBuf, width, height );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -276,7 +284,11 @@ idDxtEncoder::CompressImageDXT5Fast
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height );
|
CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height );
|
||||||
|
#else
|
||||||
|
CompressImageDXT5Fast_Generic( inBuf, outBuf, width, height );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -294,7 +306,11 @@ idDxtEncoder::CompressYCoCgDXT5Fast
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height );
|
CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height );
|
||||||
|
#else
|
||||||
|
CompressYCoCgDXT5Fast_Generic( inBuf, outBuf, width, height );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -312,7 +328,11 @@ idDxtEncoder::CompressNormalMapDXT5Fast
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte *inBuf, byte *outBuf, int width, int height ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height );
|
CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height );
|
||||||
|
#else
|
||||||
|
CompressNormalMapDXT5Fast_Generic( inBuf, outBuf, width, height );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -52,6 +52,7 @@ idDxtEncoder::NV4XHardwareBugFix
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const {
|
void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const {
|
||||||
|
#ifdef ID_WIN_X86_ASM
|
||||||
int minq = ( ( minColor[0] << 16 ) | ( minColor[1] << 8 ) | minColor[2] ) & 0x00F8FCF8;
|
int minq = ( ( minColor[0] << 16 ) | ( minColor[1] << 8 ) | minColor[2] ) & 0x00F8FCF8;
|
||||||
int maxq = ( ( maxColor[0] << 16 ) | ( maxColor[1] << 8 ) | maxColor[2] ) & 0x00F8FCF8;
|
int maxq = ( ( maxColor[0] << 16 ) | ( maxColor[1] << 8 ) | maxColor[2] ) & 0x00F8FCF8;
|
||||||
int mask = -( minq > maxq ) & 0x00FFFFFF;
|
int mask = -( minq > maxq ) & 0x00FFFFFF;
|
||||||
|
@ -62,6 +63,13 @@ void idDxtEncoder::NV4XHardwareBugFix( byte *minColor, byte *maxColor ) const {
|
||||||
min ^= max;
|
min ^= max;
|
||||||
*(int *)minColor = min;
|
*(int *)minColor = min;
|
||||||
*(int *)maxColor = max;
|
*(int *)maxColor = max;
|
||||||
|
#else
|
||||||
|
if ( ColorTo565( minColor ) > ColorTo565( maxColor ) ) {
|
||||||
|
SwapValues( minColor[0], maxColor[0] );
|
||||||
|
SwapValues( minColor[1], maxColor[1] );
|
||||||
|
SwapValues( minColor[2], maxColor[2] );
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -950,6 +958,7 @@ int idDxtEncoder::GetMinMaxNormalYHQ( const byte *colorBlock, byte *minColor, by
|
||||||
return bestError;
|
return bestError;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined( ID_WIN_X86_ASM )
|
||||||
ALIGN16( static float SIMD_SSE2_float_scale[4] ) = { 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f };
|
ALIGN16( static float SIMD_SSE2_float_scale[4] ) = { 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f, 2.0f / 255.0f };
|
||||||
ALIGN16( static float SIMD_SSE2_float_descale[4] ) = { 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f };
|
ALIGN16( static float SIMD_SSE2_float_descale[4] ) = { 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f, 255.0f / 2.0f };
|
||||||
ALIGN16( static float SIMD_SSE2_float_zero[4] ) = { 0.0f, 0.0f, 0.0f, 0.0f };
|
ALIGN16( static float SIMD_SSE2_float_zero[4] ) = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||||
|
@ -961,6 +970,7 @@ ALIGN16( static float SIMD_SP_rsqrt_c1[4] ) = { -0.5f, -0.5f, -0.5f, -0.5f };
|
||||||
ALIGN16( static dword SIMD_SSE2_dword_maskFirstThree[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
|
ALIGN16( static dword SIMD_SSE2_dword_maskFirstThree[4] ) = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
|
||||||
ALIGN16( static dword SIMD_SSE2_dword_maskWords[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x00000000 };
|
ALIGN16( static dword SIMD_SSE2_dword_maskWords[4] ) = { 0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x00000000 };
|
||||||
#define R_SHUFFLE_PS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
|
#define R_SHUFFLE_PS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
========================
|
========================
|
||||||
|
@ -968,6 +978,7 @@ NormalDistanceDXT1
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
int NormalDistanceDXT1( const int *vector, const int *normalized ) {
|
int NormalDistanceDXT1( const int *vector, const int *normalized ) {
|
||||||
|
#if defined( ID_WIN_X86_ASM )
|
||||||
int result;
|
int result;
|
||||||
__asm {
|
__asm {
|
||||||
mov esi, vector
|
mov esi, vector
|
||||||
|
@ -1007,6 +1018,24 @@ int NormalDistanceDXT1( const int *vector, const int *normalized ) {
|
||||||
movd result, xmm0
|
movd result, xmm0
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
|
#else
|
||||||
|
float floatNormal[3];
|
||||||
|
byte intNormal[4];
|
||||||
|
floatNormal[0] = vector[0] * ( 2.0f / 255.0f ) - 1.0f;
|
||||||
|
floatNormal[1] = vector[1] * ( 2.0f / 255.0f ) - 1.0f;
|
||||||
|
floatNormal[2] = vector[2] * ( 2.0f / 255.0f ) - 1.0f;
|
||||||
|
float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );
|
||||||
|
floatNormal[0] *= rcplen;
|
||||||
|
floatNormal[1] *= rcplen;
|
||||||
|
floatNormal[2] *= rcplen;
|
||||||
|
intNormal[0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
|
||||||
|
intNormal[1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
|
||||||
|
intNormal[2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) * ( 255.0f / 2.0f ) + 0.5f );
|
||||||
|
int result = ( ( intNormal[ 0 ] - normalized[ 0 ] ) * ( intNormal[ 0 ] - normalized[ 0 ] ) ) +
|
||||||
|
( ( intNormal[ 1 ] - normalized[ 1 ] ) * ( intNormal[ 1 ] - normalized[ 1 ] ) ) +
|
||||||
|
( ( intNormal[ 2 ] - normalized[ 2 ] ) * ( intNormal[ 2 ] - normalized[ 2 ] ) );
|
||||||
|
return result;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1015,6 +1044,7 @@ NormalDistanceDXT5
|
||||||
========================
|
========================
|
||||||
*/
|
*/
|
||||||
int NormalDistanceDXT5( const int *vector, const int *normalized ) {
|
int NormalDistanceDXT5( const int *vector, const int *normalized ) {
|
||||||
|
#if defined( ID_WIN_X86_ASM )
|
||||||
int result;
|
int result;
|
||||||
__asm {
|
__asm {
|
||||||
mov esi, vector
|
mov esi, vector
|
||||||
|
@ -1064,6 +1094,33 @@ int NormalDistanceDXT5( const int *vector, const int *normalized ) {
|
||||||
movd result, xmm0
|
movd result, xmm0
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
|
#else
|
||||||
|
#if 0 // object-space
|
||||||
|
const int c0 = 0;
|
||||||
|
const int c1 = 1;
|
||||||
|
const int c2 = 3;
|
||||||
|
#else
|
||||||
|
const int c0 = 1;
|
||||||
|
const int c1 = 2;
|
||||||
|
const int c2 = 3;
|
||||||
|
#endif
|
||||||
|
float floatNormal[3];
|
||||||
|
byte intNormal[4];
|
||||||
|
floatNormal[0] = vector[c0] / 255.0f * 2.0f - 1.0f;
|
||||||
|
floatNormal[1] = vector[c1] / 255.0f * 2.0f - 1.0f;
|
||||||
|
floatNormal[2] = vector[c2] / 255.0f * 2.0f - 1.0f;
|
||||||
|
float rcplen = idMath::InvSqrt( floatNormal[0] * floatNormal[0] + floatNormal[1] * floatNormal[1] + floatNormal[2] * floatNormal[2] );
|
||||||
|
floatNormal[0] *= rcplen;
|
||||||
|
floatNormal[1] *= rcplen;
|
||||||
|
floatNormal[2] *= rcplen;
|
||||||
|
intNormal[c0] = idMath::Ftob( ( floatNormal[0] + 1.0f ) / 2.0f * 255.0f + 0.5f );
|
||||||
|
intNormal[c1] = idMath::Ftob( ( floatNormal[1] + 1.0f ) / 2.0f * 255.0f + 0.5f );
|
||||||
|
intNormal[c2] = idMath::Ftob( ( floatNormal[2] + 1.0f ) / 2.0f * 255.0f + 0.5f );
|
||||||
|
int result = ( ( intNormal[ c0 ] - normalized[ c0 ] ) * ( intNormal[ c0 ] - normalized[ c0 ] ) ) +
|
||||||
|
( ( intNormal[ c1 ] - normalized[ c1 ] ) * ( intNormal[ c1 ] - normalized[ c1 ] ) ) +
|
||||||
|
( ( intNormal[ c2 ] - normalized[ c2 ] ) * ( intNormal[ c2 ] - normalized[ c2 ] ) );
|
||||||
|
return result;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -72,6 +72,7 @@ R_MatrixMultiply
|
||||||
==========================
|
==========================
|
||||||
*/
|
*/
|
||||||
void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) {
|
void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) {
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
__m128 a0 = _mm_loadu_ps( a + 0*4 );
|
__m128 a0 = _mm_loadu_ps( a + 0*4 );
|
||||||
__m128 a1 = _mm_loadu_ps( a + 1*4 );
|
__m128 a1 = _mm_loadu_ps( a + 1*4 );
|
||||||
|
@ -108,6 +109,41 @@ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] ) {
|
||||||
_mm_storeu_ps( out + 2*4, t2 );
|
_mm_storeu_ps( out + 2*4, t2 );
|
||||||
_mm_storeu_ps( out + 3*4, t3 );
|
_mm_storeu_ps( out + 3*4, t3 );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/*
|
||||||
|
for ( int i = 0; i < 4; i++ ) {
|
||||||
|
for ( int j = 0; j < 4; j++ ) {
|
||||||
|
out[ i * 4 + j ] =
|
||||||
|
a[ i * 4 + 0 ] * b[ 0 * 4 + j ] +
|
||||||
|
a[ i * 4 + 1 ] * b[ 1 * 4 + j ] +
|
||||||
|
a[ i * 4 + 2 ] * b[ 2 * 4 + j ] +
|
||||||
|
a[ i * 4 + 3 ] * b[ 3 * 4 + j ];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
out[0*4+0] = a[0*4+0]*b[0*4+0] + a[0*4+1]*b[1*4+0] + a[0*4+2]*b[2*4+0] + a[0*4+3]*b[3*4+0];
|
||||||
|
out[0*4+1] = a[0*4+0]*b[0*4+1] + a[0*4+1]*b[1*4+1] + a[0*4+2]*b[2*4+1] + a[0*4+3]*b[3*4+1];
|
||||||
|
out[0*4+2] = a[0*4+0]*b[0*4+2] + a[0*4+1]*b[1*4+2] + a[0*4+2]*b[2*4+2] + a[0*4+3]*b[3*4+2];
|
||||||
|
out[0*4+3] = a[0*4+0]*b[0*4+3] + a[0*4+1]*b[1*4+3] + a[0*4+2]*b[2*4+3] + a[0*4+3]*b[3*4+3];
|
||||||
|
|
||||||
|
out[1*4+0] = a[1*4+0]*b[0*4+0] + a[1*4+1]*b[1*4+0] + a[1*4+2]*b[2*4+0] + a[1*4+3]*b[3*4+0];
|
||||||
|
out[1*4+1] = a[1*4+0]*b[0*4+1] + a[1*4+1]*b[1*4+1] + a[1*4+2]*b[2*4+1] + a[1*4+3]*b[3*4+1];
|
||||||
|
out[1*4+2] = a[1*4+0]*b[0*4+2] + a[1*4+1]*b[1*4+2] + a[1*4+2]*b[2*4+2] + a[1*4+3]*b[3*4+2];
|
||||||
|
out[1*4+3] = a[1*4+0]*b[0*4+3] + a[1*4+1]*b[1*4+3] + a[1*4+2]*b[2*4+3] + a[1*4+3]*b[3*4+3];
|
||||||
|
|
||||||
|
out[2*4+0] = a[2*4+0]*b[0*4+0] + a[2*4+1]*b[1*4+0] + a[2*4+2]*b[2*4+0] + a[2*4+3]*b[3*4+0];
|
||||||
|
out[2*4+1] = a[2*4+0]*b[0*4+1] + a[2*4+1]*b[1*4+1] + a[2*4+2]*b[2*4+1] + a[2*4+3]*b[3*4+1];
|
||||||
|
out[2*4+2] = a[2*4+0]*b[0*4+2] + a[2*4+1]*b[1*4+2] + a[2*4+2]*b[2*4+2] + a[2*4+3]*b[3*4+2];
|
||||||
|
out[2*4+3] = a[2*4+0]*b[0*4+3] + a[2*4+1]*b[1*4+3] + a[2*4+2]*b[2*4+3] + a[2*4+3]*b[3*4+3];
|
||||||
|
|
||||||
|
out[3*4+0] = a[3*4+0]*b[0*4+0] + a[3*4+1]*b[1*4+0] + a[3*4+2]*b[2*4+0] + a[3*4+3]*b[3*4+0];
|
||||||
|
out[3*4+1] = a[3*4+0]*b[0*4+1] + a[3*4+1]*b[1*4+1] + a[3*4+2]*b[2*4+1] + a[3*4+3]*b[3*4+1];
|
||||||
|
out[3*4+2] = a[3*4+0]*b[0*4+2] + a[3*4+1]*b[1*4+2] + a[3*4+2]*b[2*4+2] + a[3*4+3]*b[3*4+2];
|
||||||
|
out[3*4+3] = a[3*4+0]*b[0*4+3] + a[3*4+1]*b[1*4+3] + a[3*4+2]*b[2*4+3] + a[3*4+3]*b[3*4+3];
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -274,6 +274,7 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con
|
||||||
assert_16_byte_aligned( cullBits );
|
assert_16_byte_aligned( cullBits );
|
||||||
assert_16_byte_aligned( verts );
|
assert_16_byte_aligned( verts );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
@ -376,6 +377,37 @@ static void R_DecalPointCullStatic( byte * cullBits, const idPlane * planes, con
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
for ( int i = 0; i < numVerts; ) {
|
||||||
|
|
||||||
|
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||||
|
|
||||||
|
for ( ; i <= nextNumVerts; i++ ) {
|
||||||
|
const idVec3 & v = vertsODS[i].xyz;
|
||||||
|
|
||||||
|
const float d0 = planes[0].Distance( v );
|
||||||
|
const float d1 = planes[1].Distance( v );
|
||||||
|
const float d2 = planes[2].Distance( v );
|
||||||
|
const float d3 = planes[3].Distance( v );
|
||||||
|
const float d4 = planes[4].Distance( v );
|
||||||
|
const float d5 = planes[5].Distance( v );
|
||||||
|
|
||||||
|
byte bits;
|
||||||
|
bits = IEEE_FLT_SIGNBITNOTSET( d0 ) << 0;
|
||||||
|
bits |= IEEE_FLT_SIGNBITNOTSET( d1 ) << 1;
|
||||||
|
bits |= IEEE_FLT_SIGNBITNOTSET( d2 ) << 2;
|
||||||
|
bits |= IEEE_FLT_SIGNBITNOTSET( d3 ) << 3;
|
||||||
|
bits |= IEEE_FLT_SIGNBITNOTSET( d4 ) << 4;
|
||||||
|
bits |= IEEE_FLT_SIGNBITNOTSET( d5 ) << 5;
|
||||||
|
|
||||||
|
cullBits[i] = bits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -573,6 +605,7 @@ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i
|
||||||
assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
||||||
assert_16_byte_aligned( fadeColor );
|
assert_16_byte_aligned( fadeColor );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
||||||
const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
|
const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
|
||||||
|
@ -612,6 +645,25 @@ static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * i
|
||||||
|
|
||||||
_mm_sfence();
|
_mm_sfence();
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
// copy vertices and apply depth/time based fading
|
||||||
|
for ( int i = 0; i < decal->numVerts; i++ ) {
|
||||||
|
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
|
||||||
|
verts[numVerts + i] = decal->verts[i];
|
||||||
|
for ( int j = 0; j < 4; j++ ) {
|
||||||
|
verts[numVerts + i].color[j] = idMath::Ftob( fadeColor[j] * decal->vertDepthFade[i] );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy indices
|
||||||
|
assert( ( decal->numIndexes & 1 ) == 0 );
|
||||||
|
for ( int i = 0; i < decal->numIndexes; i += 2 ) {
|
||||||
|
assert( decal->indexes[i + 0] < decal->numVerts && decal->indexes[i + 1] < decal->numVerts );
|
||||||
|
WriteIndexPair( &indexes[numIndexes + i], numVerts + decal->indexes[i + 0], numVerts + decal->indexes[i + 1] );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -102,6 +102,7 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
|
||||||
assert_16_byte_aligned( texCoordT );
|
assert_16_byte_aligned( texCoordT );
|
||||||
assert_16_byte_aligned( verts );
|
assert_16_byte_aligned( verts );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
@ -176,6 +177,39 @@ static void R_OverlayPointCullStatic( byte * cullBits, halfFloat_t * texCoordS,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
for ( int i = 0; i < numVerts; ) {
|
||||||
|
|
||||||
|
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||||
|
|
||||||
|
for ( ; i <= nextNumVerts; i++ ) {
|
||||||
|
const idVec3 & v = vertsODS[i].xyz;
|
||||||
|
|
||||||
|
const float d0 = planes[0].Distance( v );
|
||||||
|
const float d1 = planes[1].Distance( v );
|
||||||
|
const float d2 = 1.0f - d0;
|
||||||
|
const float d3 = 1.0f - d1;
|
||||||
|
|
||||||
|
halfFloat_t s = Scalar_FastF32toF16( d0 );
|
||||||
|
halfFloat_t t = Scalar_FastF32toF16( d1 );
|
||||||
|
|
||||||
|
texCoordS[i] = s;
|
||||||
|
texCoordT[i] = t;
|
||||||
|
|
||||||
|
byte bits;
|
||||||
|
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
|
||||||
|
|
||||||
|
cullBits[i] = bits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -189,6 +223,7 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
|
||||||
assert_16_byte_aligned( texCoordT );
|
assert_16_byte_aligned( texCoordT );
|
||||||
assert_16_byte_aligned( verts );
|
assert_16_byte_aligned( verts );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
@ -263,6 +298,39 @@ static void R_OverlayPointCullSkinned( byte * cullBits, halfFloat_t * texCoordS,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
for ( int i = 0; i < numVerts; ) {
|
||||||
|
|
||||||
|
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||||
|
|
||||||
|
for ( ; i <= nextNumVerts; i++ ) {
|
||||||
|
const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||||
|
|
||||||
|
const float d0 = planes[0].Distance( transformed );
|
||||||
|
const float d1 = planes[1].Distance( transformed );
|
||||||
|
const float d2 = 1.0f - d0;
|
||||||
|
const float d3 = 1.0f - d1;
|
||||||
|
|
||||||
|
halfFloat_t s = Scalar_FastF32toF16( d0 );
|
||||||
|
halfFloat_t t = Scalar_FastF32toF16( d1 );
|
||||||
|
|
||||||
|
texCoordS[i] = s;
|
||||||
|
texCoordT[i] = t;
|
||||||
|
|
||||||
|
byte bits;
|
||||||
|
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
|
||||||
|
|
||||||
|
cullBits[i] = bits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -446,6 +514,7 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *
|
||||||
assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
|
assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
|
||||||
assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
|
const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
|
||||||
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
||||||
|
@ -482,6 +551,25 @@ static void R_CopyOverlaySurface( idDrawVert * verts, int numVerts, triIndex_t *
|
||||||
|
|
||||||
_mm_sfence();
|
_mm_sfence();
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
// copy vertices
|
||||||
|
for ( int i = 0; i < overlay->numVerts; i++ ) {
|
||||||
|
const overlayVertex_t &overlayVert = overlay->verts[i];
|
||||||
|
|
||||||
|
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
|
||||||
|
verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
|
||||||
|
verts[numVerts + i].st[0] = overlayVert.st[0];
|
||||||
|
verts[numVerts + i].st[1] = overlayVert.st[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy indexes
|
||||||
|
for ( int i = 0; i < overlay->numIndexes; i += 2 ) {
|
||||||
|
assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
|
||||||
|
WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -32,10 +32,12 @@ If you have questions concerning this license or the applicable additional terms
|
||||||
#include "tr_local.h"
|
#include "tr_local.h"
|
||||||
#include "Model_local.h"
|
#include "Model_local.h"
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
static const __m128 vector_float_posInfinity = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
|
static const __m128 vector_float_posInfinity = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
|
||||||
static const __m128 vector_float_negInfinity = { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY };
|
static const __m128 vector_float_negInfinity = { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY };
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
static const char *MD5_SnapshotName = "_MD5_Snapshot_";
|
static const char *MD5_SnapshotName = "_MD5_Snapshot_";
|
||||||
|
|
||||||
|
@ -501,6 +503,7 @@ idMD5Mesh::CalculateBounds
|
||||||
====================
|
====================
|
||||||
*/
|
*/
|
||||||
void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {
|
void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
__m128 minX = vector_float_posInfinity;
|
__m128 minX = vector_float_posInfinity;
|
||||||
__m128 minY = vector_float_posInfinity;
|
__m128 minY = vector_float_posInfinity;
|
||||||
|
@ -534,6 +537,16 @@ void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds
|
||||||
_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
|
_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
|
||||||
_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
|
_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
bounds.Clear();
|
||||||
|
for ( int i = 0; i < numMeshJoints; i++ ) {
|
||||||
|
const idJointMat & joint = entJoints[meshJoints[i]];
|
||||||
|
bounds.AddPoint( joint.GetTranslation() );
|
||||||
|
}
|
||||||
|
bounds.ExpandSelf( maxJointVertDist );
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1085,6 +1098,7 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin
|
||||||
assert_16_byte_aligned( inFloats1 );
|
assert_16_byte_aligned( inFloats1 );
|
||||||
assert_16_byte_aligned( inFloats2 );
|
assert_16_byte_aligned( inFloats2 );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
|
const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
|
||||||
|
|
||||||
|
@ -1160,6 +1174,13 @@ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoin
|
||||||
_mm_store_ps( outFloats + 1 * 12 + 8, ri1 );
|
_mm_store_ps( outFloats + 1 * 12 + 8, ri1 );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
for ( int i = 0; i < numJoints; i++ ) {
|
||||||
|
idJointMat::Multiply( outJoints[i], inJoints1[i], inJoints2[i] );
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -87,6 +87,7 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r
|
||||||
assert_16_byte_aligned( cullBits );
|
assert_16_byte_aligned( cullBits );
|
||||||
assert_16_byte_aligned( verts );
|
assert_16_byte_aligned( verts );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
@ -208,6 +209,54 @@ static void R_ShadowVolumeCullBits( byte *cullBits, byte &totalOr, const float r
|
||||||
|
|
||||||
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
byte tOr = 0;
|
||||||
|
for ( int i = 0; i < numVerts; ) {
|
||||||
|
|
||||||
|
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||||
|
|
||||||
|
for ( ; i <= nextNumVerts; i++ ) {
|
||||||
|
const idVec3 & v = vertsODS[i].xyzw.ToVec3();
|
||||||
|
|
||||||
|
const float d0 = planes[0].Distance( v );
|
||||||
|
const float d1 = planes[1].Distance( v );
|
||||||
|
const float d2 = planes[2].Distance( v );
|
||||||
|
const float d3 = planes[3].Distance( v );
|
||||||
|
|
||||||
|
const float t0 = d0 + radius;
|
||||||
|
const float t1 = d1 + radius;
|
||||||
|
const float t2 = d2 + radius;
|
||||||
|
const float t3 = d3 + radius;
|
||||||
|
|
||||||
|
const float s0 = d0 - radius;
|
||||||
|
const float s1 = d1 - radius;
|
||||||
|
const float s2 = d2 - radius;
|
||||||
|
const float s3 = d3 - radius;
|
||||||
|
|
||||||
|
byte bits;
|
||||||
|
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||||
|
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||||
|
|
||||||
|
bits ^= 0x0F; // flip lower four bits
|
||||||
|
|
||||||
|
tOr |= bits;
|
||||||
|
cullBits[i] = bits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
totalOr = tOr;
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -31,6 +31,7 @@ If you have questions concerning this license or the applicable additional terms
|
||||||
#include "../../../idlib/sys/sys_intrinsics.h"
|
#include "../../../idlib/sys/sys_intrinsics.h"
|
||||||
#include "../../../idlib/geometry/DrawVert_intrinsics.h"
|
#include "../../../idlib/geometry/DrawVert_intrinsics.h"
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
static const __m128i vector_int_neg_one = _mm_set_epi32( -1, -1, -1, -1 );
|
static const __m128i vector_int_neg_one = _mm_set_epi32( -1, -1, -1, -1 );
|
||||||
|
|
||||||
|
@ -126,6 +127,69 @@ static __forceinline __m128i TriangleCulled_SSE2( const __m128 & vert0X, const _
|
||||||
return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) );
|
return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/*
|
||||||
|
=====================
|
||||||
|
TriangleFacing
|
||||||
|
|
||||||
|
Scalar (non-SIMD) facing test: returns 255 if 'lightOrigin' lies strictly on the positive side of the plane through v1, v2, v3 (the triangle is facing the light origin), otherwise returns 0.
|
||||||
|
=====================
|
||||||
|
*/
|
||||||
|
static byte TriangleFacing_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idVec3 & lightOrigin ) {
|
||||||
|
const float sx = v2.x - v1.x;	// s = v2 - v1 (first triangle edge)
|
||||||
|
const float sy = v2.y - v1.y;
|
||||||
|
const float sz = v2.z - v1.z;
|
||||||
|
|
||||||
|
const float tx = v3.x - v1.x;	// t = v3 - v1 (second triangle edge)
|
||||||
|
const float ty = v3.y - v1.y;
|
||||||
|
const float tz = v3.z - v1.z;
|
||||||
|
|
||||||
|
const float normalX = ty * sz - tz * sy;	// unnormalized plane normal = t x s
|
||||||
|
const float normalY = tz * sx - tx * sz;
|
||||||
|
const float normalZ = tx * sy - ty * sx;
|
||||||
|
const float normalW = normalX * v1.x + normalY * v1.y + normalZ * v1.z;	// plane offset = normal . v1
|
||||||
|
|
||||||
|
const float d = lightOrigin.x * normalX + lightOrigin.y * normalY + lightOrigin.z * normalZ - normalW;	// signed, unnormalized distance of lightOrigin from the triangle plane
|
||||||
|
return ( d > 0.0f ) ? 255 : 0;	// strict compare: a light origin exactly on the plane (d == 0) is not facing
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
=====================
|
||||||
|
TriangleCulled
|
||||||
|
|
||||||
|
Scalar (non-SIMD) cull test: returns 255 if all three vertices fall outside the same clip plane of 'lightProject' (the triangle is culled), otherwise returns 0.
|
||||||
|
The clip space of the 'lightProject' is assumed to be in the range [0, 1]; the test is done in homogeneous form as 0 < x, y, z < w.
|
||||||
|
=====================
|
||||||
|
*/
|
||||||
|
static byte TriangleCulled_Generic( const idVec3 & v1, const idVec3 & v2, const idVec3 & v3, const idRenderMatrix & lightProject ) {
|
||||||
|
// transform the triangle
|
||||||
|
idVec4 c[3];	// transformed position of v1, v2, v3
|
||||||
|
for ( int i = 0; i < 4; i++ ) {	// i selects the output component (x, y, z, w)
|
||||||
|
c[0][i] = v1[0] * lightProject[i][0] + v1[1] * lightProject[i][1] + v1[2] * lightProject[i][2] + lightProject[i][3];	// input w is implicitly 1, so the fourth matrix element is added as-is
|
||||||
|
c[1][i] = v2[0] * lightProject[i][0] + v2[1] * lightProject[i][1] + v2[2] * lightProject[i][2] + lightProject[i][3];
|
||||||
|
c[2][i] = v3[0] * lightProject[i][0] + v3[1] * lightProject[i][1] + v3[2] * lightProject[i][2] + lightProject[i][3];
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculate the culled bits
|
||||||
|
int bits = 0;	// bit n becomes set once any vertex is on the inside of clip plane n
|
||||||
|
for ( int i = 0; i < 3; i++ ) {
|
||||||
|
const float minW = 0.0f;	// lower clip bound
|
||||||
|
const float maxW = c[i][3];	// upper clip bound is this vertex's own w
|
||||||
|
|
||||||
|
if ( c[i][0] > minW ) { bits |= ( 1 << 0 ); }
|
||||||
|
if ( c[i][0] < maxW ) { bits |= ( 1 << 1 ); }
|
||||||
|
if ( c[i][1] > minW ) { bits |= ( 1 << 2 ); }
|
||||||
|
if ( c[i][1] < maxW ) { bits |= ( 1 << 3 ); }
|
||||||
|
if ( c[i][2] > minW ) { bits |= ( 1 << 4 ); }
|
||||||
|
if ( c[i][2] < maxW ) { bits |= ( 1 << 5 ); }
|
||||||
|
}
|
||||||
|
|
||||||
|
// if any bits weren't set, the triangle is completely off one side of the frustum
|
||||||
|
return ( bits != 63 ) ? 255 : 0;	// 63 == all six plane bits set
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
=====================
|
=====================
|
||||||
|
@ -155,6 +219,7 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte *
|
||||||
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
||||||
const float lineLength = lineLengthSqr * lineLengthRcp;
|
const float lineLength = lineLengthSqr * lineLengthRcp;
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4 * 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
|
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4 * 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
|
||||||
|
|
||||||
|
@ -261,6 +326,55 @@ static int CalculateTriangleFacingCulledStatic( byte * __restrict facing, byte *
|
||||||
|
|
||||||
return _mm_cvtsi128_si32( numFrontFacing );
|
return _mm_cvtsi128_si32( numFrontFacing );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
|
||||||
|
|
||||||
|
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
|
||||||
|
|
||||||
|
int numFrontFacing = 0;
|
||||||
|
|
||||||
|
for ( int i = 0, j = 0; i < numIndexes; ) {
|
||||||
|
|
||||||
|
const int batchStart = i;
|
||||||
|
const int batchEnd = indexedVertsODS.FetchNextBatch();
|
||||||
|
const int indexStart = j;
|
||||||
|
|
||||||
|
for ( ; i <= batchEnd - 3; i += 3, j++ ) {
|
||||||
|
const idVec3 & v1 = indexedVertsODS[i + 0].xyz;
|
||||||
|
const idVec3 & v2 = indexedVertsODS[i + 1].xyz;
|
||||||
|
const idVec3 & v3 = indexedVertsODS[i + 2].xyz;
|
||||||
|
|
||||||
|
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
|
||||||
|
|
||||||
|
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
|
||||||
|
|
||||||
|
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
|
||||||
|
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
|
||||||
|
|
||||||
|
culled[j] = triangleCulled;
|
||||||
|
facing[j] = triangleFacing;
|
||||||
|
|
||||||
|
// count the number of facing triangles
|
||||||
|
numFrontFacing += ( triangleFacing & 1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( insideShadowVolume != NULL ) {
|
||||||
|
for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
|
||||||
|
if ( !facing[n] ) {
|
||||||
|
if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) ) {
|
||||||
|
*insideShadowVolume = true;
|
||||||
|
insideShadowVolume = NULL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return numFrontFacing;
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -291,6 +405,7 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte
|
||||||
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
||||||
const float lineLength = lineLengthSqr * lineLengthRcp;
|
const float lineLength = lineLengthSqr * lineLengthRcp;
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
@ -428,6 +543,74 @@ static int CalculateTriangleFacingCulledSkinned( byte * __restrict facing, byte
|
||||||
|
|
||||||
return _mm_cvtsi128_si32( numFrontFacing );
|
return _mm_cvtsi128_si32( numFrontFacing );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
for ( int i = 0; i < numVerts; ) {
|
||||||
|
|
||||||
|
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||||
|
|
||||||
|
for ( ; i <= nextNumVerts; i++ ) {
|
||||||
|
tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||||
|
tempVerts[i].w = 1.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
|
||||||
|
|
||||||
|
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
|
||||||
|
|
||||||
|
int numFrontFacing = 0;
|
||||||
|
|
||||||
|
for ( int i = 0, j = 0; i < numIndexes; ) {
|
||||||
|
|
||||||
|
const int batchStart = i;
|
||||||
|
const int batchEnd = indexesODS.FetchNextBatch();
|
||||||
|
const int indexStart = j;
|
||||||
|
|
||||||
|
for ( ; i <= batchEnd - 3; i += 3, j++ ) {
|
||||||
|
const int i0 = indexesODS[i + 0];
|
||||||
|
const int i1 = indexesODS[i + 1];
|
||||||
|
const int i2 = indexesODS[i + 2];
|
||||||
|
|
||||||
|
const idVec3 & v1 = tempVerts[i0].ToVec3();
|
||||||
|
const idVec3 & v2 = tempVerts[i1].ToVec3();
|
||||||
|
const idVec3 & v3 = tempVerts[i2].ToVec3();
|
||||||
|
|
||||||
|
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
|
||||||
|
|
||||||
|
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
|
||||||
|
|
||||||
|
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
|
||||||
|
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
|
||||||
|
|
||||||
|
culled[j] = triangleCulled;
|
||||||
|
facing[j] = triangleFacing;
|
||||||
|
|
||||||
|
// count the number of facing triangles
|
||||||
|
numFrontFacing += ( triangleFacing & 1 );
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( insideShadowVolume != NULL ) {
|
||||||
|
for ( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ ) {
|
||||||
|
if ( !facing[n] ) {
|
||||||
|
const int i0 = indexesODS[k + 0];
|
||||||
|
const int i1 = indexesODS[k + 1];
|
||||||
|
const int i2 = indexesODS[k + 2];
|
||||||
|
if ( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) ) {
|
||||||
|
*insideShadowVolume = true;
|
||||||
|
insideShadowVolume = NULL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return numFrontFacing;
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -440,6 +623,7 @@ static void StreamOut( void * dst, const void * src, int numBytes ) {
|
||||||
assert_16_byte_aligned( dst );
|
assert_16_byte_aligned( dst );
|
||||||
assert_16_byte_aligned( src );
|
assert_16_byte_aligned( src );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for ( ; i + 128 <= numBytes; i += 128 ) {
|
for ( ; i + 128 <= numBytes; i += 128 ) {
|
||||||
__m128i d0 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 0*16 ) );
|
__m128i d0 = _mm_load_si128( (const __m128i *)( (byte *)src + i + 0*16 ) );
|
||||||
|
@ -463,6 +647,9 @@ static void StreamOut( void * dst, const void * src, int numBytes ) {
|
||||||
__m128i d = _mm_load_si128( (__m128i *)( (byte *)src + i ) );
|
__m128i d = _mm_load_si128( (__m128i *)( (byte *)src + i ) );
|
||||||
_mm_stream_si128( (__m128i *)( (byte *)dst + i ), d );
|
_mm_stream_si128( (__m128i *)( (byte *)dst + i ), d );
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
memcpy( dst, src, numBytes );
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -671,7 +858,9 @@ static void R_CreateShadowVolumeTriangles( triIndex_t *__restrict shadowIndices,
|
||||||
|
|
||||||
numShadowIndexesTotal = numShadowIndices;
|
numShadowIndexesTotal = numShadowIndices;
|
||||||
|
|
||||||
|
#if defined( ID_WIN_X86_SSE2_INTRIN )
|
||||||
_mm_sfence();
|
_mm_sfence();
|
||||||
|
#endif
|
||||||
|
|
||||||
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
||||||
|
|
||||||
|
@ -844,7 +1033,9 @@ void R_CreateLightTriangles( triIndex_t * __restrict lightIndices, triIndex_t *
|
||||||
|
|
||||||
numLightIndicesTotal = numLightIndices;
|
numLightIndicesTotal = numLightIndices;
|
||||||
|
|
||||||
|
#if defined( ID_WIN_X86_SSE2_INTRIN )
|
||||||
_mm_sfence();
|
_mm_sfence();
|
||||||
|
#endif
|
||||||
|
|
||||||
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
||||||
|
|
||||||
|
|
|
@ -43,6 +43,7 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r
|
||||||
assert_16_byte_aligned( cullBits );
|
assert_16_byte_aligned( cullBits );
|
||||||
assert_16_byte_aligned( verts );
|
assert_16_byte_aligned( verts );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
@ -164,6 +165,54 @@ static void R_TracePointCullStatic( byte *cullBits, byte &totalOr, const float r
|
||||||
|
|
||||||
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
byte tOr = 0;
|
||||||
|
for ( int i = 0; i < numVerts; ) {
|
||||||
|
|
||||||
|
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||||
|
|
||||||
|
for ( ; i <= nextNumVerts; i++ ) {
|
||||||
|
const idVec3 & v = vertsODS[i].xyz;
|
||||||
|
|
||||||
|
const float d0 = planes[0].Distance( v );
|
||||||
|
const float d1 = planes[1].Distance( v );
|
||||||
|
const float d2 = planes[2].Distance( v );
|
||||||
|
const float d3 = planes[3].Distance( v );
|
||||||
|
|
||||||
|
const float t0 = d0 + radius;
|
||||||
|
const float t1 = d1 + radius;
|
||||||
|
const float t2 = d2 + radius;
|
||||||
|
const float t3 = d3 + radius;
|
||||||
|
|
||||||
|
const float s0 = d0 - radius;
|
||||||
|
const float s1 = d1 - radius;
|
||||||
|
const float s2 = d2 - radius;
|
||||||
|
const float s3 = d3 - radius;
|
||||||
|
|
||||||
|
byte bits;
|
||||||
|
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||||
|
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||||
|
|
||||||
|
bits ^= 0x0F; // flip lower four bits
|
||||||
|
|
||||||
|
tOr |= bits;
|
||||||
|
cullBits[i] = bits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
totalOr = tOr;
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -175,6 +224,7 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float
|
||||||
assert_16_byte_aligned( cullBits );
|
assert_16_byte_aligned( cullBits );
|
||||||
assert_16_byte_aligned( verts );
|
assert_16_byte_aligned( verts );
|
||||||
|
|
||||||
|
#ifdef ID_WIN_X86_SSE2_INTRIN
|
||||||
|
|
||||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
@ -296,6 +346,54 @@ static void R_TracePointCullSkinned( byte *cullBits, byte &totalOr, const float
|
||||||
|
|
||||||
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
totalOr = (byte) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||||
|
|
||||||
|
byte tOr = 0;
|
||||||
|
for ( int i = 0; i < numVerts; ) {
|
||||||
|
|
||||||
|
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||||
|
|
||||||
|
for ( ; i <= nextNumVerts; i++ ) {
|
||||||
|
const idVec3 v = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||||
|
|
||||||
|
const float d0 = planes[0].Distance( v );
|
||||||
|
const float d1 = planes[1].Distance( v );
|
||||||
|
const float d2 = planes[2].Distance( v );
|
||||||
|
const float d3 = planes[3].Distance( v );
|
||||||
|
|
||||||
|
const float t0 = d0 + radius;
|
||||||
|
const float t1 = d1 + radius;
|
||||||
|
const float t2 = d2 + radius;
|
||||||
|
const float t3 = d3 + radius;
|
||||||
|
|
||||||
|
const float s0 = d0 - radius;
|
||||||
|
const float s1 = d1 - radius;
|
||||||
|
const float s2 = d2 - radius;
|
||||||
|
const float s3 = d3 - radius;
|
||||||
|
|
||||||
|
byte bits;
|
||||||
|
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||||
|
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||||
|
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||||
|
|
||||||
|
bits ^= 0x0F; // flip lower four bits
|
||||||
|
|
||||||
|
tOr |= bits;
|
||||||
|
cullBits[i] = bits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
totalOr = tOr;
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in a new issue