mirror of
https://github.com/id-Software/DOOM-3-BFG.git
synced 2025-01-18 23:41:42 +00:00
Merged generic C++ fallbacks for SSE optimized code to allow support for non-x86 based platforms like ARM
This commit is contained in:
parent
e510691c36
commit
db715535cc
28 changed files with 2747 additions and 337 deletions
|
@ -1,4 +1,4 @@
|
|||
astyle.exe -v --options=astyle-options.ini --exclude="libs" --recursive *.h
|
||||
astyle.exe -v --options=astyle-options.ini --exclude="libs" --exclude="idlib/math/Simd.cpp" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
|
||||
astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --recursive *.h
|
||||
astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
|
||||
|
||||
pause
|
|
@ -1,5 +1,5 @@
|
|||
#!/bin/sh
|
||||
./astyle.exe -v --options=astyle-options.ini --exclude="libs" --recursive *.h
|
||||
./astyle.exe -v --options=astyle-options.ini --exclude="libs" --exclude="idlib/math/Simd.cpp" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
|
||||
./astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --recursive *.h
|
||||
./astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
|
||||
|
||||
#pause
|
|
@ -301,7 +301,13 @@ idCVar idFileSystemLocal::fs_debugResources( "fs_debugResources", "0", CVAR_SYST
|
|||
idCVar idFileSystemLocal::fs_enableBGL( "fs_enableBGL", "0", CVAR_SYSTEM | CVAR_BOOL, "" );
|
||||
idCVar idFileSystemLocal::fs_debugBGL( "fs_debugBGL", "0", CVAR_SYSTEM | CVAR_BOOL, "" );
|
||||
idCVar idFileSystemLocal::fs_copyfiles( "fs_copyfiles", "0", CVAR_SYSTEM | CVAR_INIT | CVAR_BOOL, "Copy every file touched to fs_savepath" );
|
||||
// RB
|
||||
#if defined(RETAIL)
|
||||
idCVar idFileSystemLocal::fs_buildResources( "fs_buildresources", "0", CVAR_SYSTEM | CVAR_BOOL | CVAR_INIT, "Copy every file touched to a resource file" );
|
||||
#else
|
||||
idCVar idFileSystemLocal::fs_buildResources( "fs_buildresources", "1", CVAR_SYSTEM | CVAR_BOOL | CVAR_INIT, "Copy every file touched to a resource file" );
|
||||
#endif
|
||||
// RB end
|
||||
idCVar idFileSystemLocal::fs_game( "fs_game", "", CVAR_SYSTEM | CVAR_INIT | CVAR_SERVERINFO, "mod path" );
|
||||
idCVar idFileSystemLocal::fs_game_base( "fs_game_base", "", CVAR_SYSTEM | CVAR_INIT | CVAR_SERVERINFO, "alternate mod path, searched after the main fs_game path, before the basedir" );
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2013 Robert Beckebans
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
||||
|
@ -113,11 +114,20 @@ ID_INLINE halfFloat_t F32toF16( float a )
|
|||
|
||||
class idDrawVert
|
||||
{
|
||||
friend class idSwap;
|
||||
friend class idShadowVertSkinned;
|
||||
friend class idRenderModelStatic;
|
||||
|
||||
friend void TransformVertsAndTangents( idDrawVert* targetVerts, const int numVerts, const idDrawVert* baseVerts, const idJointMat* joints );
|
||||
|
||||
public:
|
||||
idVec3 xyz; // 12 bytes
|
||||
private:
|
||||
// RB: don't let the old tools code mess with these values
|
||||
halfFloat_t st[2]; // 4 bytes
|
||||
byte normal[4]; // 4 bytes
|
||||
byte tangent[4]; // 4 bytes -- [3] is texture polarity sign
|
||||
public:
|
||||
byte color[4]; // 4 bytes
|
||||
byte color2[4]; // 4 bytes -- weights for skinning
|
||||
|
||||
|
@ -187,9 +197,14 @@ public:
|
|||
#define DRAWVERT_COLOR_OFFSET (6*4)
|
||||
#define DRAWVERT_COLOR2_OFFSET (7*4)
|
||||
|
||||
// RB begin
|
||||
assert_sizeof( idDrawVert, DRAWVERT_SIZE );
|
||||
#if 0
|
||||
assert_offsetof( idDrawVert, xyz, DRAWVERT_XYZ_OFFSET );
|
||||
assert_offsetof( idDrawVert, normal, DRAWVERT_NORMAL_OFFSET );
|
||||
assert_offsetof( idDrawVert, tangent, DRAWVERT_TANGENT_OFFSET );
|
||||
#endif
|
||||
// RB end
|
||||
|
||||
/*
|
||||
========================
|
||||
|
@ -202,6 +217,7 @@ ID_INLINE void VertexFloatToByte( const float& x, const float& y, const float& z
|
|||
{
|
||||
assert_4_byte_aligned( bval ); // for __stvebx
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
|
@ -218,6 +234,13 @@ ID_INLINE void VertexFloatToByte( const float& x, const float& y, const float& z
|
|||
bval[1] = ( byte )_mm_extract_epi16( xyz16, 1 );
|
||||
bval[2] = ( byte )_mm_extract_epi16( xyz16, 2 );
|
||||
|
||||
#else
|
||||
|
||||
bval[0] = VERTEX_FLOAT_TO_BYTE( x );
|
||||
bval[1] = VERTEX_FLOAT_TO_BYTE( y );
|
||||
bval[2] = VERTEX_FLOAT_TO_BYTE( z );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -655,6 +678,7 @@ ID_INLINE void WriteDrawVerts16( idDrawVert* destVerts, const idDrawVert* localV
|
|||
assert_16_byte_aligned( destVerts );
|
||||
assert_16_byte_aligned( localVerts );
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
for( int i = 0; i < numVerts; i++ )
|
||||
{
|
||||
|
@ -664,6 +688,11 @@ ID_INLINE void WriteDrawVerts16( idDrawVert* destVerts, const idDrawVert* localV
|
|||
_mm_stream_si128( ( __m128i* )( ( byte* )( destVerts + i ) + 16 ), v1 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
memcpy( destVerts, localVerts, numVerts * sizeof( idDrawVert ) );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -30,6 +30,7 @@ If you have questions concerning this license or the applicable additional terms
|
|||
#define __DRAWVERT_INTRINSICS_H__
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
|
||||
static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
|
||||
static const __m128i vector_int_f32_mantissa_mask = _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 );
|
||||
|
@ -50,13 +51,14 @@ static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0
|
|||
static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
|
||||
static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
====================
|
||||
FastF32toF16
|
||||
====================
|
||||
*/
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits )
|
||||
{
|
||||
__m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift );
|
||||
|
@ -77,6 +79,7 @@ ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits )
|
|||
|
||||
return _mm_packs_epi32( flt16, flt16 );
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 )
|
||||
|
@ -117,7 +120,7 @@ ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 )
|
|||
LoadSkinnedDrawVertPosition
|
||||
====================
|
||||
*/
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, const idJointMat* joints )
|
||||
{
|
||||
const idJointMat& j0 = joints[base.color[0]];
|
||||
|
@ -178,7 +181,7 @@ ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, con
|
|||
|
||||
return r0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert& vert, const idJointMat* joints )
|
||||
{
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -44,8 +44,11 @@ const float LCP_DELTA_FORCE_EPSILON = 1e-9f;
|
|||
|
||||
#define IGNORE_UNSATISFIABLE_VARIABLES
|
||||
|
||||
// LCP_SIMD selects the SSE (__m128) constants and loops in this file; it is
// only defined when USE_INTRINSICS is, otherwise the scalar code is compiled.
#if defined(USE_INTRINSICS)
|
||||
#define LCP_SIMD
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(LCP_SIMD)
|
||||
ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
ALIGN16( const __m128 SIMD_SP_two ) = { 2.0f, 2.0f, 2.0f, 2.0f };
|
||||
|
@ -70,7 +73,7 @@ ALIGN16( const unsigned int SIMD_DW_one[4] ) = { 1, 1, 1, 1 };
|
|||
ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 };
|
||||
ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 };
|
||||
ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 };
|
||||
|
||||
#endif // #if defined(LCP_SIMD)
|
||||
/*
|
||||
========================
|
||||
Multiply_SIMD
|
||||
|
@ -91,6 +94,7 @@ static void Multiply_SIMD( float* dst, const float* src0, const float* src1, con
|
|||
dst[i] = src0[i] * src1[i];
|
||||
}
|
||||
|
||||
#if defined(LCP_SIMD)
|
||||
|
||||
for( ; i + 4 <= count; i += 4 )
|
||||
{
|
||||
|
@ -104,6 +108,21 @@ static void Multiply_SIMD( float* dst, const float* src0, const float* src1, con
|
|||
_mm_store_ps( dst + i, s0 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for( ; i + 4 <= count; i += 4 )
|
||||
{
|
||||
assert_16_byte_aligned( &dst[i] );
|
||||
assert_16_byte_aligned( &src0[i] );
|
||||
assert_16_byte_aligned( &src1[i] );
|
||||
|
||||
dst[i + 0] = src0[i + 0] * src1[i + 0];
|
||||
dst[i + 1] = src0[i + 1] * src1[i + 1];
|
||||
dst[i + 2] = src0[i + 2] * src1[i + 2];
|
||||
dst[i + 3] = src0[i + 3] * src1[i + 3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for( ; i < count; i++ )
|
||||
{
|
||||
|
@ -124,6 +143,7 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src
|
|||
{
|
||||
int i = 0;
|
||||
|
||||
|
||||
// RB: changed unsigned int to uintptr_t
|
||||
for( ; ( ( uintptr_t )dst & 0xF ) != 0 && i < count; i++ )
|
||||
// RB end
|
||||
|
@ -131,6 +151,7 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src
|
|||
dst[i] += constant * src[i];
|
||||
}
|
||||
|
||||
#if defined(LCP_SIMD)
|
||||
|
||||
__m128 c = _mm_load1_ps( & constant );
|
||||
for( ; i + 4 <= count; i += 4 )
|
||||
|
@ -144,6 +165,20 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src
|
|||
_mm_store_ps( dst + i, s );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for( ; i + 4 <= count; i += 4 )
|
||||
{
|
||||
assert_16_byte_aligned( &src[i] );
|
||||
assert_16_byte_aligned( &dst[i] );
|
||||
|
||||
dst[i + 0] += constant * src[i + 0];
|
||||
dst[i + 1] += constant * src[i + 1];
|
||||
dst[i + 2] += constant * src[i + 2];
|
||||
dst[i + 3] += constant * src[i + 3];
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
for( ; i < count; i++ )
|
||||
{
|
||||
|
@ -163,7 +198,7 @@ static float DotProduct_SIMD( const float* src0, const float* src1, const int co
|
|||
assert_16_byte_aligned( src0 );
|
||||
assert_16_byte_aligned( src1 );
|
||||
|
||||
#ifndef _lint
|
||||
#if defined(LCP_SIMD)
|
||||
|
||||
__m128 sum = ( __m128& ) SIMD_SP_zero;
|
||||
int i = 0;
|
||||
|
@ -328,7 +363,7 @@ static void LowerTriangularSolve_SIMD( const idMatX& L, float* x, const float* b
|
|||
|
||||
int i = skip;
|
||||
|
||||
#ifndef _lint
|
||||
#if defined(LCP_SIMD)
|
||||
|
||||
// work up to a multiple of 4 rows
|
||||
for( ; ( i & 3 ) != 0 && i < n; i++ )
|
||||
|
@ -601,7 +636,7 @@ static void LowerTriangularSolveTranspose_SIMD( const idMatX& L, float* x, const
|
|||
const float* lptr = L.ToFloatPtr() + m * nc + m - 4;
|
||||
float* xptr = x + m;
|
||||
|
||||
#ifndef _lint
|
||||
#if defined(LCP_SIMD)
|
||||
|
||||
// process 4 rows at a time
|
||||
for( int i = m; i >= 4; i -= 4 )
|
||||
|
@ -982,7 +1017,7 @@ static bool LDLT_Factor_SIMD( idMatX& mat, idVecX& invDiag, const int n )
|
|||
mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d;
|
||||
}
|
||||
|
||||
#ifndef _lint
|
||||
#if defined(LCP_SIMD)
|
||||
|
||||
__m128 vzero = _mm_setzero_ps();
|
||||
for( int i = 4; i < n; i += 4 )
|
||||
|
@ -1360,7 +1395,7 @@ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_
|
|||
int d, float dir, float& maxStep, int& limit, int& limitSide )
|
||||
{
|
||||
|
||||
|
||||
#if defined(LCP_SIMD)
|
||||
__m128 vMaxStep;
|
||||
__m128i vLimit;
|
||||
__m128i vLimitSide;
|
||||
|
@ -1484,6 +1519,117 @@ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_
|
|||
_mm_store_ss( & maxStep, vMaxStep );
|
||||
limit = _mm_cvtsi128_si32( vLimit );
|
||||
limitSide = _mm_cvtsi128_si32( vLimitSide );
|
||||
#else
|
||||
int i;
|
||||
float s;
|
||||
|
||||
// default to a full step for the current variable
|
||||
if( idMath::Fabs( delta_a[d] ) > LCP_DELTA_ACCEL_EPSILON )
|
||||
{
|
||||
maxStep = -a[d] / delta_a[d];
|
||||
}
|
||||
else
|
||||
{
|
||||
maxStep = 0.0f;
|
||||
}
|
||||
limit = d;
|
||||
limitSide = 0;
|
||||
|
||||
// test the current variable
|
||||
if( dir < 0.0f )
|
||||
{
|
||||
if( lo[d] != -idMath::INFINITY )
|
||||
{
|
||||
s = ( lo[d] - f[d] ) / dir;
|
||||
if( s < maxStep )
|
||||
{
|
||||
maxStep = s;
|
||||
limitSide = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if( hi[d] != idMath::INFINITY )
|
||||
{
|
||||
s = ( hi[d] - f[d] ) / dir;
|
||||
if( s < maxStep )
|
||||
{
|
||||
maxStep = s;
|
||||
limitSide = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// test the clamped bounded variables
|
||||
for( i = numUnbounded; i < numClamped; i++ )
|
||||
{
|
||||
if( delta_f[i] < -LCP_DELTA_FORCE_EPSILON )
|
||||
{
|
||||
// if there is a low boundary
|
||||
if( lo[i] != -idMath::INFINITY )
|
||||
{
|
||||
s = ( lo[i] - f[i] ) / delta_f[i];
|
||||
if( s < maxStep )
|
||||
{
|
||||
maxStep = s;
|
||||
limit = i;
|
||||
limitSide = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( delta_f[i] > LCP_DELTA_FORCE_EPSILON )
|
||||
{
|
||||
// if there is a high boundary
|
||||
if( hi[i] != idMath::INFINITY )
|
||||
{
|
||||
s = ( hi[i] - f[i] ) / delta_f[i];
|
||||
if( s < maxStep )
|
||||
{
|
||||
maxStep = s;
|
||||
limit = i;
|
||||
limitSide = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// test the not clamped bounded variables
|
||||
for( i = numClamped; i < d; i++ )
|
||||
{
|
||||
if( side[i] == -1 )
|
||||
{
|
||||
if( delta_a[i] >= -LCP_DELTA_ACCEL_EPSILON )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else if( side[i] == 1 )
|
||||
{
|
||||
if( delta_a[i] <= LCP_DELTA_ACCEL_EPSILON )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// ignore variables for which the force is not allowed to take any substantial value
|
||||
if( lo[i] >= -LCP_BOUND_EPSILON && hi[i] <= LCP_BOUND_EPSILON )
|
||||
{
|
||||
continue;
|
||||
}
|
||||
s = -a[i] / delta_a[i];
|
||||
if( s < maxStep )
|
||||
{
|
||||
maxStep = s;
|
||||
limit = i;
|
||||
limitSide = 0;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -201,6 +201,7 @@ void idMatX::CopyLowerToUpperTriangle()
|
|||
assert( ( GetNumColumns() & 3 ) == 0 );
|
||||
assert( GetNumColumns() >= GetNumRows() );
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
const int n = GetNumColumns();
|
||||
const int m = GetNumRows();
|
||||
|
@ -341,6 +342,22 @@ void idMatX::CopyLowerToUpperTriangle()
|
|||
_mm_store_ps( basePtr + n0, r0 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
const int n = GetNumColumns();
|
||||
const int m = GetNumRows();
|
||||
for( int i = 0; i < m; i++ )
|
||||
{
|
||||
const float* __restrict ptr = ToFloatPtr() + ( i + 1 ) * n + i;
|
||||
float* __restrict dstPtr = ToFloatPtr() + i * n;
|
||||
for( int j = i + 1; j < m; j++ )
|
||||
{
|
||||
dstPtr[j] = ptr[0];
|
||||
ptr += n;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef _DEBUG
|
||||
for( int i = 0; i < numRows; i++ )
|
||||
|
|
|
@ -46,7 +46,10 @@ NOTE: due to the temporary memory pool idMatX cannot be used by multiple threads
|
|||
#define MATX_CLEAREND() int s = numRows * numColumns; while( s < ( ( s + 3 ) & ~3 ) ) { mat[s++] = 0.0f; }
|
||||
#define MATX_ALLOCA( n ) ( (float *) _alloca16( MATX_QUAD( n ) ) )
|
||||
#define MATX_ALLOCA_CACHE_LINES( n ) ( (float *) _alloca128( ( ( n ) * sizeof( float ) + CACHE_LINE_SIZE - 1 ) & ~ ( CACHE_LINE_SIZE - 1 ) ) )
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
#define MATX_SIMD
|
||||
#endif
|
||||
|
||||
class idMatX
|
||||
{
|
||||
|
|
|
@ -51,6 +51,7 @@ const float idMath::INFINITY = 1e30f;
|
|||
const float idMath::FLT_EPSILON = 1.192092896e-07f;
|
||||
const float idMath::FLT_SMALLEST_NON_DENORMAL = * reinterpret_cast< const float* >( & SMALLEST_NON_DENORMAL ); // 1.1754944e-038f
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
const __m128 idMath::SIMD_SP_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
const __m128 idMath::SIMD_SP_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
|
||||
const __m128 idMath::SIMD_SP_min_char = { -128.0f, -128.0f, -128.0f, -128.0f };
|
||||
|
@ -61,6 +62,7 @@ const __m128 idMath::SIMD_SP_smallestNonDenorm = { FLT_SMALLEST_NON_DENORMAL, FL
|
|||
const __m128 idMath::SIMD_SP_tiny = { 1e-4f, 1e-4f, 1e-4f, 1e-4f };
|
||||
const __m128 idMath::SIMD_SP_rsqrt_c0 = { 3.0f, 3.0f, 3.0f, 3.0f };
|
||||
const __m128 idMath::SIMD_SP_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f };
|
||||
#endif
|
||||
|
||||
bool idMath::initialized = false;
|
||||
dword idMath::iSqrt[SQRT_TABLE_SIZE]; // inverse square root lookup table
|
||||
|
|
|
@ -469,6 +469,7 @@ public:
|
|||
static const float FLT_EPSILON; // smallest positive number such that 1.0+FLT_EPSILON != 1.0
|
||||
static const float FLT_SMALLEST_NON_DENORMAL; // smallest non-denormal 32-bit floating point value
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
static const __m128 SIMD_SP_zero;
|
||||
static const __m128 SIMD_SP_255;
|
||||
static const __m128 SIMD_SP_min_char;
|
||||
|
@ -479,6 +480,7 @@ public:
|
|||
static const __m128 SIMD_SP_tiny;
|
||||
static const __m128 SIMD_SP_rsqrt_c0;
|
||||
static const __m128 SIMD_SP_rsqrt_c1;
|
||||
#endif
|
||||
|
||||
private:
|
||||
enum
|
||||
|
@ -526,9 +528,7 @@ idMath::InvSqrt16
|
|||
*/
|
||||
ID_INLINE float idMath::InvSqrt16( float x )
{
	// Only inputs above the smallest non-denormal float are inverted;
	// everything else (zero, denormals, negatives, NaN) yields INFINITY.
	if( x > FLT_SMALLEST_NON_DENORMAL )
	{
		return sqrtf( 1.0f / x );
	}
	return INFINITY;
}
|
||||
|
||||
/*
|
||||
|
@ -1321,8 +1321,21 @@ ID_INLINE int idMath::Ftoi( float f )
|
|||
// If a converted result is larger than the maximum signed doubleword integer,
|
||||
// the floating-point invalid exception is raised, and if this exception is masked,
|
||||
// the indefinite integer value (80000000H) is returned.
|
||||
#if defined(USE_INTRINSICS)
|
||||
__m128 x = _mm_load_ss( &f );
|
||||
return _mm_cvttss_si32( x );
|
||||
#elif 0 // round chop (C/C++ standard)
|
||||
int i, s, e, m, shift;
|
||||
i = *reinterpret_cast<int*>( &f );
|
||||
s = i >> IEEE_FLT_SIGN_BIT;
|
||||
e = ( ( i >> IEEE_FLT_MANTISSA_BITS ) & ( ( 1 << IEEE_FLT_EXPONENT_BITS ) - 1 ) ) - IEEE_FLT_EXPONENT_BIAS;
|
||||
m = ( i & ( ( 1 << IEEE_FLT_MANTISSA_BITS ) - 1 ) ) | ( 1 << IEEE_FLT_MANTISSA_BITS );
|
||||
shift = e - IEEE_FLT_MANTISSA_BITS;
|
||||
return ( ( ( ( m >> -shift ) | ( m << shift ) ) & ~( e >> INT32_SIGN_BIT ) ) ^ s ) - s;
|
||||
#else
|
||||
// If a converted result is larger than the maximum signed doubleword integer the result is undefined.
|
||||
return C_FLOAT_TO_INT( f );
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1332,10 +1345,24 @@ idMath::Ftoi8
|
|||
*/
|
||||
ID_INLINE char idMath::Ftoi8( float f )
{
	// Truncates f toward zero and clamps the result to the range [-128,127].
#if defined(USE_INTRINSICS)
	__m128 x = _mm_load_ss( &f );
	x = _mm_max_ss( x, SIMD_SP_min_char );
	x = _mm_min_ss( x, SIMD_SP_max_char );
	return static_cast<char>( _mm_cvttss_si32( x ) );
#else
	// Clamp in the floating-point domain BEFORE converting, like the SSE path does.
	// Converting a float that does not fit into an int (or a NaN) is undefined
	// behavior, so clamping the already converted integer gives platform dependent
	// results for large inputs (e.g. the 80000000H indefinite value on x86 would
	// turn a huge positive input into -128).
	if( !( f > -128.0f ) )
	{
		// f <= -128 or f is NaN (every comparison with NaN is false)
		return -128;
	}
	if( f > 127.0f )
	{
		return 127;
	}
	// -128 < f <= 127, so the truncated value is always representable
	return static_cast<char>( C_FLOAT_TO_INT( f ) );
#endif
}
|
||||
|
||||
/*
|
||||
|
@ -1345,10 +1372,24 @@ idMath::Ftoi16
|
|||
*/
|
||||
ID_INLINE short idMath::Ftoi16( float f )
{
	// Truncates f toward zero and clamps the result to the range [-32768,32767].
#if defined(USE_INTRINSICS)
	__m128 x = _mm_load_ss( &f );
	x = _mm_max_ss( x, SIMD_SP_min_short );
	x = _mm_min_ss( x, SIMD_SP_max_short );
	return static_cast<short>( _mm_cvttss_si32( x ) );
#else
	// Clamp in the floating-point domain BEFORE converting, like the SSE path does.
	// Converting a float that does not fit into an int (or a NaN) is undefined
	// behavior, so clamping the already converted integer gives platform dependent
	// results for large inputs (e.g. the 80000000H indefinite value on x86 would
	// turn a huge positive input into -32768).
	if( !( f > -32768.0f ) )
	{
		// f <= -32768 or f is NaN (every comparison with NaN is false)
		return -32768;
	}
	if( f > 32767.0f )
	{
		return 32767;
	}
	// -32768 < f <= 32767, so the truncated value is always representable
	return static_cast<short>( C_FLOAT_TO_INT( f ) );
#endif
}
|
||||
|
||||
/*
|
||||
|
@ -1382,10 +1423,25 @@ ID_INLINE byte idMath::Ftob( float f )
|
|||
{
|
||||
// If a converted result is negative the value (0) is returned and if the
|
||||
// converted result is larger than the maximum byte the value (255) is returned.
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
__m128 x = _mm_load_ss( &f );
|
||||
x = _mm_max_ss( x, SIMD_SP_zero );
|
||||
x = _mm_min_ss( x, SIMD_SP_255 );
|
||||
return static_cast<byte>( _mm_cvttss_si32( x ) );
|
||||
#else
|
||||
// The converted result is clamped to the range [0,255].
|
||||
int i = C_FLOAT_TO_INT( f );
|
||||
if( i < 0 )
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else if( i > 255 )
|
||||
{
|
||||
return 255;
|
||||
}
|
||||
return static_cast<byte>( i );
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -38,6 +38,7 @@ If you have questions concerning this license or the applicable additional terms
|
|||
// E
|
||||
//===============================================================
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
|
@ -973,3 +974,5 @@ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat* jointMats, const int* par
|
|||
}
|
||||
}
|
||||
|
||||
#endif // #if defined(USE_INTRINSICS)
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2013 Robert Beckebans
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
||||
|
@ -37,6 +38,8 @@ If you have questions concerning this license or the applicable additional terms
|
|||
===============================================================================
|
||||
*/
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
class idSIMD_SSE : public idSIMD_Generic
|
||||
{
|
||||
public:
|
||||
|
@ -50,4 +53,6 @@ public:
|
|||
virtual void VPCALL UntransformJoints( idJointMat* jointMats, const int* parents, const int firstJoint, const int lastJoint );
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* !__MATH_SIMD_SSE_H__ */
|
||||
|
|
|
@ -45,7 +45,10 @@ NOTE: due to the temporary memory pool idVecX cannot be used by multiple threads
|
|||
#define VECX_QUAD( x ) ( ( ( ( x ) + 3 ) & ~3 ) * sizeof( float ) )
|
||||
#define VECX_CLEAREND() int s = size; while( s < ( ( s + 3) & ~3 ) ) { p[s++] = 0.0f; }
|
||||
#define VECX_ALLOCA( n ) ( (float *) _alloca16( VECX_QUAD( n ) ) )
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
#define VECX_SIMD
|
||||
#endif
|
||||
|
||||
class idVecX
|
||||
{
|
||||
|
|
|
@ -516,6 +516,11 @@ ID_INLINE idVec3 operator*( const float a, const idVec3 b )
|
|||
return idVec3( b.x * a, b.y * a, b.z * a );
|
||||
}
|
||||
|
||||
ID_INLINE idVec3 operator/( const float a, const idVec3 b )
{
	// scalar divided by each component: ( a / b.x, a / b.y, a / b.z )
	const float qx = a / b.x;
	const float qy = a / b.y;
	const float qz = a / b.z;
	return idVec3( qx, qy, qz );
}
|
||||
|
||||
ID_INLINE idVec3 idVec3::operator+( const idVec3& a ) const
|
||||
{
|
||||
return idVec3( x + a.x, y + a.y, z + a.z );
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2013 Robert Beckebans
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
||||
|
@ -28,8 +29,11 @@ If you have questions concerning this license or the applicable additional terms
|
|||
#ifndef __SYS_INTRIINSICS_H__
|
||||
#define __SYS_INTRIINSICS_H__
|
||||
|
||||
#include <emmintrin.h>
|
||||
#define USE_INTRINSICS
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
/*
|
||||
================================================================================================
|
||||
|
||||
|
@ -91,6 +95,7 @@ ID_INLINE_EXTERN float __frndz( float x )
|
|||
================================================================================================
|
||||
*/
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
// The code below assumes that a cache line is 64 bytes.
|
||||
// We specify the cache line size as 128 here to make the code consistent with the consoles.
|
||||
#define CACHE_LINE_SIZE 128
|
||||
|
@ -122,6 +127,26 @@ ID_FORCE_INLINE void FlushCacheLine( const void* ptr, int offset )
|
|||
_mm_clflush( bytePtr + 64 );
|
||||
}
|
||||
|
||||
/*
|
||||
================================================
|
||||
Other
|
||||
================================================
|
||||
*/
|
||||
#else
|
||||
|
||||
#define CACHE_LINE_SIZE 128
|
||||
|
||||
ID_INLINE void Prefetch( const void* ptr, int offset )
{
	// no-op in the build without intrinsics
}
|
||||
ID_INLINE void ZeroCacheLine( void* ptr, int offset )
{
	// round ( ptr + offset ) down to the start of its cache line ...
	const uintptr_t address = ( uintptr_t )ptr + offset;
	const uintptr_t lineStart = address & ~( CACHE_LINE_SIZE - 1 );

	// ... and clear that whole line
	memset( ( byte* )lineStart, 0, CACHE_LINE_SIZE );
}
|
||||
ID_INLINE void FlushCacheLine( const void* ptr, int offset )
{
	// no-op in the build without intrinsics
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
================================================
|
||||
Block Clear Macros
|
||||
|
@ -168,6 +193,8 @@ ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size )
|
|||
================================================================================================
|
||||
*/
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
/*
|
||||
================================================
|
||||
PC Windows
|
||||
|
@ -194,6 +221,7 @@ ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size )
|
|||
#endif
|
||||
// DG end
|
||||
|
||||
|
||||
// make the intrinsics "type unsafe"
|
||||
typedef union DECLSPEC_INTRINTYPE _CRT_ALIGN( 16 ) __m128c
|
||||
{
|
||||
|
@ -275,4 +303,6 @@ ID_FORCE_INLINE_EXTERN __m128 _mm_div16_ps( __m128 x, __m128 y )
|
|||
// load idBounds::GetMaxs()
|
||||
#define _mm_loadu_bounds_1( bounds ) _mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[1].x ), (__m64 *) & bounds[1].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )
|
||||
|
||||
#endif // #if defined(USE_INTRINSICS)
|
||||
|
||||
#endif // !__SYS_INTRIINSICS_H__
|
||||
|
|
|
@ -79,6 +79,7 @@ void UnbindBufferObjects()
|
|||
qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
|
||||
}
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
void CopyBuffer( byte* dst, const byte* src, int numBytes )
|
||||
{
|
||||
|
@ -121,6 +122,16 @@ void CopyBuffer( byte* dst, const byte* src, int numBytes )
|
|||
_mm_sfence();
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Fallback used when USE_INTRINSICS is not defined: copies numBytes bytes
// from src to dst with memcpy after asserting that both are 16-byte aligned.
void CopyBuffer( byte* dst, const byte* src, int numBytes )
{
	assert_16_byte_aligned( dst );
	assert_16_byte_aligned( src );

	const size_t byteCount = static_cast<size_t>( numBytes );
	memcpy( dst, src, byteCount );
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
================================================================================================
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2013 Robert Beckebans
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
||||
|
@ -305,7 +306,11 @@ idDxtEncoder::CompressImageDXT1Fast
|
|||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: _Generic variant unless USE_INTRINSICS enables the SSE2 one
#if !defined(USE_INTRINSICS)
	CompressImageDXT1Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
|
@ -315,7 +320,11 @@ idDxtEncoder::CompressImageDXT1AlphaFast
|
|||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: _Generic variant unless USE_INTRINSICS enables the SSE2 one
#if !defined(USE_INTRINSICS)
	CompressImageDXT1AlphaFast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
|
@ -325,7 +334,11 @@ idDxtEncoder::CompressImageDXT5Fast
|
|||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: _Generic variant unless USE_INTRINSICS enables the SSE2 one
#if !defined(USE_INTRINSICS)
	CompressImageDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
|
@ -345,7 +358,11 @@ idDxtEncoder::CompressYCoCgDXT5Fast
|
|||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: _Generic variant unless USE_INTRINSICS enables the SSE2 one
#if !defined(USE_INTRINSICS)
	CompressYCoCgDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
|
@ -365,7 +382,11 @@ idDxtEncoder::CompressNormalMapDXT5Fast
|
|||
*/
|
||||
ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: _Generic variant unless USE_INTRINSICS enables the SSE2 one
#if !defined(USE_INTRINSICS)
	CompressNormalMapDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -34,6 +34,7 @@ Contains the DxtEncoder implementation for SSE2.
|
|||
#include "DXTCodec_local.h"
|
||||
#include "DXTCodec.h"
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
//#define TEST_COMPRESSION
|
||||
#ifdef TEST_COMPRESSION
|
||||
|
@ -1627,3 +1628,4 @@ void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte* inBuf, byte* outB
|
|||
#endif
|
||||
}
|
||||
|
||||
#endif // #if defined(USE_INTRINSICS)
|
|
@ -74,7 +74,7 @@ R_MatrixMultiply
|
|||
*/
|
||||
void R_MatrixMultiply( const float a[16], const float b[16], float out[16] )
|
||||
{
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
__m128 a0 = _mm_loadu_ps( a + 0 * 4 );
|
||||
__m128 a1 = _mm_loadu_ps( a + 1 * 4 );
|
||||
__m128 a2 = _mm_loadu_ps( a + 2 * 4 );
|
||||
|
@ -110,6 +110,41 @@ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] )
|
|||
_mm_storeu_ps( out + 2 * 4, t2 );
|
||||
_mm_storeu_ps( out + 3 * 4, t3 );
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
for ( int i = 0; i < 4; i++ ) {
|
||||
for ( int j = 0; j < 4; j++ ) {
|
||||
out[ i * 4 + j ] =
|
||||
a[ i * 4 + 0 ] * b[ 0 * 4 + j ] +
|
||||
a[ i * 4 + 1 ] * b[ 1 * 4 + j ] +
|
||||
a[ i * 4 + 2 ] * b[ 2 * 4 + j ] +
|
||||
a[ i * 4 + 3 ] * b[ 3 * 4 + j ];
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
out[0 * 4 + 0] = a[0 * 4 + 0] * b[0 * 4 + 0] + a[0 * 4 + 1] * b[1 * 4 + 0] + a[0 * 4 + 2] * b[2 * 4 + 0] + a[0 * 4 + 3] * b[3 * 4 + 0];
|
||||
out[0 * 4 + 1] = a[0 * 4 + 0] * b[0 * 4 + 1] + a[0 * 4 + 1] * b[1 * 4 + 1] + a[0 * 4 + 2] * b[2 * 4 + 1] + a[0 * 4 + 3] * b[3 * 4 + 1];
|
||||
out[0 * 4 + 2] = a[0 * 4 + 0] * b[0 * 4 + 2] + a[0 * 4 + 1] * b[1 * 4 + 2] + a[0 * 4 + 2] * b[2 * 4 + 2] + a[0 * 4 + 3] * b[3 * 4 + 2];
|
||||
out[0 * 4 + 3] = a[0 * 4 + 0] * b[0 * 4 + 3] + a[0 * 4 + 1] * b[1 * 4 + 3] + a[0 * 4 + 2] * b[2 * 4 + 3] + a[0 * 4 + 3] * b[3 * 4 + 3];
|
||||
|
||||
out[1 * 4 + 0] = a[1 * 4 + 0] * b[0 * 4 + 0] + a[1 * 4 + 1] * b[1 * 4 + 0] + a[1 * 4 + 2] * b[2 * 4 + 0] + a[1 * 4 + 3] * b[3 * 4 + 0];
|
||||
out[1 * 4 + 1] = a[1 * 4 + 0] * b[0 * 4 + 1] + a[1 * 4 + 1] * b[1 * 4 + 1] + a[1 * 4 + 2] * b[2 * 4 + 1] + a[1 * 4 + 3] * b[3 * 4 + 1];
|
||||
out[1 * 4 + 2] = a[1 * 4 + 0] * b[0 * 4 + 2] + a[1 * 4 + 1] * b[1 * 4 + 2] + a[1 * 4 + 2] * b[2 * 4 + 2] + a[1 * 4 + 3] * b[3 * 4 + 2];
|
||||
out[1 * 4 + 3] = a[1 * 4 + 0] * b[0 * 4 + 3] + a[1 * 4 + 1] * b[1 * 4 + 3] + a[1 * 4 + 2] * b[2 * 4 + 3] + a[1 * 4 + 3] * b[3 * 4 + 3];
|
||||
|
||||
out[2 * 4 + 0] = a[2 * 4 + 0] * b[0 * 4 + 0] + a[2 * 4 + 1] * b[1 * 4 + 0] + a[2 * 4 + 2] * b[2 * 4 + 0] + a[2 * 4 + 3] * b[3 * 4 + 0];
|
||||
out[2 * 4 + 1] = a[2 * 4 + 0] * b[0 * 4 + 1] + a[2 * 4 + 1] * b[1 * 4 + 1] + a[2 * 4 + 2] * b[2 * 4 + 1] + a[2 * 4 + 3] * b[3 * 4 + 1];
|
||||
out[2 * 4 + 2] = a[2 * 4 + 0] * b[0 * 4 + 2] + a[2 * 4 + 1] * b[1 * 4 + 2] + a[2 * 4 + 2] * b[2 * 4 + 2] + a[2 * 4 + 3] * b[3 * 4 + 2];
|
||||
out[2 * 4 + 3] = a[2 * 4 + 0] * b[0 * 4 + 3] + a[2 * 4 + 1] * b[1 * 4 + 3] + a[2 * 4 + 2] * b[2 * 4 + 3] + a[2 * 4 + 3] * b[3 * 4 + 3];
|
||||
|
||||
out[3 * 4 + 0] = a[3 * 4 + 0] * b[0 * 4 + 0] + a[3 * 4 + 1] * b[1 * 4 + 0] + a[3 * 4 + 2] * b[2 * 4 + 0] + a[3 * 4 + 3] * b[3 * 4 + 0];
|
||||
out[3 * 4 + 1] = a[3 * 4 + 0] * b[0 * 4 + 1] + a[3 * 4 + 1] * b[1 * 4 + 1] + a[3 * 4 + 2] * b[2 * 4 + 1] + a[3 * 4 + 3] * b[3 * 4 + 1];
|
||||
out[3 * 4 + 2] = a[3 * 4 + 0] * b[0 * 4 + 2] + a[3 * 4 + 1] * b[1 * 4 + 2] + a[3 * 4 + 2] * b[2 * 4 + 2] + a[3 * 4 + 3] * b[3 * 4 + 2];
|
||||
out[3 * 4 + 3] = a[3 * 4 + 0] * b[0 * 4 + 3] + a[3 * 4 + 1] * b[1 * 4 + 3] + a[3 * 4 + 2] * b[2 * 4 + 3] + a[3 * 4 + 3] * b[3 * 4 + 3];
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -302,10 +302,10 @@ static void R_DecalPointCullStatic( byte* cullBits, const idPlane* planes, const
|
|||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
const __m128 vector_float_zero = _mm_setzero_ps();
|
||||
const __m128i vector_int_mask0 = _mm_set1_epi32( 1 << 0 );
|
||||
const __m128i vector_int_mask1 = _mm_set1_epi32( 1 << 1 );
|
||||
const __m128i vector_int_mask2 = _mm_set1_epi32( 1 << 2 );
|
||||
|
@ -406,6 +406,39 @@ static void R_DecalPointCullStatic( byte* cullBits, const idPlane* planes, const
|
|||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for( int i = 0; i < numVerts; )
|
||||
{
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for( ; i <= nextNumVerts; i++ )
|
||||
{
|
||||
const idVec3& v = vertsODS[i].xyz;
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
const float d4 = planes[4].Distance( v );
|
||||
const float d5 = planes[5].Distance( v );
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITNOTSET( d0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d3 ) << 3;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d4 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITNOTSET( d5 ) << 5;
|
||||
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -637,6 +670,7 @@ static void R_CopyDecalSurface( idDrawVert* verts, int numVerts, triIndex_t* ind
|
|||
assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
||||
assert_16_byte_aligned( fadeColor );
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
||||
const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
|
||||
|
@ -678,6 +712,28 @@ static void R_CopyDecalSurface( idDrawVert* verts, int numVerts, triIndex_t* ind
|
|||
|
||||
_mm_sfence();
|
||||
|
||||
#else
|
||||
|
||||
// copy vertices and apply depth/time based fading
|
||||
for( int i = 0; i < decal->numVerts; i++ )
|
||||
{
|
||||
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
|
||||
verts[numVerts + i] = decal->verts[i];
|
||||
for( int j = 0; j < 4; j++ )
|
||||
{
|
||||
verts[numVerts + i].color[j] = idMath::Ftob( fadeColor[j] * decal->vertDepthFade[i] );
|
||||
}
|
||||
}
|
||||
|
||||
// copy indices
|
||||
assert( ( decal->numIndexes & 1 ) == 0 );
|
||||
for( int i = 0; i < decal->numIndexes; i += 2 )
|
||||
{
|
||||
assert( decal->indexes[i + 0] < decal->numVerts && decal->indexes[i + 1] < decal->numVerts );
|
||||
WriteIndexPair( &indexes[numIndexes + i], numVerts + decal->indexes[i + 0], numVerts + decal->indexes[i + 1] );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2013 Robert Beckebans
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
||||
|
@ -111,7 +112,7 @@ static void R_OverlayPointCullStatic( byte* cullBits, halfFloat_t* texCoordS, ha
|
|||
assert_16_byte_aligned( texCoordT );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
|
@ -187,6 +188,41 @@ static void R_OverlayPointCullStatic( byte* cullBits, halfFloat_t* texCoordS, ha
|
|||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for( int i = 0; i < numVerts; )
|
||||
{
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for( ; i <= nextNumVerts; i++ )
|
||||
{
|
||||
const idVec3& v = vertsODS[i].xyz;
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = 1.0f - d0;
|
||||
const float d3 = 1.0f - d1;
|
||||
|
||||
halfFloat_t s = Scalar_FastF32toF16( d0 );
|
||||
halfFloat_t t = Scalar_FastF32toF16( d1 );
|
||||
|
||||
texCoordS[i] = s;
|
||||
texCoordT[i] = t;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
|
||||
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -201,7 +237,7 @@ static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, h
|
|||
assert_16_byte_aligned( texCoordT );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
|
||||
|
@ -277,6 +313,41 @@ static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, h
|
|||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for( int i = 0; i < numVerts; )
|
||||
{
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for( ; i <= nextNumVerts; i++ )
|
||||
{
|
||||
const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||
|
||||
const float d0 = planes[0].Distance( transformed );
|
||||
const float d1 = planes[1].Distance( transformed );
|
||||
const float d2 = 1.0f - d0;
|
||||
const float d3 = 1.0f - d1;
|
||||
|
||||
halfFloat_t s = Scalar_FastF32toF16( d0 );
|
||||
halfFloat_t t = Scalar_FastF32toF16( d1 );
|
||||
|
||||
texCoordS[i] = s;
|
||||
texCoordT[i] = t;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
|
||||
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -486,6 +557,7 @@ static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* i
|
|||
assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
|
||||
assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
|
||||
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
|
||||
|
@ -524,6 +596,30 @@ static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* i
|
|||
|
||||
_mm_sfence();
|
||||
|
||||
#else
|
||||
|
||||
// copy vertices
|
||||
for( int i = 0; i < overlay->numVerts; i++ )
|
||||
{
|
||||
const overlayVertex_t& overlayVert = overlay->verts[i];
|
||||
|
||||
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
|
||||
verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
|
||||
|
||||
// RB begin
|
||||
verts[numVerts + i].SetTexCoordS( overlayVert.st[0] );
|
||||
verts[numVerts + i].SetTexCoordT( overlayVert.st[1] );
|
||||
// RB end
|
||||
}
|
||||
|
||||
// copy indexes
|
||||
for( int i = 0; i < overlay->numIndexes; i += 2 )
|
||||
{
|
||||
assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
|
||||
WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -32,10 +32,10 @@ If you have questions concerning this license or the applicable additional terms
|
|||
#include "tr_local.h"
|
||||
#include "Model_local.h"
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
static const __m128 vector_float_posInfinity = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
|
||||
static const __m128 vector_float_negInfinity = { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY };
|
||||
|
||||
#endif
|
||||
|
||||
static const char* MD5_SnapshotName = "_MD5_Snapshot_";
|
||||
|
||||
|
@ -561,6 +561,7 @@ idMD5Mesh::CalculateBounds
|
|||
*/
|
||||
void idMD5Mesh::CalculateBounds( const idJointMat* entJoints, idBounds& bounds ) const
|
||||
{
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
__m128 minX = vector_float_posInfinity;
|
||||
__m128 minY = vector_float_posInfinity;
|
||||
|
@ -595,6 +596,17 @@ void idMD5Mesh::CalculateBounds( const idJointMat* entJoints, idBounds& bounds )
|
|||
_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
|
||||
_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
|
||||
|
||||
#else
|
||||
|
||||
bounds.Clear();
|
||||
for( int i = 0; i < numMeshJoints; i++ )
|
||||
{
|
||||
const idJointMat& joint = entJoints[meshJoints[i]];
|
||||
bounds.AddPoint( joint.GetTranslation() );
|
||||
}
|
||||
bounds.ExpandSelf( maxJointVertDist );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1220,6 +1232,7 @@ static void TransformJoints( idJointMat* __restrict outJoints, const int numJoin
|
|||
assert_16_byte_aligned( inFloats1 );
|
||||
assert_16_byte_aligned( inFloats2 );
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
|
||||
|
||||
|
@ -1296,6 +1309,14 @@ static void TransformJoints( idJointMat* __restrict outJoints, const int numJoin
|
|||
_mm_store_ps( outFloats + 1 * 12 + 8, ri1 );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
for( int i = 0; i < numJoints; i++ )
|
||||
{
|
||||
idJointMat::Multiply( outJoints[i], inJoints1[i], inJoints2[i] );
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -92,7 +92,7 @@ static void R_ShadowVolumeCullBits( byte* cullBits, byte& totalOr, const float r
|
|||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 );
|
||||
|
@ -215,6 +215,56 @@ static void R_ShadowVolumeCullBits( byte* cullBits, byte& totalOr, const float r
|
|||
|
||||
totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
byte tOr = 0;
|
||||
for( int i = 0; i < numVerts; )
|
||||
{
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for( ; i <= nextNumVerts; i++ )
|
||||
{
|
||||
const idVec3& v = vertsODS[i].xyzw.ToVec3();
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
|
||||
const float t0 = d0 + radius;
|
||||
const float t1 = d1 + radius;
|
||||
const float t2 = d2 + radius;
|
||||
const float t3 = d3 + radius;
|
||||
|
||||
const float s0 = d0 - radius;
|
||||
const float s1 = d1 - radius;
|
||||
const float s2 = d2 - radius;
|
||||
const float s3 = d3 - radius;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||
|
||||
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||
|
||||
bits ^= 0x0F; // flip lower four bits
|
||||
|
||||
tOr |= bits;
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
totalOr = tOr;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -31,14 +31,16 @@ If you have questions concerning this license or the applicable additional terms
|
|||
#include "../../../idlib/sys/sys_intrinsics.h"
|
||||
#include "../../../idlib/geometry/DrawVert_intrinsics.h"
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
static const __m128i vector_int_neg_one = _mm_set_epi32( -1, -1, -1, -1 );
|
||||
#endif
|
||||
|
||||
/*
|
||||
=====================
|
||||
TriangleFacing_SSE2
|
||||
=====================
|
||||
*/
|
||||
#if defined(USE_INTRINSICS)
|
||||
static ID_FORCE_INLINE __m128i TriangleFacing_SSE2( const __m128& vert0X, const __m128& vert0Y, const __m128& vert0Z,
|
||||
const __m128& vert1X, const __m128& vert1Y, const __m128& vert1Z,
|
||||
const __m128& vert2X, const __m128& vert2Y, const __m128& vert2Z,
|
||||
|
@ -60,6 +62,7 @@ static ID_FORCE_INLINE __m128i TriangleFacing_SSE2( const __m128& vert0X, const
|
|||
const __m128 delta = _mm_nmsub_ps( lightOriginX, normalX, _mm_nmsub_ps( lightOriginY, normalY, _mm_nmsub_ps( lightOriginZ, normalZ, normalW ) ) );
|
||||
return _mm_castps_si128( _mm_cmplt_ps( delta, _mm_setzero_ps() ) );
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
=====================
|
||||
|
@ -68,6 +71,7 @@ TriangleCulled
|
|||
The clip space of the 'lightProject' is assumed to be in the range [0, 1].
|
||||
=====================
|
||||
*/
|
||||
#if defined(USE_INTRINSICS)
|
||||
static ID_FORCE_INLINE __m128i TriangleCulled_SSE2( const __m128& vert0X, const __m128& vert0Y, const __m128& vert0Z,
|
||||
const __m128& vert1X, const __m128& vert1Y, const __m128& vert1Z,
|
||||
const __m128& vert2X, const __m128& vert2Y, const __m128& vert2Z,
|
||||
|
@ -128,6 +132,92 @@ static ID_FORCE_INLINE __m128i TriangleCulled_SSE2( const __m128& vert0X, const
|
|||
return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*
|
||||
=====================
|
||||
TriangleFacing
|
||||
|
||||
Returns 255 if the triangle is facing the light origin, otherwise returns 0.
|
||||
=====================
|
||||
*/
|
||||
static byte TriangleFacing_Generic( const idVec3& v1, const idVec3& v2, const idVec3& v3, const idVec3& lightOrigin )
{
	// edge from v1 to v2
	const float edgeAX = v2.x - v1.x;
	const float edgeAY = v2.y - v1.y;
	const float edgeAZ = v2.z - v1.z;

	// edge from v1 to v3
	const float edgeBX = v3.x - v1.x;
	const float edgeBY = v3.y - v1.y;
	const float edgeBZ = v3.z - v1.z;

	// unnormalized triangle plane: normal = edgeB x edgeA, planeDist = normal . v1
	const float planeNX = edgeBY * edgeAZ - edgeBZ * edgeAY;
	const float planeNY = edgeBZ * edgeAX - edgeBX * edgeAZ;
	const float planeNZ = edgeBX * edgeAY - edgeBY * edgeAX;
	const float planeDist = planeNX * v1.x + planeNY * v1.y + planeNZ * v1.z;

	// which side of the triangle plane the light origin lies on; the value is
	// scaled by the normal length, only its sign is used
	const float side = lightOrigin.x * planeNX + lightOrigin.y * planeNY + lightOrigin.z * planeNZ - planeDist;

	if( side > 0.0f )
	{
		return 255;
	}
	return 0;
}
|
||||
|
||||
/*
|
||||
=====================
|
||||
TriangleCulled
|
||||
|
||||
Returns 255 if the triangle is culled to the light projection matrix, otherwise returns 0.
|
||||
The clip space of the 'lightProject' is assumed to be in the range [0, 1].
|
||||
=====================
|
||||
*/
|
||||
static byte TriangleCulled_Generic( const idVec3& v1, const idVec3& v2, const idVec3& v3, const idRenderMatrix& lightProject )
{
	const idVec3* const corners[3] = { &v1, &v2, &v3 };
	const float minW = 0.0f;

	// One bit per frustum side; a bit is set as soon as any corner of the
	// triangle lies on the inner side of that plane.
	int insideBits = 0;

	for( int n = 0; n < 3; n++ )
	{
		const idVec3& p = *corners[n];

		// transform the corner by the light projection, taking the point's w as 1
		float clip[4];
		for( int row = 0; row < 4; row++ )
		{
			clip[row] = p[0] * lightProject[row][0] + p[1] * lightProject[row][1] + p[2] * lightProject[row][2] + lightProject[row][3];
		}

		const float maxW = clip[3];

		// each of x, y, z is tested against the [0, w] range
		insideBits |= ( clip[0] > minW ) ? ( 1 << 0 ) : 0;
		insideBits |= ( clip[0] < maxW ) ? ( 1 << 1 ) : 0;
		insideBits |= ( clip[1] > minW ) ? ( 1 << 2 ) : 0;
		insideBits |= ( clip[1] < maxW ) ? ( 1 << 3 ) : 0;
		insideBits |= ( clip[2] > minW ) ? ( 1 << 4 ) : 0;
		insideBits |= ( clip[2] < maxW ) ? ( 1 << 5 ) : 0;
	}

	// all six bits set means no single plane has the whole triangle outside it;
	// any missing bit means the triangle is completely off one side of the frustum
	if( insideBits == 63 )
	{
		return 0;
	}
	return 255;
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
=====================
|
||||
|
@ -159,6 +249,7 @@ static int CalculateTriangleFacingCulledStatic( byte* __restrict facing, byte* _
|
|||
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
||||
const float lineLength = lineLengthSqr * lineLengthRcp;
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4* 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
|
||||
|
||||
|
@ -271,6 +362,61 @@ static int CalculateTriangleFacingCulledStatic( byte* __restrict facing, byte* _
|
|||
|
||||
return _mm_cvtsi128_si32( numFrontFacing );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
|
||||
|
||||
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
|
||||
|
||||
int numFrontFacing = 0;
|
||||
|
||||
for( int i = 0, j = 0; i < numIndexes; )
|
||||
{
|
||||
|
||||
const int batchStart = i;
|
||||
const int batchEnd = indexedVertsODS.FetchNextBatch();
|
||||
const int indexStart = j;
|
||||
|
||||
for( ; i <= batchEnd - 3; i += 3, j++ )
|
||||
{
|
||||
const idVec3& v1 = indexedVertsODS[i + 0].xyz;
|
||||
const idVec3& v2 = indexedVertsODS[i + 1].xyz;
|
||||
const idVec3& v3 = indexedVertsODS[i + 2].xyz;
|
||||
|
||||
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
|
||||
|
||||
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
|
||||
|
||||
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
|
||||
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
|
||||
|
||||
culled[j] = triangleCulled;
|
||||
facing[j] = triangleFacing;
|
||||
|
||||
// count the number of facing triangles
|
||||
numFrontFacing += ( triangleFacing & 1 );
|
||||
}
|
||||
|
||||
if( insideShadowVolume != NULL )
|
||||
{
|
||||
for( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ )
|
||||
{
|
||||
if( !facing[n] )
|
||||
{
|
||||
if( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) )
|
||||
{
|
||||
*insideShadowVolume = true;
|
||||
insideShadowVolume = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return numFrontFacing;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -303,6 +449,7 @@ static int CalculateTriangleFacingCulledSkinned( byte* __restrict facing, byte*
|
|||
const idVec3 lineDir = lineDelta * lineLengthRcp;
|
||||
const float lineLength = lineLengthSqr * lineLengthRcp;
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
|
||||
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
|
@ -448,6 +595,82 @@ static int CalculateTriangleFacingCulledSkinned( byte* __restrict facing, byte*
|
|||
|
||||
return _mm_cvtsi128_si32( numFrontFacing );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
for( int i = 0; i < numVerts; )
|
||||
{
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for( ; i <= nextNumVerts; i++ )
|
||||
{
|
||||
tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||
tempVerts[i].w = 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
|
||||
|
||||
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
|
||||
|
||||
int numFrontFacing = 0;
|
||||
|
||||
for( int i = 0, j = 0; i < numIndexes; )
|
||||
{
|
||||
|
||||
const int batchStart = i;
|
||||
const int batchEnd = indexesODS.FetchNextBatch();
|
||||
const int indexStart = j;
|
||||
|
||||
for( ; i <= batchEnd - 3; i += 3, j++ )
|
||||
{
|
||||
const int i0 = indexesODS[i + 0];
|
||||
const int i1 = indexesODS[i + 1];
|
||||
const int i2 = indexesODS[i + 2];
|
||||
|
||||
const idVec3& v1 = tempVerts[i0].ToVec3();
|
||||
const idVec3& v2 = tempVerts[i1].ToVec3();
|
||||
const idVec3& v3 = tempVerts[i2].ToVec3();
|
||||
|
||||
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
|
||||
|
||||
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
|
||||
|
||||
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
|
||||
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
|
||||
|
||||
culled[j] = triangleCulled;
|
||||
facing[j] = triangleFacing;
|
||||
|
||||
// count the number of facing triangles
|
||||
numFrontFacing += ( triangleFacing & 1 );
|
||||
}
|
||||
|
||||
if( insideShadowVolume != NULL )
|
||||
{
|
||||
for( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ )
|
||||
{
|
||||
if( !facing[n] )
|
||||
{
|
||||
const int i0 = indexesODS[k + 0];
|
||||
const int i1 = indexesODS[k + 1];
|
||||
const int i2 = indexesODS[k + 2];
|
||||
if( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) )
|
||||
{
|
||||
*insideShadowVolume = true;
|
||||
insideShadowVolume = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return numFrontFacing;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -461,6 +684,7 @@ static void StreamOut( void* dst, const void* src, int numBytes )
|
|||
assert_16_byte_aligned( dst );
|
||||
assert_16_byte_aligned( src );
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
int i = 0;
|
||||
for( ; i + 128 <= numBytes; i += 128 )
|
||||
{
|
||||
|
@ -486,6 +710,9 @@ static void StreamOut( void* dst, const void* src, int numBytes )
|
|||
__m128i d = _mm_load_si128( ( __m128i* )( ( byte* )src + i ) );
|
||||
_mm_stream_si128( ( __m128i* )( ( byte* )dst + i ), d );
|
||||
}
|
||||
#else
|
||||
memcpy( dst, src, numBytes );
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -706,7 +933,9 @@ static void R_CreateShadowVolumeTriangles( triIndex_t* __restrict shadowIndices,
|
|||
|
||||
numShadowIndexesTotal = numShadowIndices;
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
_mm_sfence();
|
||||
#endif
|
||||
|
||||
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
||||
|
||||
|
@ -893,7 +1122,9 @@ void R_CreateLightTriangles( triIndex_t* __restrict lightIndices, triIndex_t* __
|
|||
|
||||
numLightIndicesTotal = numLightIndices;
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
_mm_sfence();
|
||||
#endif
|
||||
|
||||
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
Doom 3 BFG Edition GPL Source Code
|
||||
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
|
||||
Copyright (C) 2013 Robert Beckebans
|
||||
|
||||
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
|
||||
|
||||
|
@ -44,7 +45,7 @@ static void R_TracePointCullStatic( byte* cullBits, byte& totalOr, const float r
|
|||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 );
|
||||
|
@ -167,6 +168,56 @@ static void R_TracePointCullStatic( byte* cullBits, byte& totalOr, const float r
|
|||
|
||||
totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
byte tOr = 0;
|
||||
for( int i = 0; i < numVerts; )
|
||||
{
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for( ; i <= nextNumVerts; i++ )
|
||||
{
|
||||
const idVec3& v = vertsODS[i].xyz;
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
|
||||
const float t0 = d0 + radius;
|
||||
const float t1 = d1 + radius;
|
||||
const float t2 = d2 + radius;
|
||||
const float t3 = d3 + radius;
|
||||
|
||||
const float s0 = d0 - radius;
|
||||
const float s1 = d1 - radius;
|
||||
const float s2 = d2 - radius;
|
||||
const float s3 = d3 - radius;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||
|
||||
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||
|
||||
bits ^= 0x0F; // flip lower four bits
|
||||
|
||||
tOr |= bits;
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
totalOr = tOr;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -179,7 +230,7 @@ static void R_TracePointCullSkinned( byte* cullBits, byte& totalOr, const float
|
|||
assert_16_byte_aligned( cullBits );
|
||||
assert_16_byte_aligned( verts );
|
||||
|
||||
|
||||
#if defined(USE_INTRINSICS)
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
|
||||
|
||||
const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 );
|
||||
|
@ -302,6 +353,56 @@ static void R_TracePointCullSkinned( byte* cullBits, byte& totalOr, const float
|
|||
|
||||
totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte );
|
||||
|
||||
#else
|
||||
|
||||
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
|
||||
|
||||
byte tOr = 0;
|
||||
for( int i = 0; i < numVerts; )
|
||||
{
|
||||
|
||||
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
|
||||
|
||||
for( ; i <= nextNumVerts; i++ )
|
||||
{
|
||||
const idVec3 v = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
|
||||
|
||||
const float d0 = planes[0].Distance( v );
|
||||
const float d1 = planes[1].Distance( v );
|
||||
const float d2 = planes[2].Distance( v );
|
||||
const float d3 = planes[3].Distance( v );
|
||||
|
||||
const float t0 = d0 + radius;
|
||||
const float t1 = d1 + radius;
|
||||
const float t2 = d2 + radius;
|
||||
const float t3 = d3 + radius;
|
||||
|
||||
const float s0 = d0 - radius;
|
||||
const float s1 = d1 - radius;
|
||||
const float s2 = d2 - radius;
|
||||
const float s3 = d3 - radius;
|
||||
|
||||
byte bits;
|
||||
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
|
||||
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
|
||||
|
||||
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
|
||||
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
|
||||
|
||||
bits ^= 0x0F; // flip lower four bits
|
||||
|
||||
tOr |= bits;
|
||||
cullBits[i] = bits;
|
||||
}
|
||||
}
|
||||
|
||||
totalOr = tOr;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -1722,10 +1722,12 @@ void R_TestDegenerateTextureSpace( srfTriangles_t* tri )
|
|||
const idDrawVert& b = tri->verts[tri->indexes[i + 1]];
|
||||
const idDrawVert& c = tri->verts[tri->indexes[i + 2]];
|
||||
|
||||
if( a.st == b.st || b.st == c.st || c.st == a.st )
|
||||
// RB: compare texcoords instead of pointers
|
||||
if( a.GetTexCoord() == b.GetTexCoord() || b.GetTexCoord() == c.GetTexCoord() || c.GetTexCoord() == a.GetTexCoord() )
|
||||
{
|
||||
c_degenerate++;
|
||||
}
|
||||
// RB end
|
||||
}
|
||||
|
||||
if( c_degenerate )
|
||||
|
|
Loading…
Reference in a new issue