Merged generic C++ fallbacks for SSE optimized code to allow support for non-x86 based platforms like ARM

This commit is contained in:
Robert Beckebans 2013-06-01 15:13:00 +02:00
parent e510691c36
commit db715535cc
28 changed files with 2747 additions and 337 deletions

View file

@ -1,4 +1,4 @@
astyle.exe -v --options=astyle-options.ini --exclude="libs" --recursive *.h
astyle.exe -v --options=astyle-options.ini --exclude="libs" --exclude="idlib/math/Simd.cpp" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --recursive *.h
astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
pause

View file

@ -1,5 +1,5 @@
#!/bin/sh
./astyle.exe -v --options=astyle-options.ini --exclude="libs" --recursive *.h
./astyle.exe -v --options=astyle-options.ini --exclude="libs" --exclude="idlib/math/Simd.cpp" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
./astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --recursive *.h
./astyle.exe -v --formatted --options=astyle-options.ini --exclude="libs" --exclude="d3xp/gamesys/SysCvar.cpp" --exclude="d3xp/gamesys/Callbacks.cpp" --exclude="sys/win32/win_cpu.cpp" --exclude="sys/win32/win_main.cpp" --recursive *.cpp
#pause

View file

@ -301,7 +301,13 @@ idCVar idFileSystemLocal::fs_debugResources( "fs_debugResources", "0", CVAR_SYST
idCVar idFileSystemLocal::fs_enableBGL( "fs_enableBGL", "0", CVAR_SYSTEM | CVAR_BOOL, "" );
idCVar idFileSystemLocal::fs_debugBGL( "fs_debugBGL", "0", CVAR_SYSTEM | CVAR_BOOL, "" );
idCVar idFileSystemLocal::fs_copyfiles( "fs_copyfiles", "0", CVAR_SYSTEM | CVAR_INIT | CVAR_BOOL, "Copy every file touched to fs_savepath" );
// RB
#if defined(RETAIL)
idCVar idFileSystemLocal::fs_buildResources( "fs_buildresources", "0", CVAR_SYSTEM | CVAR_BOOL | CVAR_INIT, "Copy every file touched to a resource file" );
#else
idCVar idFileSystemLocal::fs_buildResources( "fs_buildresources", "1", CVAR_SYSTEM | CVAR_BOOL | CVAR_INIT, "Copy every file touched to a resource file" );
#endif
// RB end
idCVar idFileSystemLocal::fs_game( "fs_game", "", CVAR_SYSTEM | CVAR_INIT | CVAR_SERVERINFO, "mod path" );
idCVar idFileSystemLocal::fs_game_base( "fs_game_base", "", CVAR_SYSTEM | CVAR_INIT | CVAR_SERVERINFO, "alternate mod path, searched after the main fs_game path, before the basedir" );

View file

@ -3,6 +3,7 @@
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2013 Robert Beckebans
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
@ -113,11 +114,20 @@ ID_INLINE halfFloat_t F32toF16( float a )
class idDrawVert
{
friend class idSwap;
friend class idShadowVertSkinned;
friend class idRenderModelStatic;
friend void TransformVertsAndTangents( idDrawVert* targetVerts, const int numVerts, const idDrawVert* baseVerts, const idJointMat* joints );
public:
idVec3 xyz; // 12 bytes
private:
// RB: don't let the old tools code mess with these values
halfFloat_t st[2]; // 4 bytes
byte normal[4]; // 4 bytes
byte tangent[4]; // 4 bytes -- [3] is texture polarity sign
public:
byte color[4]; // 4 bytes
byte color2[4]; // 4 bytes -- weights for skinning
@ -187,9 +197,14 @@ public:
#define DRAWVERT_COLOR_OFFSET (6*4)
#define DRAWVERT_COLOR2_OFFSET (7*4)
// RB begin
assert_sizeof( idDrawVert, DRAWVERT_SIZE );
#if 0
assert_offsetof( idDrawVert, xyz, DRAWVERT_XYZ_OFFSET );
assert_offsetof( idDrawVert, normal, DRAWVERT_NORMAL_OFFSET );
assert_offsetof( idDrawVert, tangent, DRAWVERT_TANGENT_OFFSET );
#endif
// RB end
/*
========================
@ -202,6 +217,7 @@ ID_INLINE void VertexFloatToByte( const float& x, const float& y, const float& z
{
assert_4_byte_aligned( bval ); // for __stvebx
#if defined(USE_INTRINSICS)
const __m128 vector_float_one = { 1.0f, 1.0f, 1.0f, 1.0f };
const __m128 vector_float_half = { 0.5f, 0.5f, 0.5f, 0.5f };
@ -218,6 +234,13 @@ ID_INLINE void VertexFloatToByte( const float& x, const float& y, const float& z
bval[1] = ( byte )_mm_extract_epi16( xyz16, 1 );
bval[2] = ( byte )_mm_extract_epi16( xyz16, 2 );
#else
bval[0] = VERTEX_FLOAT_TO_BYTE( x );
bval[1] = VERTEX_FLOAT_TO_BYTE( y );
bval[2] = VERTEX_FLOAT_TO_BYTE( z );
#endif
}
/*
@ -655,6 +678,7 @@ ID_INLINE void WriteDrawVerts16( idDrawVert* destVerts, const idDrawVert* localV
assert_16_byte_aligned( destVerts );
assert_16_byte_aligned( localVerts );
#if defined(USE_INTRINSICS)
for( int i = 0; i < numVerts; i++ )
{
@ -664,6 +688,11 @@ ID_INLINE void WriteDrawVerts16( idDrawVert* destVerts, const idDrawVert* localV
_mm_stream_si128( ( __m128i* )( ( byte* )( destVerts + i ) + 16 ), v1 );
}
#else
memcpy( destVerts, localVerts, numVerts * sizeof( idDrawVert ) );
#endif
}
/*

View file

@ -30,6 +30,7 @@ If you have questions concerning this license or the applicable additional terms
#define __DRAWVERT_INTRINSICS_H__
#if defined(USE_INTRINSICS)
static const __m128i vector_int_f32_sign_mask = _mm_set1_epi32( 1U << IEEE_FLT_SIGN_BIT );
static const __m128i vector_int_f32_exponent_mask = _mm_set1_epi32( ( ( 1U << IEEE_FLT_EXPONENT_BITS ) - 1 ) << IEEE_FLT_MANTISSA_BITS );
static const __m128i vector_int_f32_mantissa_mask = _mm_set1_epi32( ( 1U << IEEE_FLT_MANTISSA_BITS ) - 1 );
@ -50,13 +51,14 @@ static const __m128 vector_float_last_one = { 0.0f, 0.0f, 0.0f, 1.0
static const __m128 vector_float_1_over_255 = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f };
static const __m128 vector_float_1_over_4 = { 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f, 1.0f / 4.0f };
#endif
/*
====================
FastF32toF16
====================
*/
#if defined(USE_INTRINSICS)
ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits )
{
__m128i f16_sign = _mm_srli_epi32( _mm_and_si128( f32_bits, vector_int_f32_sign_mask ), f32_to_f16_sign_shift );
@ -77,6 +79,7 @@ ID_INLINE_EXTERN __m128i FastF32toF16( __m128i f32_bits )
return _mm_packs_epi32( flt16, flt16 );
}
#endif
ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 )
@ -117,7 +120,7 @@ ID_INLINE_EXTERN halfFloat_t Scalar_FastF32toF16( float f32 )
LoadSkinnedDrawVertPosition
====================
*/
#if defined(USE_INTRINSICS)
ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, const idJointMat* joints )
{
const idJointMat& j0 = joints[base.color[0]];
@ -178,7 +181,7 @@ ID_INLINE_EXTERN __m128 LoadSkinnedDrawVertPosition( const idDrawVert& base, con
return r0;
}
#endif
ID_INLINE_EXTERN idVec3 Scalar_LoadSkinnedDrawVertPosition( const idDrawVert& vert, const idJointMat* joints )
{

File diff suppressed because it is too large Load diff

View file

@ -44,8 +44,11 @@ const float LCP_DELTA_FORCE_EPSILON = 1e-9f;
#define IGNORE_UNSATISFIABLE_VARIABLES
#if defined(USE_INTRINSICS)
#define LCP_SIMD
#endif
#if defined(LCP_SIMD)
ALIGN16( const __m128 SIMD_SP_zero ) = { 0.0f, 0.0f, 0.0f, 0.0f };
ALIGN16( const __m128 SIMD_SP_one ) = { 1.0f, 1.0f, 1.0f, 1.0f };
ALIGN16( const __m128 SIMD_SP_two ) = { 2.0f, 2.0f, 2.0f, 2.0f };
@ -70,7 +73,7 @@ ALIGN16( const unsigned int SIMD_DW_one[4] ) = { 1, 1, 1, 1 };
ALIGN16( const unsigned int SIMD_DW_four[4] ) = { 4, 4, 4, 4 };
ALIGN16( const unsigned int SIMD_DW_index[4] ) = { 0, 1, 2, 3 };
ALIGN16( const int SIMD_DW_not3[4] ) = { ~3, ~3, ~3, ~3 };
#endif // #if defined(LCP_SIMD)
/*
========================
Multiply_SIMD
@ -91,6 +94,7 @@ static void Multiply_SIMD( float* dst, const float* src0, const float* src1, con
dst[i] = src0[i] * src1[i];
}
#if defined(LCP_SIMD)
for( ; i + 4 <= count; i += 4 )
{
@ -104,6 +108,21 @@ static void Multiply_SIMD( float* dst, const float* src0, const float* src1, con
_mm_store_ps( dst + i, s0 );
}
#else
for( ; i + 4 <= count; i += 4 )
{
assert_16_byte_aligned( &dst[i] );
assert_16_byte_aligned( &src0[i] );
assert_16_byte_aligned( &src1[i] );
dst[i + 0] = src0[i + 0] * src1[i + 0];
dst[i + 1] = src0[i + 1] * src1[i + 1];
dst[i + 2] = src0[i + 2] * src1[i + 2];
dst[i + 3] = src0[i + 3] * src1[i + 3];
}
#endif
for( ; i < count; i++ )
{
@ -124,6 +143,7 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src
{
int i = 0;
// RB: changed unsigned int to uintptr_t
for( ; ( ( uintptr_t )dst & 0xF ) != 0 && i < count; i++ )
// RB end
@ -131,6 +151,7 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src
dst[i] += constant * src[i];
}
#if defined(LCP_SIMD)
__m128 c = _mm_load1_ps( & constant );
for( ; i + 4 <= count; i += 4 )
@ -144,6 +165,20 @@ static void MultiplyAdd_SIMD( float* dst, const float constant, const float* src
_mm_store_ps( dst + i, s );
}
#else
for( ; i + 4 <= count; i += 4 )
{
assert_16_byte_aligned( &src[i] );
assert_16_byte_aligned( &dst[i] );
dst[i + 0] += constant * src[i + 0];
dst[i + 1] += constant * src[i + 1];
dst[i + 2] += constant * src[i + 2];
dst[i + 3] += constant * src[i + 3];
}
#endif
for( ; i < count; i++ )
{
@ -163,7 +198,7 @@ static float DotProduct_SIMD( const float* src0, const float* src1, const int co
assert_16_byte_aligned( src0 );
assert_16_byte_aligned( src1 );
#ifndef _lint
#if defined(LCP_SIMD)
__m128 sum = ( __m128& ) SIMD_SP_zero;
int i = 0;
@ -328,7 +363,7 @@ static void LowerTriangularSolve_SIMD( const idMatX& L, float* x, const float* b
int i = skip;
#ifndef _lint
#if defined(LCP_SIMD)
// work up to a multiple of 4 rows
for( ; ( i & 3 ) != 0 && i < n; i++ )
@ -601,7 +636,7 @@ static void LowerTriangularSolveTranspose_SIMD( const idMatX& L, float* x, const
const float* lptr = L.ToFloatPtr() + m * nc + m - 4;
float* xptr = x + m;
#ifndef _lint
#if defined(LCP_SIMD)
// process 4 rows at a time
for( int i = m; i >= 4; i -= 4 )
@ -982,7 +1017,7 @@ static bool LDLT_Factor_SIMD( idMatX& mat, idVecX& invDiag, const int n )
mptr[j * nc + 3] = ( mptr[j * nc + 3] - v[0] * mptr[j * nc + 0] - v[1] * mptr[j * nc + 1] - v[2] * mptr[j * nc + 2] ) * d;
}
#ifndef _lint
#if defined(LCP_SIMD)
__m128 vzero = _mm_setzero_ps();
for( int i = 4; i < n; i += 4 )
@ -1360,7 +1395,7 @@ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_
int d, float dir, float& maxStep, int& limit, int& limitSide )
{
#if defined(LCP_SIMD)
__m128 vMaxStep;
__m128i vLimit;
__m128i vLimitSide;
@ -1484,6 +1519,117 @@ static void GetMaxStep_SIMD( const float* f, const float* a, const float* delta_
_mm_store_ss( & maxStep, vMaxStep );
limit = _mm_cvtsi128_si32( vLimit );
limitSide = _mm_cvtsi128_si32( vLimitSide );
#else
int i;
float s;
// default to a full step for the current variable
if( idMath::Fabs( delta_a[d] ) > LCP_DELTA_ACCEL_EPSILON )
{
maxStep = -a[d] / delta_a[d];
}
else
{
maxStep = 0.0f;
}
limit = d;
limitSide = 0;
// test the current variable
if( dir < 0.0f )
{
if( lo[d] != -idMath::INFINITY )
{
s = ( lo[d] - f[d] ) / dir;
if( s < maxStep )
{
maxStep = s;
limitSide = -1;
}
}
}
else
{
if( hi[d] != idMath::INFINITY )
{
s = ( hi[d] - f[d] ) / dir;
if( s < maxStep )
{
maxStep = s;
limitSide = 1;
}
}
}
// test the clamped bounded variables
for( i = numUnbounded; i < numClamped; i++ )
{
if( delta_f[i] < -LCP_DELTA_FORCE_EPSILON )
{
// if there is a low boundary
if( lo[i] != -idMath::INFINITY )
{
s = ( lo[i] - f[i] ) / delta_f[i];
if( s < maxStep )
{
maxStep = s;
limit = i;
limitSide = -1;
}
}
}
else if( delta_f[i] > LCP_DELTA_FORCE_EPSILON )
{
// if there is a high boundary
if( hi[i] != idMath::INFINITY )
{
s = ( hi[i] - f[i] ) / delta_f[i];
if( s < maxStep )
{
maxStep = s;
limit = i;
limitSide = 1;
}
}
}
}
// test the not clamped bounded variables
for( i = numClamped; i < d; i++ )
{
if( side[i] == -1 )
{
if( delta_a[i] >= -LCP_DELTA_ACCEL_EPSILON )
{
continue;
}
}
else if( side[i] == 1 )
{
if( delta_a[i] <= LCP_DELTA_ACCEL_EPSILON )
{
continue;
}
}
else
{
continue;
}
// ignore variables for which the force is not allowed to take any substantial value
if( lo[i] >= -LCP_BOUND_EPSILON && hi[i] <= LCP_BOUND_EPSILON )
{
continue;
}
s = -a[i] / delta_a[i];
if( s < maxStep )
{
maxStep = s;
limit = i;
limitSide = 0;
}
}
#endif
}
/*

View file

@ -201,6 +201,7 @@ void idMatX::CopyLowerToUpperTriangle()
assert( ( GetNumColumns() & 3 ) == 0 );
assert( GetNumColumns() >= GetNumRows() );
#if defined(USE_INTRINSICS)
const int n = GetNumColumns();
const int m = GetNumRows();
@ -341,6 +342,22 @@ void idMatX::CopyLowerToUpperTriangle()
_mm_store_ps( basePtr + n0, r0 );
}
#else
const int n = GetNumColumns();
const int m = GetNumRows();
for( int i = 0; i < m; i++ )
{
const float* __restrict ptr = ToFloatPtr() + ( i + 1 ) * n + i;
float* __restrict dstPtr = ToFloatPtr() + i * n;
for( int j = i + 1; j < m; j++ )
{
dstPtr[j] = ptr[0];
ptr += n;
}
}
#endif
#ifdef _DEBUG
for( int i = 0; i < numRows; i++ )

View file

@ -46,7 +46,10 @@ NOTE: due to the temporary memory pool idMatX cannot be used by multiple threads
#define MATX_CLEAREND() int s = numRows * numColumns; while( s < ( ( s + 3 ) & ~3 ) ) { mat[s++] = 0.0f; }
#define MATX_ALLOCA( n ) ( (float *) _alloca16( MATX_QUAD( n ) ) )
#define MATX_ALLOCA_CACHE_LINES( n ) ( (float *) _alloca128( ( ( n ) * sizeof( float ) + CACHE_LINE_SIZE - 1 ) & ~ ( CACHE_LINE_SIZE - 1 ) ) )
#if defined(USE_INTRINSICS)
#define MATX_SIMD
#endif
class idMatX
{

View file

@ -51,6 +51,7 @@ const float idMath::INFINITY = 1e30f;
const float idMath::FLT_EPSILON = 1.192092896e-07f;
const float idMath::FLT_SMALLEST_NON_DENORMAL = * reinterpret_cast< const float* >( & SMALLEST_NON_DENORMAL ); // 1.1754944e-038f
#if defined(USE_INTRINSICS)
const __m128 idMath::SIMD_SP_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
const __m128 idMath::SIMD_SP_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
const __m128 idMath::SIMD_SP_min_char = { -128.0f, -128.0f, -128.0f, -128.0f };
@ -61,6 +62,7 @@ const __m128 idMath::SIMD_SP_smallestNonDenorm = { FLT_SMALLEST_NON_DENORMAL, FL
const __m128 idMath::SIMD_SP_tiny = { 1e-4f, 1e-4f, 1e-4f, 1e-4f };
const __m128 idMath::SIMD_SP_rsqrt_c0 = { 3.0f, 3.0f, 3.0f, 3.0f };
const __m128 idMath::SIMD_SP_rsqrt_c1 = { -0.5f, -0.5f, -0.5f, -0.5f };
#endif
bool idMath::initialized = false;
dword idMath::iSqrt[SQRT_TABLE_SIZE]; // inverse square root lookup table

View file

@ -469,6 +469,7 @@ public:
static const float FLT_EPSILON; // smallest positive number such that 1.0+FLT_EPSILON != 1.0
static const float FLT_SMALLEST_NON_DENORMAL; // smallest non-denormal 32-bit floating point value
#if defined(USE_INTRINSICS)
static const __m128 SIMD_SP_zero;
static const __m128 SIMD_SP_255;
static const __m128 SIMD_SP_min_char;
@ -479,6 +480,7 @@ public:
static const __m128 SIMD_SP_tiny;
static const __m128 SIMD_SP_rsqrt_c0;
static const __m128 SIMD_SP_rsqrt_c1;
#endif
private:
enum
@ -526,9 +528,7 @@ idMath::InvSqrt16
*/
ID_INLINE float idMath::InvSqrt16( float x )
{
	// Only values above the smallest non-denormal float are inverted; everything
	// else (zero, negatives, denormals, NaN) falls through to idMath::INFINITY.
	if( x > FLT_SMALLEST_NON_DENORMAL )
	{
		return sqrtf( 1.0f / x );
	}
	return INFINITY;
}
/*
@ -1321,8 +1321,21 @@ ID_INLINE int idMath::Ftoi( float f )
// If a converted result is larger than the maximum signed doubleword integer,
// the floating-point invalid exception is raised, and if this exception is masked,
// the indefinite integer value (80000000H) is returned.
#if defined(USE_INTRINSICS)
__m128 x = _mm_load_ss( &f );
return _mm_cvttss_si32( x );
#elif 0 // round chop (C/C++ standard)
int i, s, e, m, shift;
i = *reinterpret_cast<int*>( &f );
s = i >> IEEE_FLT_SIGN_BIT;
e = ( ( i >> IEEE_FLT_MANTISSA_BITS ) & ( ( 1 << IEEE_FLT_EXPONENT_BITS ) - 1 ) ) - IEEE_FLT_EXPONENT_BIAS;
m = ( i & ( ( 1 << IEEE_FLT_MANTISSA_BITS ) - 1 ) ) | ( 1 << IEEE_FLT_MANTISSA_BITS );
shift = e - IEEE_FLT_MANTISSA_BITS;
return ( ( ( ( m >> -shift ) | ( m << shift ) ) & ~( e >> INT32_SIGN_BIT ) ) ^ s ) - s;
#else
// If a converted result is larger than the maximum signed doubleword integer the result is undefined.
return C_FLOAT_TO_INT( f );
#endif
}
/*
@ -1332,10 +1345,24 @@ idMath::Ftoi8
*/
ID_INLINE char idMath::Ftoi8( float f )
{
	// Converts f to an integer and clamps the result to the range [-128,127].
#if defined(USE_INTRINSICS)
	__m128 x = _mm_load_ss( &f );
	x = _mm_max_ss( x, SIMD_SP_min_char );
	x = _mm_min_ss( x, SIMD_SP_max_char );
	return static_cast<char>( _mm_cvttss_si32( x ) );
#else
	// Clamp in the float domain *before* converting, mirroring the max/min sequence
	// of the SSE path. Converting a float whose truncated value does not fit in an
	// int is undefined behaviour, so the range check cannot be done on the int.
	// The negated comparison also sends NaN to the lower bound instead of letting it
	// reach the conversion (this should match what the max/min sequence above yields
	// for NaN -- see the Intel intrinsics reference).
	// In-range inputs produce the same value as before, assuming C_FLOAT_TO_INT
	// truncates toward zero.
	// NOTE(review): plain char is unsigned on some ARM ABIs; the return type is part
	// of the interface and is left untouched here.
	if( !( f > -128.0f ) )
	{
		return -128;
	}
	if( f >= 127.0f )
	{
		return 127;
	}
	return static_cast<char>( C_FLOAT_TO_INT( f ) );
#endif
}
/*
@ -1345,10 +1372,24 @@ idMath::Ftoi16
*/
ID_INLINE short idMath::Ftoi16( float f )
{
	// Converts f to an integer and clamps the result to the range [-32768,32767].
#if defined(USE_INTRINSICS)
	__m128 x = _mm_load_ss( &f );
	x = _mm_max_ss( x, SIMD_SP_min_short );
	x = _mm_min_ss( x, SIMD_SP_max_short );
	return static_cast<short>( _mm_cvttss_si32( x ) );
#else
	// Clamp in the float domain *before* converting, mirroring the max/min sequence
	// of the SSE path. Converting a float whose truncated value does not fit in an
	// int is undefined behaviour, so the range check cannot be done on the int.
	// The negated comparison also sends NaN to the lower bound instead of letting it
	// reach the conversion.
	// In-range inputs produce the same value as before, assuming C_FLOAT_TO_INT
	// truncates toward zero.
	if( !( f > -32768.0f ) )
	{
		return -32768;
	}
	if( f >= 32767.0f )
	{
		return 32767;
	}
	return static_cast<short>( C_FLOAT_TO_INT( f ) );
#endif
}
/*
@ -1382,10 +1423,25 @@ ID_INLINE byte idMath::Ftob( float f )
{
	// If a converted result is negative the value (0) is returned and if the
	// converted result is larger than the maximum byte the value (255) is returned.
#if defined(USE_INTRINSICS)
	__m128 x = _mm_load_ss( &f );
	x = _mm_max_ss( x, SIMD_SP_zero );
	x = _mm_min_ss( x, SIMD_SP_255 );
	return static_cast<byte>( _mm_cvttss_si32( x ) );
#else
	// Clamp in the float domain *before* converting, mirroring the max/min sequence
	// of the SSE path. Converting a float whose truncated value does not fit in an
	// int is undefined behaviour, so the range check cannot be done on the int.
	// The negated comparison also sends NaN to the lower bound instead of letting it
	// reach the conversion.
	// In-range inputs produce the same value as before, assuming C_FLOAT_TO_INT
	// truncates toward zero.
	if( !( f > 0.0f ) )
	{
		return 0;
	}
	if( f >= 255.0f )
	{
		return 255;
	}
	return static_cast<byte>( C_FLOAT_TO_INT( f ) );
#endif
}
/*

File diff suppressed because it is too large Load diff

View file

@ -38,6 +38,7 @@ If you have questions concerning this license or the applicable additional terms
// E
//===============================================================
#if defined(USE_INTRINSICS)
#include <xmmintrin.h>
@ -973,3 +974,5 @@ void VPCALL idSIMD_SSE::UntransformJoints( idJointMat* jointMats, const int* par
}
}
#endif // #if defined(USE_INTRINSICS)

View file

@ -3,6 +3,7 @@
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2013 Robert Beckebans
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
@ -37,6 +38,8 @@ If you have questions concerning this license or the applicable additional terms
===============================================================================
*/
#if defined(USE_INTRINSICS)
class idSIMD_SSE : public idSIMD_Generic
{
public:
@ -50,4 +53,6 @@ public:
virtual void VPCALL UntransformJoints( idJointMat* jointMats, const int* parents, const int firstJoint, const int lastJoint );
};
#endif
#endif /* !__MATH_SIMD_SSE_H__ */

View file

@ -45,7 +45,10 @@ NOTE: due to the temporary memory pool idVecX cannot be used by multiple threads
#define VECX_QUAD( x ) ( ( ( ( x ) + 3 ) & ~3 ) * sizeof( float ) )
#define VECX_CLEAREND() int s = size; while( s < ( ( s + 3) & ~3 ) ) { p[s++] = 0.0f; }
#define VECX_ALLOCA( n ) ( (float *) _alloca16( VECX_QUAD( n ) ) )
#if defined(USE_INTRINSICS)
#define VECX_SIMD
#endif
class idVecX
{

View file

@ -516,6 +516,11 @@ ID_INLINE idVec3 operator*( const float a, const idVec3 b )
return idVec3( b.x * a, b.y * a, b.z * a );
}
ID_INLINE idVec3 operator/( const float a, const idVec3 b )
{
	// component-wise quotient: the scalar is divided by each element of the vector
	const float qx = a / b.x;
	const float qy = a / b.y;
	const float qz = a / b.z;
	return idVec3( qx, qy, qz );
}
ID_INLINE idVec3 idVec3::operator+( const idVec3& a ) const
{
return idVec3( x + a.x, y + a.y, z + a.z );

View file

@ -3,6 +3,7 @@
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2013 Robert Beckebans
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
@ -28,8 +29,11 @@ If you have questions concerning this license or the applicable additional terms
#ifndef __SYS_INTRIINSICS_H__
#define __SYS_INTRIINSICS_H__
#include <emmintrin.h>
// Enable the SSE2 intrinsics code paths only when the compiler targets x86 / x86-64.
// On every other architecture USE_INTRINSICS stays undefined, so <emmintrin.h> is not
// pulled in and the generic C++ fallbacks guarded by "#if defined(USE_INTRINSICS) ... #else"
// are compiled instead. A definition supplied by the build system is left as is.
#if !defined(USE_INTRINSICS) && ( defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) )
#define USE_INTRINSICS
#endif
#if defined(USE_INTRINSICS)
#include <emmintrin.h>
#endif
/*
================================================================================================
@ -91,6 +95,7 @@ ID_INLINE_EXTERN float __frndz( float x )
================================================================================================
*/
#if defined(USE_INTRINSICS)
// The code below assumes that a cache line is 64 bytes.
// We specify the cache line size as 128 here to make the code consistent with the consoles.
#define CACHE_LINE_SIZE 128
@ -122,6 +127,26 @@ ID_FORCE_INLINE void FlushCacheLine( const void* ptr, int offset )
_mm_clflush( bytePtr + 64 );
}
/*
================================================
#endif
Other
================================================
*/
#else
#define CACHE_LINE_SIZE 128
// Generic fallback: does nothing, both arguments are ignored.
ID_INLINE void Prefetch( const void* ptr, int offset )
{
}
ID_INLINE void ZeroCacheLine( void* ptr, int offset )
{
	// round ( ptr + offset ) down to a CACHE_LINE_SIZE boundary and clear that whole line
	const uintptr_t address = ( uintptr_t )( ptr ) + ( offset );
	byte* lineStart = ( byte* )( address & ~( CACHE_LINE_SIZE - 1 ) );
	memset( lineStart, 0, CACHE_LINE_SIZE );
}
// Generic fallback: does nothing, both arguments are ignored.
ID_INLINE void FlushCacheLine( const void* ptr, int offset )
{
}
#endif
/*
================================================
Block Clear Macros
@ -168,6 +193,8 @@ ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size )
================================================================================================
*/
#if defined(USE_INTRINSICS)
/*
================================================
PC Windows
@ -194,6 +221,7 @@ ID_INLINE_EXTERN int CACHE_LINE_CLEAR_OVERFLOW_COUNT( int size )
#endif
// DG end
// make the intrinsics "type unsafe"
typedef union DECLSPEC_INTRINTYPE _CRT_ALIGN( 16 ) __m128c
{
@ -275,4 +303,6 @@ ID_FORCE_INLINE_EXTERN __m128 _mm_div16_ps( __m128 x, __m128 y )
// load idBounds::GetMaxs()
#define _mm_loadu_bounds_1( bounds ) _mm_perm_ps( _mm_loadh_pi( _mm_load_ss( & bounds[1].x ), (__m64 *) & bounds[1].y ), _MM_SHUFFLE( 1, 3, 2, 0 ) )
#endif // #if defined(USE_INTRINSICS)
#endif // !__SYS_INTRIINSICS_H__

View file

@ -79,6 +79,7 @@ void UnbindBufferObjects()
qglBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
}
#if defined(USE_INTRINSICS)
void CopyBuffer( byte* dst, const byte* src, int numBytes )
{
@ -121,6 +122,16 @@ void CopyBuffer( byte* dst, const byte* src, int numBytes )
_mm_sfence();
}
#else
void CopyBuffer( byte* dst, const byte* src, int numBytes )
{
	// Generic path: a plain memcpy of numBytes bytes from src to dst.
	// Both pointers are still asserted to be 16-byte aligned.
	assert_16_byte_aligned( dst );
	assert_16_byte_aligned( src );

	memcpy( dst, src, static_cast<size_t>( numBytes ) );
}
#endif
/*
================================================================================================

View file

@ -3,6 +3,7 @@
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2013 Robert Beckebans
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
@ -305,7 +306,11 @@ idDxtEncoder::CompressImageDXT1Fast
*/
ID_INLINE void idDxtEncoder::CompressImageDXT1Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: generic C++ encoder unless the intrinsics path is enabled
#if !defined(USE_INTRINSICS)
	CompressImageDXT1Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT1Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
/*
@ -315,7 +320,11 @@ idDxtEncoder::CompressImageDXT1AlphaFast
*/
ID_INLINE void idDxtEncoder::CompressImageDXT1AlphaFast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: generic C++ encoder unless the intrinsics path is enabled
#if !defined(USE_INTRINSICS)
	CompressImageDXT1AlphaFast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT1AlphaFast_SSE2( inBuf, outBuf, width, height );
#endif
}
/*
@ -325,7 +334,11 @@ idDxtEncoder::CompressImageDXT5Fast
*/
ID_INLINE void idDxtEncoder::CompressImageDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: generic C++ encoder unless the intrinsics path is enabled
#if !defined(USE_INTRINSICS)
	CompressImageDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressImageDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
/*
@ -345,7 +358,11 @@ idDxtEncoder::CompressYCoCgDXT5Fast
*/
ID_INLINE void idDxtEncoder::CompressYCoCgDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: generic C++ encoder unless the intrinsics path is enabled
#if !defined(USE_INTRINSICS)
	CompressYCoCgDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressYCoCgDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
/*
@ -365,7 +382,11 @@ idDxtEncoder::CompressNormalMapDXT5Fast
*/
ID_INLINE void idDxtEncoder::CompressNormalMapDXT5Fast( const byte* inBuf, byte* outBuf, int width, int height )
{
	// compile-time dispatch: generic C++ encoder unless the intrinsics path is enabled
#if !defined(USE_INTRINSICS)
	CompressNormalMapDXT5Fast_Generic( inBuf, outBuf, width, height );
#else
	CompressNormalMapDXT5Fast_SSE2( inBuf, outBuf, width, height );
#endif
}
/*

View file

@ -34,6 +34,7 @@ Contains the DxtEncoder implementation for SSE2.
#include "DXTCodec_local.h"
#include "DXTCodec.h"
#if defined(USE_INTRINSICS)
//#define TEST_COMPRESSION
#ifdef TEST_COMPRESSION
@ -1627,3 +1628,4 @@ void idDxtEncoder::CompressNormalMapDXT5Fast_SSE2( const byte* inBuf, byte* outB
#endif
}
#endif // #if defined(USE_INTRINSICS)

View file

@ -74,7 +74,7 @@ R_MatrixMultiply
*/
void R_MatrixMultiply( const float a[16], const float b[16], float out[16] )
{
#if defined(USE_INTRINSICS)
__m128 a0 = _mm_loadu_ps( a + 0 * 4 );
__m128 a1 = _mm_loadu_ps( a + 1 * 4 );
__m128 a2 = _mm_loadu_ps( a + 2 * 4 );
@ -110,6 +110,41 @@ void R_MatrixMultiply( const float a[16], const float b[16], float out[16] )
_mm_storeu_ps( out + 2 * 4, t2 );
_mm_storeu_ps( out + 3 * 4, t3 );
#else
/*
for ( int i = 0; i < 4; i++ ) {
for ( int j = 0; j < 4; j++ ) {
out[ i * 4 + j ] =
a[ i * 4 + 0 ] * b[ 0 * 4 + j ] +
a[ i * 4 + 1 ] * b[ 1 * 4 + j ] +
a[ i * 4 + 2 ] * b[ 2 * 4 + j ] +
a[ i * 4 + 3 ] * b[ 3 * 4 + j ];
}
}
*/
out[0 * 4 + 0] = a[0 * 4 + 0] * b[0 * 4 + 0] + a[0 * 4 + 1] * b[1 * 4 + 0] + a[0 * 4 + 2] * b[2 * 4 + 0] + a[0 * 4 + 3] * b[3 * 4 + 0];
out[0 * 4 + 1] = a[0 * 4 + 0] * b[0 * 4 + 1] + a[0 * 4 + 1] * b[1 * 4 + 1] + a[0 * 4 + 2] * b[2 * 4 + 1] + a[0 * 4 + 3] * b[3 * 4 + 1];
out[0 * 4 + 2] = a[0 * 4 + 0] * b[0 * 4 + 2] + a[0 * 4 + 1] * b[1 * 4 + 2] + a[0 * 4 + 2] * b[2 * 4 + 2] + a[0 * 4 + 3] * b[3 * 4 + 2];
out[0 * 4 + 3] = a[0 * 4 + 0] * b[0 * 4 + 3] + a[0 * 4 + 1] * b[1 * 4 + 3] + a[0 * 4 + 2] * b[2 * 4 + 3] + a[0 * 4 + 3] * b[3 * 4 + 3];
out[1 * 4 + 0] = a[1 * 4 + 0] * b[0 * 4 + 0] + a[1 * 4 + 1] * b[1 * 4 + 0] + a[1 * 4 + 2] * b[2 * 4 + 0] + a[1 * 4 + 3] * b[3 * 4 + 0];
out[1 * 4 + 1] = a[1 * 4 + 0] * b[0 * 4 + 1] + a[1 * 4 + 1] * b[1 * 4 + 1] + a[1 * 4 + 2] * b[2 * 4 + 1] + a[1 * 4 + 3] * b[3 * 4 + 1];
out[1 * 4 + 2] = a[1 * 4 + 0] * b[0 * 4 + 2] + a[1 * 4 + 1] * b[1 * 4 + 2] + a[1 * 4 + 2] * b[2 * 4 + 2] + a[1 * 4 + 3] * b[3 * 4 + 2];
out[1 * 4 + 3] = a[1 * 4 + 0] * b[0 * 4 + 3] + a[1 * 4 + 1] * b[1 * 4 + 3] + a[1 * 4 + 2] * b[2 * 4 + 3] + a[1 * 4 + 3] * b[3 * 4 + 3];
out[2 * 4 + 0] = a[2 * 4 + 0] * b[0 * 4 + 0] + a[2 * 4 + 1] * b[1 * 4 + 0] + a[2 * 4 + 2] * b[2 * 4 + 0] + a[2 * 4 + 3] * b[3 * 4 + 0];
out[2 * 4 + 1] = a[2 * 4 + 0] * b[0 * 4 + 1] + a[2 * 4 + 1] * b[1 * 4 + 1] + a[2 * 4 + 2] * b[2 * 4 + 1] + a[2 * 4 + 3] * b[3 * 4 + 1];
out[2 * 4 + 2] = a[2 * 4 + 0] * b[0 * 4 + 2] + a[2 * 4 + 1] * b[1 * 4 + 2] + a[2 * 4 + 2] * b[2 * 4 + 2] + a[2 * 4 + 3] * b[3 * 4 + 2];
out[2 * 4 + 3] = a[2 * 4 + 0] * b[0 * 4 + 3] + a[2 * 4 + 1] * b[1 * 4 + 3] + a[2 * 4 + 2] * b[2 * 4 + 3] + a[2 * 4 + 3] * b[3 * 4 + 3];
out[3 * 4 + 0] = a[3 * 4 + 0] * b[0 * 4 + 0] + a[3 * 4 + 1] * b[1 * 4 + 0] + a[3 * 4 + 2] * b[2 * 4 + 0] + a[3 * 4 + 3] * b[3 * 4 + 0];
out[3 * 4 + 1] = a[3 * 4 + 0] * b[0 * 4 + 1] + a[3 * 4 + 1] * b[1 * 4 + 1] + a[3 * 4 + 2] * b[2 * 4 + 1] + a[3 * 4 + 3] * b[3 * 4 + 1];
out[3 * 4 + 2] = a[3 * 4 + 0] * b[0 * 4 + 2] + a[3 * 4 + 1] * b[1 * 4 + 2] + a[3 * 4 + 2] * b[2 * 4 + 2] + a[3 * 4 + 3] * b[3 * 4 + 2];
out[3 * 4 + 3] = a[3 * 4 + 0] * b[0 * 4 + 3] + a[3 * 4 + 1] * b[1 * 4 + 3] + a[3 * 4 + 2] * b[2 * 4 + 3] + a[3 * 4 + 3] * b[3 * 4 + 3];
#endif
}
/*

View file

@ -302,10 +302,10 @@ static void R_DecalPointCullStatic( byte* cullBits, const idPlane* planes, const
assert_16_byte_aligned( cullBits );
assert_16_byte_aligned( verts );
#if defined(USE_INTRINSICS)
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
const __m128 vector_float_zero = _mm_setzero_ps();
const __m128i vector_int_mask0 = _mm_set1_epi32( 1 << 0 );
const __m128i vector_int_mask1 = _mm_set1_epi32( 1 << 1 );
const __m128i vector_int_mask2 = _mm_set1_epi32( 1 << 2 );
@ -406,6 +406,39 @@ static void R_DecalPointCullStatic( byte* cullBits, const idPlane* planes, const
}
}
#else
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
for( int i = 0; i < numVerts; )
{
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
for( ; i <= nextNumVerts; i++ )
{
const idVec3& v = vertsODS[i].xyz;
const float d0 = planes[0].Distance( v );
const float d1 = planes[1].Distance( v );
const float d2 = planes[2].Distance( v );
const float d3 = planes[3].Distance( v );
const float d4 = planes[4].Distance( v );
const float d5 = planes[5].Distance( v );
byte bits;
bits = IEEE_FLT_SIGNBITNOTSET( d0 ) << 0;
bits |= IEEE_FLT_SIGNBITNOTSET( d1 ) << 1;
bits |= IEEE_FLT_SIGNBITNOTSET( d2 ) << 2;
bits |= IEEE_FLT_SIGNBITNOTSET( d3 ) << 3;
bits |= IEEE_FLT_SIGNBITNOTSET( d4 ) << 4;
bits |= IEEE_FLT_SIGNBITNOTSET( d5 ) << 5;
cullBits[i] = bits;
}
}
#endif
}
/*
@ -637,6 +670,7 @@ static void R_CopyDecalSurface( idDrawVert* verts, int numVerts, triIndex_t* ind
assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
assert_16_byte_aligned( fadeColor );
#if defined(USE_INTRINSICS)
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
@ -678,6 +712,28 @@ static void R_CopyDecalSurface( idDrawVert* verts, int numVerts, triIndex_t* ind
_mm_sfence();
#else
// copy vertices and apply depth/time based fading
for( int i = 0; i < decal->numVerts; i++ )
{
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
verts[numVerts + i] = decal->verts[i];
for( int j = 0; j < 4; j++ )
{
verts[numVerts + i].color[j] = idMath::Ftob( fadeColor[j] * decal->vertDepthFade[i] );
}
}
// copy indices
assert( ( decal->numIndexes & 1 ) == 0 );
for( int i = 0; i < decal->numIndexes; i += 2 )
{
assert( decal->indexes[i + 0] < decal->numVerts && decal->indexes[i + 1] < decal->numVerts );
WriteIndexPair( &indexes[numIndexes + i], numVerts + decal->indexes[i + 0], numVerts + decal->indexes[i + 1] );
}
#endif
}
/*

View file

@ -3,6 +3,7 @@
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2013 Robert Beckebans
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
@ -111,7 +112,7 @@ static void R_OverlayPointCullStatic( byte* cullBits, halfFloat_t* texCoordS, ha
assert_16_byte_aligned( texCoordT );
assert_16_byte_aligned( verts );
#if defined(USE_INTRINSICS)
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
@ -187,6 +188,41 @@ static void R_OverlayPointCullStatic( byte* cullBits, halfFloat_t* texCoordS, ha
}
}
#else
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
for( int i = 0; i < numVerts; )
{
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
for( ; i <= nextNumVerts; i++ )
{
const idVec3& v = vertsODS[i].xyz;
const float d0 = planes[0].Distance( v );
const float d1 = planes[1].Distance( v );
const float d2 = 1.0f - d0;
const float d3 = 1.0f - d1;
halfFloat_t s = Scalar_FastF32toF16( d0 );
halfFloat_t t = Scalar_FastF32toF16( d1 );
texCoordS[i] = s;
texCoordT[i] = t;
byte bits;
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
cullBits[i] = bits;
}
}
#endif
}
/*
@ -201,7 +237,7 @@ static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, h
assert_16_byte_aligned( texCoordT );
assert_16_byte_aligned( verts );
#if defined(USE_INTRINSICS)
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
const __m128 vector_float_zero = { 0.0f, 0.0f, 0.0f, 0.0f };
@ -277,6 +313,41 @@ static void R_OverlayPointCullSkinned( byte* cullBits, halfFloat_t* texCoordS, h
}
}
#else
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
for( int i = 0; i < numVerts; )
{
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
for( ; i <= nextNumVerts; i++ )
{
const idVec3 transformed = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
const float d0 = planes[0].Distance( transformed );
const float d1 = planes[1].Distance( transformed );
const float d2 = 1.0f - d0;
const float d3 = 1.0f - d1;
halfFloat_t s = Scalar_FastF32toF16( d0 );
halfFloat_t t = Scalar_FastF32toF16( d1 );
texCoordS[i] = s;
texCoordT[i] = t;
byte bits;
bits = IEEE_FLT_SIGNBITSET( d0 ) << 0;
bits |= IEEE_FLT_SIGNBITSET( d1 ) << 1;
bits |= IEEE_FLT_SIGNBITSET( d2 ) << 2;
bits |= IEEE_FLT_SIGNBITSET( d3 ) << 3;
cullBits[i] = bits;
}
}
#endif
}
/*
@ -486,6 +557,7 @@ static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* i
assert( ( ( overlay->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
assert( ( ( overlay->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
#if defined(USE_INTRINSICS)
const __m128i vector_int_clear_last = _mm_set_epi32( 0, -1, -1, -1 );
const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
@ -524,6 +596,30 @@ static void R_CopyOverlaySurface( idDrawVert* verts, int numVerts, triIndex_t* i
_mm_sfence();
#else
// copy vertices
for( int i = 0; i < overlay->numVerts; i++ )
{
const overlayVertex_t& overlayVert = overlay->verts[i];
// NOTE: bad out-of-order write-combined write, SIMD code does the right thing
verts[numVerts + i] = sourceVerts[overlayVert.vertexNum];
// RB begin
verts[numVerts + i].SetTexCoordS( overlayVert.st[0] );
verts[numVerts + i].SetTexCoordT( overlayVert.st[1] );
// RB end
}
// copy indexes
for( int i = 0; i < overlay->numIndexes; i += 2 )
{
assert( overlay->indexes[i + 0] < overlay->numVerts && overlay->indexes[i + 1] < overlay->numVerts );
WriteIndexPair( &indexes[numIndexes + i], numVerts + overlay->indexes[i + 0], numVerts + overlay->indexes[i + 1] );
}
#endif
}
/*

View file

@ -32,10 +32,10 @@ If you have questions concerning this license or the applicable additional terms
#include "tr_local.h"
#include "Model_local.h"
#if defined(USE_INTRINSICS)
static const __m128 vector_float_posInfinity = { idMath::INFINITY, idMath::INFINITY, idMath::INFINITY, idMath::INFINITY };
static const __m128 vector_float_negInfinity = { -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY, -idMath::INFINITY };
#endif
static const char* MD5_SnapshotName = "_MD5_Snapshot_";
@ -561,6 +561,7 @@ idMD5Mesh::CalculateBounds
*/
void idMD5Mesh::CalculateBounds( const idJointMat* entJoints, idBounds& bounds ) const
{
#if defined(USE_INTRINSICS)
__m128 minX = vector_float_posInfinity;
__m128 minY = vector_float_posInfinity;
@ -595,6 +596,17 @@ void idMD5Mesh::CalculateBounds( const idJointMat* entJoints, idBounds& bounds )
_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );
#else
bounds.Clear();
for( int i = 0; i < numMeshJoints; i++ )
{
const idJointMat& joint = entJoints[meshJoints[i]];
bounds.AddPoint( joint.GetTranslation() );
}
bounds.ExpandSelf( maxJointVertDist );
#endif
}
/*
@ -1220,6 +1232,7 @@ static void TransformJoints( idJointMat* __restrict outJoints, const int numJoin
assert_16_byte_aligned( inFloats1 );
assert_16_byte_aligned( inFloats2 );
#if defined(USE_INTRINSICS)
const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
@ -1296,6 +1309,14 @@ static void TransformJoints( idJointMat* __restrict outJoints, const int numJoin
_mm_store_ps( outFloats + 1 * 12 + 8, ri1 );
}
#else
for( int i = 0; i < numJoints; i++ )
{
idJointMat::Multiply( outJoints[i], inJoints1[i], inJoints2[i] );
}
#endif
}
/*

View file

@ -92,7 +92,7 @@ static void R_ShadowVolumeCullBits( byte* cullBits, byte& totalOr, const float r
assert_16_byte_aligned( cullBits );
assert_16_byte_aligned( verts );
#if defined(USE_INTRINSICS)
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 );
@ -215,6 +215,56 @@ static void R_ShadowVolumeCullBits( byte* cullBits, byte& totalOr, const float r
totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte );
#else
idODSStreamedArray< idShadowVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
byte tOr = 0;
for( int i = 0; i < numVerts; )
{
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
for( ; i <= nextNumVerts; i++ )
{
const idVec3& v = vertsODS[i].xyzw.ToVec3();
const float d0 = planes[0].Distance( v );
const float d1 = planes[1].Distance( v );
const float d2 = planes[2].Distance( v );
const float d3 = planes[3].Distance( v );
const float t0 = d0 + radius;
const float t1 = d1 + radius;
const float t2 = d2 + radius;
const float t3 = d3 + radius;
const float s0 = d0 - radius;
const float s1 = d1 - radius;
const float s2 = d2 - radius;
const float s3 = d3 - radius;
byte bits;
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
}
totalOr = tOr;
#endif
}
/*

View file

@ -31,14 +31,16 @@ If you have questions concerning this license or the applicable additional terms
#include "../../../idlib/sys/sys_intrinsics.h"
#include "../../../idlib/geometry/DrawVert_intrinsics.h"
#if defined(USE_INTRINSICS)
static const __m128i vector_int_neg_one = _mm_set_epi32( -1, -1, -1, -1 );
#endif
/*
=====================
TriangleFacing_SSE2
=====================
*/
#if defined(USE_INTRINSICS)
static ID_FORCE_INLINE __m128i TriangleFacing_SSE2( const __m128& vert0X, const __m128& vert0Y, const __m128& vert0Z,
const __m128& vert1X, const __m128& vert1Y, const __m128& vert1Z,
const __m128& vert2X, const __m128& vert2Y, const __m128& vert2Z,
@ -60,6 +62,7 @@ static ID_FORCE_INLINE __m128i TriangleFacing_SSE2( const __m128& vert0X, const
const __m128 delta = _mm_nmsub_ps( lightOriginX, normalX, _mm_nmsub_ps( lightOriginY, normalY, _mm_nmsub_ps( lightOriginZ, normalZ, normalW ) ) );
return _mm_castps_si128( _mm_cmplt_ps( delta, _mm_setzero_ps() ) );
}
#endif
/*
=====================
@ -68,6 +71,7 @@ TriangleCulled
The clip space of the 'lightProject' is assumed to be in the range [0, 1].
=====================
*/
#if defined(USE_INTRINSICS)
static ID_FORCE_INLINE __m128i TriangleCulled_SSE2( const __m128& vert0X, const __m128& vert0Y, const __m128& vert0Z,
const __m128& vert1X, const __m128& vert1Y, const __m128& vert1Z,
const __m128& vert2X, const __m128& vert2Y, const __m128& vert2Z,
@ -128,6 +132,92 @@ static ID_FORCE_INLINE __m128i TriangleCulled_SSE2( const __m128& vert0X, const
return _mm_castps_si128( _mm_cmpeq_ps( b0, zero ) );
}
#else
/*
=====================
TriangleFacing_Generic

Scalar fallback used when USE_INTRINSICS is not defined.

Builds the plane through the triangle ( v1, v2, v3 ) with the normal
( v3 - v1 ) x ( v2 - v1 ) and evaluates the signed distance of
'lightOrigin' to that plane.

Returns 255 if the distance is strictly positive ( the triangle is facing
the light origin ), otherwise returns 0.
=====================
*/
static byte TriangleFacing_Generic( const idVec3& v1, const idVec3& v2, const idVec3& v3, const idVec3& lightOrigin )
{
	// first edge: v1 -> v2
	const float edgeAX = v2.x - v1.x;
	const float edgeAY = v2.y - v1.y;
	const float edgeAZ = v2.z - v1.z;
	
	// second edge: v1 -> v3
	const float edgeBX = v3.x - v1.x;
	const float edgeBY = v3.y - v1.y;
	const float edgeBZ = v3.z - v1.z;
	
	// unnormalized plane normal = edgeB x edgeA
	const float nx = edgeBY * edgeAZ - edgeBZ * edgeAY;
	const float ny = edgeBZ * edgeAX - edgeBX * edgeAZ;
	const float nz = edgeBX * edgeAY - edgeBY * edgeAX;
	
	// plane offset, chosen so that v1 lies on the plane
	const float planeOffset = nx * v1.x + ny * v1.y + nz * v1.z;
	
	// signed ( unnormalized ) distance of the light origin to the triangle plane
	const float lightSide = lightOrigin.x * nx + lightOrigin.y * ny + lightOrigin.z * nz - planeOffset;
	
	if( lightSide > 0.0f )
	{
		return 255;
	}
	return 0;
}
/*
=====================
TriangleCulled_Generic

Scalar fallback used when USE_INTRINSICS is not defined.

Transforms the three triangle corners with 'lightProject' ( the input w is
taken to be 1 ) and tests every corner against the six sides of the clip
volume: 0 < x < w, 0 < y < w and 0 < z < w.

Returns 255 if the triangle is culled to the light projection matrix,
i.e. all three corners fail the test for at least one common side,
otherwise returns 0.
The clip space of the 'lightProject' is assumed to be in the range [0, 1].
=====================
*/
static byte TriangleCulled_Generic( const idVec3& v1, const idVec3& v2, const idVec3& v3, const idRenderMatrix& lightProject )
{
	const idVec3* const corners[3] = { &v1, &v2, &v3 };
	
	// transform the triangle, one matrix row ( = one clip space component ) at a time
	idVec4 clip[3];
	for( int row = 0; row < 4; row++ )
	{
		for( int k = 0; k < 3; k++ )
		{
			const idVec3& p = *corners[k];
			clip[k][row] = p[0] * lightProject[row][0] + p[1] * lightProject[row][1] + p[2] * lightProject[row][2] + lightProject[row][3];
		}
	}
	
	// accumulate one bit per clip volume side that has at least one corner on its inner side;
	// bit ( 2 * axis ) is the lower side and bit ( 2 * axis + 1 ) the upper side of that axis
	int insideMask = 0;
	for( int k = 0; k < 3; k++ )
	{
		const float lowerBound = 0.0f;
		const float upperBound = clip[k][3];
		
		for( int axis = 0; axis < 3; axis++ )
		{
			const float value = clip[k][axis];
			
			if( value > lowerBound )
			{
				insideMask |= ( 1 << ( axis * 2 + 0 ) );
			}
			if( value < upperBound )
			{
				insideMask |= ( 1 << ( axis * 2 + 1 ) );
			}
		}
	}
	
	// all six bits set means no single side rejects the whole triangle
	const int allSidesMask = 63;
	if( insideMask == allSidesMask )
	{
		return 0;
	}
	
	// if any bits weren't set, the triangle is completely off one side of the frustum
	return 255;
}
#endif
/*
=====================
@ -159,6 +249,7 @@ static int CalculateTriangleFacingCulledStatic( byte* __restrict facing, byte* _
const idVec3 lineDir = lineDelta * lineLengthRcp;
const float lineLength = lineLengthSqr * lineLengthRcp;
#if defined(USE_INTRINSICS)
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 4* 3 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
@ -271,6 +362,61 @@ static int CalculateTriangleFacingCulledStatic( byte* __restrict facing, byte* _
return _mm_cvtsi128_si32( numFrontFacing );
#else
idODSStreamedIndexedArray< idDrawVert, triIndex_t, 32, SBT_QUAD, 1 > indexedVertsODS( verts, numVerts, indexes, numIndexes );
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
int numFrontFacing = 0;
for( int i = 0, j = 0; i < numIndexes; )
{
const int batchStart = i;
const int batchEnd = indexedVertsODS.FetchNextBatch();
const int indexStart = j;
for( ; i <= batchEnd - 3; i += 3, j++ )
{
const idVec3& v1 = indexedVertsODS[i + 0].xyz;
const idVec3& v2 = indexedVertsODS[i + 1].xyz;
const idVec3& v3 = indexedVertsODS[i + 2].xyz;
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
culled[j] = triangleCulled;
facing[j] = triangleFacing;
// count the number of facing triangles
numFrontFacing += ( triangleFacing & 1 );
}
if( insideShadowVolume != NULL )
{
for( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ )
{
if( !facing[n] )
{
if( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, indexedVertsODS[k + 2].xyz, indexedVertsODS[k + 1].xyz, indexedVertsODS[k + 0].xyz ) )
{
*insideShadowVolume = true;
insideShadowVolume = NULL;
break;
}
}
}
}
}
return numFrontFacing;
#endif
}
/*
@ -303,6 +449,7 @@ static int CalculateTriangleFacingCulledSkinned( byte* __restrict facing, byte*
const idVec3 lineDir = lineDelta * lineLengthRcp;
const float lineLength = lineLengthSqr * lineLengthRcp;
#if defined(USE_INTRINSICS)
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
@ -448,6 +595,82 @@ static int CalculateTriangleFacingCulledSkinned( byte* __restrict facing, byte*
return _mm_cvtsi128_si32( numFrontFacing );
#else
idODSStreamedArray< idDrawVert, 32, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
for( int i = 0; i < numVerts; )
{
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
for( ; i <= nextNumVerts; i++ )
{
tempVerts[i].ToVec3() = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
tempVerts[i].w = 1.0f;
}
}
idODSStreamedArray< triIndex_t, 256, SBT_QUAD, 1 > indexesODS( indexes, numIndexes );
const byte cullShadowTrianglesToLightMask = cullShadowTrianglesToLight ? 255 : 0;
int numFrontFacing = 0;
for( int i = 0, j = 0; i < numIndexes; )
{
const int batchStart = i;
const int batchEnd = indexesODS.FetchNextBatch();
const int indexStart = j;
for( ; i <= batchEnd - 3; i += 3, j++ )
{
const int i0 = indexesODS[i + 0];
const int i1 = indexesODS[i + 1];
const int i2 = indexesODS[i + 2];
const idVec3& v1 = tempVerts[i0].ToVec3();
const idVec3& v2 = tempVerts[i1].ToVec3();
const idVec3& v3 = tempVerts[i2].ToVec3();
const byte triangleCulled = TriangleCulled_Generic( v1, v2, v3, lightProject );
byte triangleFacing = TriangleFacing_Generic( v1, v2, v3, lightOrigin );
// optionally make triangles that are outside the light frustum facing so they do not contribute to the shadow volume
triangleFacing |= ( triangleCulled & cullShadowTrianglesToLightMask );
culled[j] = triangleCulled;
facing[j] = triangleFacing;
// count the number of facing triangles
numFrontFacing += ( triangleFacing & 1 );
}
if( insideShadowVolume != NULL )
{
for( int k = batchStart, n = indexStart; k <= batchEnd - 3; k += 3, n++ )
{
if( !facing[n] )
{
const int i0 = indexesODS[k + 0];
const int i1 = indexesODS[k + 1];
const int i2 = indexesODS[k + 2];
if( R_LineIntersectsTriangleExpandedWithSphere( lineStart, lineEnd, lineDir, lineLength, radius, tempVerts[i2].ToVec3(), tempVerts[i1].ToVec3(), tempVerts[i0].ToVec3() ) )
{
*insideShadowVolume = true;
insideShadowVolume = NULL;
break;
}
}
}
}
}
return numFrontFacing;
#endif
}
/*
@ -461,6 +684,7 @@ static void StreamOut( void* dst, const void* src, int numBytes )
assert_16_byte_aligned( dst );
assert_16_byte_aligned( src );
#if defined(USE_INTRINSICS)
int i = 0;
for( ; i + 128 <= numBytes; i += 128 )
{
@ -486,6 +710,9 @@ static void StreamOut( void* dst, const void* src, int numBytes )
__m128i d = _mm_load_si128( ( __m128i* )( ( byte* )src + i ) );
_mm_stream_si128( ( __m128i* )( ( byte* )dst + i ), d );
}
#else
memcpy( dst, src, numBytes );
#endif
}
/*
@ -706,7 +933,9 @@ static void R_CreateShadowVolumeTriangles( triIndex_t* __restrict shadowIndices,
numShadowIndexesTotal = numShadowIndices;
#if defined(USE_INTRINSICS)
_mm_sfence();
#endif
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination
@ -893,7 +1122,9 @@ void R_CreateLightTriangles( triIndex_t* __restrict lightIndices, triIndex_t* __
numLightIndicesTotal = numLightIndices;
#if defined(USE_INTRINSICS)
_mm_sfence();
#endif
#else // NOTE: this code will not work on the SPU because it tries to write directly to the destination

View file

@ -3,6 +3,7 @@
Doom 3 BFG Edition GPL Source Code
Copyright (C) 1993-2012 id Software LLC, a ZeniMax Media company.
Copyright (C) 2013 Robert Beckebans
This file is part of the Doom 3 BFG Edition GPL Source Code ("Doom 3 BFG Edition Source Code").
@ -44,7 +45,7 @@ static void R_TracePointCullStatic( byte* cullBits, byte& totalOr, const float r
assert_16_byte_aligned( cullBits );
assert_16_byte_aligned( verts );
#if defined(USE_INTRINSICS)
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 );
@ -167,6 +168,56 @@ static void R_TracePointCullStatic( byte* cullBits, byte& totalOr, const float r
totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte );
#else
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
byte tOr = 0;
for( int i = 0; i < numVerts; )
{
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
for( ; i <= nextNumVerts; i++ )
{
const idVec3& v = vertsODS[i].xyz;
const float d0 = planes[0].Distance( v );
const float d1 = planes[1].Distance( v );
const float d2 = planes[2].Distance( v );
const float d3 = planes[3].Distance( v );
const float t0 = d0 + radius;
const float t1 = d1 + radius;
const float t2 = d2 + radius;
const float t3 = d3 + radius;
const float s0 = d0 - radius;
const float s1 = d1 - radius;
const float s2 = d2 - radius;
const float s3 = d3 - radius;
byte bits;
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
}
totalOr = tOr;
#endif
}
/*
@ -179,7 +230,7 @@ static void R_TracePointCullSkinned( byte* cullBits, byte& totalOr, const float
assert_16_byte_aligned( cullBits );
assert_16_byte_aligned( verts );
#if defined(USE_INTRINSICS)
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 4 > vertsODS( verts, numVerts );
const __m128 vector_float_radius = _mm_splat_ps( _mm_load_ss( &radius ), 0 );
@ -302,6 +353,56 @@ static void R_TracePointCullSkinned( byte* cullBits, byte& totalOr, const float
totalOr = ( byte ) _mm_cvtsi128_si32( vecTotalOrByte );
#else
idODSStreamedArray< idDrawVert, 16, SBT_DOUBLE, 1 > vertsODS( verts, numVerts );
byte tOr = 0;
for( int i = 0; i < numVerts; )
{
const int nextNumVerts = vertsODS.FetchNextBatch() - 1;
for( ; i <= nextNumVerts; i++ )
{
const idVec3 v = Scalar_LoadSkinnedDrawVertPosition( vertsODS[i], joints );
const float d0 = planes[0].Distance( v );
const float d1 = planes[1].Distance( v );
const float d2 = planes[2].Distance( v );
const float d3 = planes[3].Distance( v );
const float t0 = d0 + radius;
const float t1 = d1 + radius;
const float t2 = d2 + radius;
const float t3 = d3 + radius;
const float s0 = d0 - radius;
const float s1 = d1 - radius;
const float s2 = d2 - radius;
const float s3 = d3 - radius;
byte bits;
bits = IEEE_FLT_SIGNBITSET( t0 ) << 0;
bits |= IEEE_FLT_SIGNBITSET( t1 ) << 1;
bits |= IEEE_FLT_SIGNBITSET( t2 ) << 2;
bits |= IEEE_FLT_SIGNBITSET( t3 ) << 3;
bits |= IEEE_FLT_SIGNBITSET( s0 ) << 4;
bits |= IEEE_FLT_SIGNBITSET( s1 ) << 5;
bits |= IEEE_FLT_SIGNBITSET( s2 ) << 6;
bits |= IEEE_FLT_SIGNBITSET( s3 ) << 7;
bits ^= 0x0F; // flip lower four bits
tOr |= bits;
cullBits[i] = bits;
}
}
totalOr = tOr;
#endif
}
/*

View file

@ -1722,10 +1722,12 @@ void R_TestDegenerateTextureSpace( srfTriangles_t* tri )
const idDrawVert& b = tri->verts[tri->indexes[i + 1]];
const idDrawVert& c = tri->verts[tri->indexes[i + 2]];
if( a.st == b.st || b.st == c.st || c.st == a.st )
// RB: compare texcoords instead of pointers
if( a.GetTexCoord() == b.GetTexCoord() || b.GetTexCoord() == c.GetTexCoord() || c.GetTexCoord() == a.GetTexCoord() )
{
c_degenerate++;
}
// RB end
}
if( c_degenerate )